Use grants at MDT for Data-on-MDT needs.
Add a parameter to reserve part of the available space
for metadata so that it is never granted to clients.
Test-Parameters: mdssizegb=20 testlist=dom-performance
Signed-off-by: Mikhal Pershin <mike.pershin@intel.com>
Change-Id: I2612352062871e4edd3817f32e7d96cb95a0a52b
Reviewed-on: https://review.whamcloud.com/28021
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Reviewed-by: Jinshan Xiong <jinshan.xiong@intel.com>
u64 tgd_tot_granted;
/* grant used by I/Os in progress (between prepare and commit) */
u64 tgd_tot_pending;
+ /* percentage of available space that is never used for grants;
+ * used on MDT to always keep space for metadata. */
+ u64 tgd_reserved_pcnt;
/* number of clients using grants */
int tgd_tot_granted_clients;
/* shall we grant space to clients not
int tgt_statfs_internal(const struct lu_env *env, struct lu_target *lut,
struct obd_statfs *osfs, __u64 max_age,
int *from_cache);
+int tgt_tot_dirty_seq_show(struct seq_file *m, void *data);
+int tgt_tot_granted_seq_show(struct seq_file *m, void *data);
+int tgt_tot_pending_seq_show(struct seq_file *m, void *data);
+int tgt_grant_compat_disable_seq_show(struct seq_file *m, void *data);
+ssize_t tgt_grant_compat_disable_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off);
/* target/update_trans.c */
int distribute_txn_init(const struct lu_env *env,
struct ptlrpc_request_set *set);
int osc_ldlm_resource_invalidate(struct cfs_hash *hs, struct cfs_hash_bd *bd,
struct hlist_node *hnode, void *arg);
-
+int osc_reconnect(const struct lu_env *env, struct obd_export *exp,
+ struct obd_device *obd, struct obd_uuid *cluuid,
+ struct obd_connect_data *data, void *localdata);
+int osc_disconnect(struct obd_export *exp);
int osc_punch_send(struct obd_export *exp, struct obdo *oa,
obd_enqueue_update_f upcall, void *cookie);
OBD_CONNECT_BULK_MBITS | OBD_CONNECT_CKSUM | \
OBD_CONNECT_MULTIMODRPCS | \
OBD_CONNECT_SUBTREE | OBD_CONNECT_LARGE_ACL | \
+ OBD_CONNECT_GRANT_PARAM | \
OBD_CONNECT_FLAGS2)
#define MDT_CONNECT_SUPPORTED2 OBD_CONNECT2_FILE_SECCTX
RETURN(-ENOMEM);
}
+ /* pass client page size via ocd_grant_blkbits, the server should report
+ * back its backend blocksize for grant calculation purpose */
+ data->ocd_grant_blkbits = PAGE_SHIFT;
+
/* indicate MDT features supported by this client */
data->ocd_connect_flags = OBD_CONNECT_IBITS | OBD_CONNECT_NODEVOH |
OBD_CONNECT_ATTRFID | OBD_CONNECT_GRANT |
OBD_CONNECT_DIR_STRIPE |
OBD_CONNECT_BULK_MBITS | OBD_CONNECT_CKSUM |
OBD_CONNECT_SUBTREE |
- OBD_CONNECT_FLAGS2 | OBD_CONNECT_MULTIMODRPCS;
+ OBD_CONNECT_MULTIMODRPCS |
+ OBD_CONNECT_GRANT_PARAM | OBD_CONNECT_FLAGS2;
data->ocd_connect_flags2 = 0;
.o_add_conn = client_import_add_conn,
.o_del_conn = client_import_del_conn,
.o_connect = client_connect_import,
- .o_disconnect = client_disconnect_export,
+ .o_reconnect = osc_reconnect,
+ .o_disconnect = osc_disconnect,
.o_iocontrol = mdc_iocontrol,
.o_set_info_async = mdc_set_info_async,
.o_statfs = mdc_statfs,
{
struct ptlrpc_request *req = tgt_ses_req(tsi);
struct mdt_thread_info *info = tsi2mdt_info(tsi);
- struct md_device *next = info->mti_mdt->mdt_child;
+ struct mdt_device *mdt = info->mti_mdt;
+ struct tg_grants_data *tgd = &mdt->mdt_lut.lut_tgd;
struct ptlrpc_service_part *svcpt;
struct obd_statfs *osfs;
int rc;
if (!osfs)
GOTO(out, rc = -EPROTO);
- /** statfs information are cached in the mdt_device */
- if (cfs_time_before_64(info->mti_mdt->mdt_osfs_age,
- cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS))) {
- /** statfs data is too old, get up-to-date one */
- rc = next->md_ops->mdo_statfs(info->mti_env, next, osfs);
- if (rc)
- GOTO(out, rc);
- spin_lock(&info->mti_mdt->mdt_lock);
- info->mti_mdt->mdt_osfs = *osfs;
- info->mti_mdt->mdt_osfs_age = cfs_time_current_64();
- spin_unlock(&info->mti_mdt->mdt_lock);
- } else {
- /** use cached statfs data */
- spin_lock(&info->mti_mdt->mdt_lock);
- *osfs = info->mti_mdt->mdt_osfs;
- spin_unlock(&info->mti_mdt->mdt_lock);
- }
+ rc = tgt_statfs_internal(tsi->tsi_env, &mdt->mdt_lut, osfs,
+ cfs_time_shift_64(-OBD_STATFS_CACHE_SECONDS),
+ NULL);
+ if (unlikely(rc))
+ GOTO(out, rc);
+ /* at least try to account for cached pages. it's still racy and
+ * might be under-reporting if clients haven't announced their
+ * caches with brw recently */
+ CDEBUG(D_SUPER | D_CACHE, "blocks cached %llu granted %llu"
+ " pending %llu free %llu avail %llu\n",
+ tgd->tgd_tot_dirty, tgd->tgd_tot_granted,
+ tgd->tgd_tot_pending,
+ osfs->os_bfree << tgd->tgd_blockbits,
+ osfs->os_bavail << tgd->tgd_blockbits);
+
+ osfs->os_bavail -= min_t(u64, osfs->os_bavail,
+ ((tgd->tgd_tot_dirty + tgd->tgd_tot_pending +
+ osfs->os_bsize - 1) >> tgd->tgd_blockbits));
+
+ tgt_grant_sanity_check(mdt->mdt_lu_dev.ld_obd, __func__);
+ CDEBUG(D_CACHE, "%llu blocks: %llu free, %llu avail; "
+ "%llu objects: %llu free; state %x\n",
+ osfs->os_blocks, osfs->os_bfree, osfs->os_bavail,
+ osfs->os_files, osfs->os_ffree, osfs->os_state);
+
+ if (!exp_grant_param_supp(tsi->tsi_exp) &&
+ tgd->tgd_blockbits > COMPAT_BSIZE_SHIFT) {
+ /* clients which don't support OBD_CONNECT_GRANT_PARAM
+ * should not see a block size > page size, otherwise
+ * cl_lost_grant goes mad. Therefore, we emulate a 4KB (=2^12)
+ * block size which is the biggest block size known to work
+ * with all clients' page sizes. */
+ osfs->os_blocks <<= tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT;
+ osfs->os_bfree <<= tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT;
+ osfs->os_bavail <<= tgd->tgd_blockbits - COMPAT_BSIZE_SHIFT;
+ osfs->os_bsize = 1 << COMPAT_BSIZE_SHIFT;
+ }
if (rc == 0)
mdt_counter_incr(req, LPROC_MDT_STATFS);
out:
static int mdt_init0(const struct lu_env *env, struct mdt_device *m,
struct lu_device_type *ldt, struct lustre_cfg *cfg)
{
- struct mdt_thread_info *info;
- struct obd_device *obd;
+ struct mdt_thread_info *info;
+ struct obd_device *obd;
+ struct tg_grants_data *tgd = &m->mdt_lut.lut_tgd;
const char *dev = lustre_cfg_string(cfg, 0);
const char *num = lustre_cfg_string(cfg, 2);
struct lustre_mount_info *lmi = NULL;
INIT_LIST_HEAD(&m->mdt_squash.rsi_nosquash_nids);
init_rwsem(&m->mdt_squash.rsi_sem);
spin_lock_init(&m->mdt_lock);
- m->mdt_osfs_age = cfs_time_shift_64(-1000);
m->mdt_enable_remote_dir = 0;
m->mdt_enable_remote_dir_gid = 0;
if (rc)
GOTO(err_free_hsm, rc);
+ /* Amount of available space excluded from granting and reserved
+ * for metadata. It is a percentage; the default value is 50%. */
+ tgd->tgd_reserved_pcnt = 50;
+
+ if (ONE_MB_BRW_SIZE < (1U << tgd->tgd_blockbits))
+ m->mdt_brw_size = 1U << tgd->tgd_blockbits;
+ else
+ m->mdt_brw_size = ONE_MB_BRW_SIZE;
+
rc = mdt_fs_setup(env, m, obd, lsi);
if (rc)
GOTO(err_tgt, rc);
data->ocd_connect_flags &= ~OBD_CONNECT_XATTR;
if (OCD_HAS_FLAG(data, BRW_SIZE)) {
- data->ocd_brw_size = min(data->ocd_brw_size, MD_MAX_BRW_SIZE);
+ data->ocd_brw_size = min(data->ocd_brw_size,
+ mdt->mdt_brw_size);
if (data->ocd_brw_size == 0) {
CERROR("%s: cli %s/%p ocd_connect_flags: %#llx "
"ocd_version: %x ocd_grant: %d ocd_index: %u "
}
}
- if (OCD_HAS_FLAG(data, GRANT))
- data->ocd_grant = mdt_grant_connect(env, exp, data->ocd_grant,
- !reconnect);
+ if (OCD_HAS_FLAG(data, GRANT_PARAM)) {
+ struct dt_device_param *ddp = &mdt->mdt_lut.lut_dt_conf;
+
+ /* client is reporting its page size, for future use */
+ exp->exp_target_data.ted_pagebits = data->ocd_grant_blkbits;
+ data->ocd_grant_blkbits = mdt->mdt_lut.lut_tgd.tgd_blockbits;
+ /* ddp_inodespace may not be power-of-two value, eg. for ldiskfs
+ * it's LDISKFS_DIR_REC_LEN(20) = 28. */
+ data->ocd_grant_inobits = fls(ddp->ddp_inodespace - 1);
+ /* ocd_grant_tax_kb is in 1K byte blocks */
+ data->ocd_grant_tax_kb = ddp->ddp_extent_tax >> 10;
+ data->ocd_grant_max_blks = ddp->ddp_max_extent_blks;
+ }
+
+ if (OCD_HAS_FLAG(data, GRANT)) {
+ /* Save connect_data we have so far because tgt_grant_connect()
+ * uses it to calculate grant. */
+ exp->exp_connect_data = *data;
+ tgt_grant_connect(env, exp, data, !reconnect);
+ }
+
+ if (OCD_HAS_FLAG(data, MAXBYTES))
+ data->ocd_maxbytes = mdt->mdt_lut.lut_dt_conf.ddp_maxbytes;
/* NB: Disregard the rule against updating
* exp_connect_data.ocd_connect_flags in this case, since
static int mdt_obd_disconnect(struct obd_export *exp)
{
- int rc;
- ENTRY;
+ int rc;
+
+ ENTRY;
- LASSERT(exp);
- class_export_get(exp);
+ LASSERT(exp);
+ class_export_get(exp);
+
+ if (!(exp->exp_flags & OBD_OPT_FORCE))
+ tgt_grant_sanity_check(exp->exp_obd, __func__);
if ((exp_connect_flags(exp) & OBD_CONNECT_MDS_MDS) &&
!(exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT)) {
if (rc != 0)
CDEBUG(D_IOCTL, "server disconnect error: rc = %d\n", rc);
+ tgt_grant_discard(exp);
+
rc = mdt_export_cleanup(exp);
nodemap_del_member(exp);
class_export_put(exp);
LASSERT(list_empty(&exp->exp_outstanding_replies));
LASSERT(list_empty(&exp->exp_mdt_data.med_open_head));
+ /*
+ * discard grants once we're sure no more
+ * interaction with the client is possible
+ */
+ tgt_grant_discard(exp);
+ if (exp_connect_flags(exp) & OBD_CONNECT_GRANT)
+ exp->exp_obd->u.obt.obt_lut->lut_tgd.tgd_tot_granted_clients--;
+
+ if (!(exp->exp_flags & OBD_OPT_FORCE))
+ tgt_grant_sanity_check(exp->exp_obd, __func__);
+
RETURN(0);
}
int mdt_max_ea_size;
+ /* preferred BRW size, decided by storage type and capability */
+ __u32 mdt_brw_size;
+
struct upcall_cache *mdt_identity_cache;
unsigned int mdt_capa_conf:1,
/* lock for osfs and md_root */
spinlock_t mdt_lock;
- /* statfs optimization: we cache a bit */
- struct obd_statfs mdt_osfs;
- __u64 mdt_osfs_age;
-
/* root squash */
struct root_squash_info mdt_squash;
#include <dt_object.h>
#include "mdt_internal.h"
-/* --------------- MDT grant code ---------------- */
-
-long mdt_grant_connect(const struct lu_env *env,
- struct obd_export *exp,
- u64 want, bool conservative)
-{
- struct mdt_device *mdt = mdt_exp2dev(exp);
- u64 left;
- long grant;
-
- ENTRY;
-
- dt_statfs(env, mdt->mdt_bottom, &mdt->mdt_osfs);
-
- left = (mdt->mdt_osfs.os_bavail * mdt->mdt_osfs.os_bsize) / 2;
-
- grant = left;
-
- CDEBUG(D_CACHE, "%s: cli %s/%p ocd_grant: %ld want: %llu left: %llu\n",
- exp->exp_obd->obd_name, exp->exp_client_uuid.uuid,
- exp, grant, want, left);
-
- return grant;
-}
-
-void mdt_grant_prepare_write(const struct lu_env *env,
- struct obd_export *exp, struct obdo *oa,
- struct niobuf_remote *rnb, int niocount)
-{
- struct mdt_device *mdt = mdt_exp2dev(exp);
- u64 left;
-
- ENTRY;
-
- left = (mdt->mdt_osfs.os_bavail * mdt->mdt_osfs.os_bsize) / 2;
-
- /* grant more space back to the client if possible */
- oa->o_grant = left;
-}
-/* ---------------- end of MDT grant code ---------------- */
-
/* functions below are stubs for now, they will be implemented with
* grant support on MDT */
static inline void mdt_io_counter_incr(struct obd_export *exp, int opcode,
return;
}
-void mdt_grant_prepare_read(const struct lu_env *env,
- struct obd_export *exp, struct obdo *oa)
-{
- return;
-}
-
-void mdt_grant_commit(struct obd_export *exp, unsigned long pending,
- int rc)
-{
- return;
-
-}
-
static inline void mdt_dom_read_lock(struct mdt_object *mo)
{
down_read(&mo->mot_dom_sem);
/* Process incoming grant info, set OBD_BRW_GRANTED flag and grant some
* space back if possible */
- mdt_grant_prepare_write(env, exp, oa, rnb, obj->ioo_bufcnt);
+ tgt_grant_prepare_write(env, exp, oa, rnb, obj->ioo_bufcnt);
mdt_dom_read_lock(mo);
if (!mdt_object_exists(mo)) {
if (unlikely(rc < 0))
GOTO(err, rc);
/* correct index for local buffers to continue with */
- for (k = 0; k < rc; k++)
- lnb[j+k].lnb_flags = rnb[i].rnb_flags;
+ for (k = 0; k < rc; k++) {
+ lnb[j + k].lnb_flags = rnb[i].rnb_flags;
+ if (!(rnb[i].rnb_flags & OBD_BRW_GRANTED))
+ lnb[j + k].lnb_rc = -ENOSPC;
+ }
j += rc;
*nr_local += rc;
tot_bytes += rnb[i].rnb_len;
unlock:
mdt_dom_read_unlock(mo);
/* tgt_grant_prepare_write() was called, so we must commit */
- mdt_grant_commit(exp, oa->o_grant_used, rc);
+ tgt_grant_commit(exp, oa->o_grant_used, rc);
/* let's still process incoming grant information packed in the oa,
* but without enforcing grant since we won't proceed with the write.
* Just like a read request actually. */
- mdt_grant_prepare_read(env, exp, oa);
+ tgt_grant_prepare_read(env, exp, oa);
return rc;
}
objcount, obj, rnb, nr_local, lnb,
jobid);
} else if (cmd == OBD_BRW_READ) {
- mdt_grant_prepare_read(env, exp, oa);
+ tgt_grant_prepare_read(env, exp, oa);
rc = mdt_preprw_read(env, exp, mdt, mo, la,
obj->ioo_bufcnt, rnb, nr_local, lnb,
jobid);
if (rc == -ENOSPC)
th->th_sync = 1;
+
+ if (rc == 0 && granted > 0) {
+ if (tgt_grant_commit_cb_add(th, exp, granted) == 0)
+ granted = 0;
+ }
+
th->th_result = rc;
dt_trans_stop(env, dt, th);
if (rc == -ENOSPC && retries++ < 3) {
out:
dt_bufs_put(env, dob, lnb, niocount);
mdt_dom_read_unlock(mo);
- mdt_grant_commit(exp, granted, old_rc);
+ if (granted > 0)
+ tgt_grant_commit(exp, granted, old_rc);
RETURN(rc);
}
LPROC_SEQ_FOPS_RW_TYPE(mdt, recovery_time_hard);
LPROC_SEQ_FOPS_RW_TYPE(mdt, recovery_time_soft);
+LPROC_SEQ_FOPS_RO(tgt_tot_dirty);
+LPROC_SEQ_FOPS_RO(tgt_tot_granted);
+LPROC_SEQ_FOPS_RO(tgt_tot_pending);
+LPROC_SEQ_FOPS(tgt_grant_compat_disable);
+
static struct lprocfs_vars lprocfs_mdt_obd_vars[] = {
+ { .name = "tot_dirty",
+ .fops = &tgt_tot_dirty_fops },
+ { .name = "tot_pending",
+ .fops = &tgt_tot_pending_fops },
+ { .name = "tot_granted",
+ .fops = &tgt_tot_granted_fops },
+ { .name = "grant_compat_disable",
+ .fops = &tgt_grant_compat_disable_fops },
{ .name = "recovery_status",
.fops = &mdt_recovery_status_fops },
{ .name = "num_exports",
LPROC_SEQ_FOPS_RO(ofd_seqs);
/**
- * Show estimate of total amount of dirty data on clients.
- *
- * \param[in] m seq_file handle
- * \param[in] data unused for single entry
- *
- * \retval 0 on success
- * \retval negative value on error
- */
-static int ofd_tot_dirty_seq_show(struct seq_file *m, void *data)
-{
- struct obd_device *obd = m->private;
- struct tg_grants_data *tgd;
-
- LASSERT(obd != NULL);
- tgd = &obd->u.obt.obt_lut->lut_tgd;
- seq_printf(m, "%llu\n", tgd->tgd_tot_dirty);
- return 0;
-}
-LPROC_SEQ_FOPS_RO(ofd_tot_dirty);
-
-/**
- * Show total amount of space granted to clients.
- *
- * \param[in] m seq_file handle
- * \param[in] data unused for single entry
- *
- * \retval 0 on success
- * \retval negative value on error
- */
-static int ofd_tot_granted_seq_show(struct seq_file *m, void *data)
-{
- struct obd_device *obd = m->private;
- struct tg_grants_data *tgd;
-
- LASSERT(obd != NULL);
- tgd = &obd->u.obt.obt_lut->lut_tgd;
- seq_printf(m, "%llu\n", tgd->tgd_tot_granted);
- return 0;
-}
-LPROC_SEQ_FOPS_RO(ofd_tot_granted);
-
-/**
- * Show total amount of space used by IO in progress.
- *
- * \param[in] m seq_file handle
- * \param[in] data unused for single entry
- *
- * \retval 0 on success
- * \retval negative value on error
- */
-static int ofd_tot_pending_seq_show(struct seq_file *m, void *data)
-{
- struct obd_device *obd = m->private;
- struct tg_grants_data *tgd;
-
- LASSERT(obd != NULL);
- tgd = &obd->u.obt.obt_lut->lut_tgd;
- seq_printf(m, "%llu\n", tgd->tgd_tot_pending);
- return 0;
-}
-LPROC_SEQ_FOPS_RO(ofd_tot_pending);
-
-/**
* Show total number of grants for precreate.
*
* \param[in] m seq_file handle
LPROC_SEQ_FOPS(ofd_sync_lock_cancel);
/**
- * Show if grants compatibility mode is disabled.
- *
- * When tgd_grant_compat_disable is set, we don't grant any space to clients
- * not supporting OBD_CONNECT_GRANT_PARAM. Otherwise, space granted to such
- * a client is inflated since it consumes PAGE_SIZE of grant space per
- * block, (i.e. typically 4kB units), but underlaying file system might have
- * block size bigger than page size, e.g. ZFS. See LU-2049 for details.
- *
- * \param[in] m seq_file handle
- * \param[in] data unused for single entry
- *
- * \retval 0 on success
- * \retval negative value on error
- */
-static int ofd_grant_compat_disable_seq_show(struct seq_file *m, void *data)
-{
- struct obd_device *obd = m->private;
- struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd;
-
- seq_printf(m, "%u\n", tgd->tgd_grant_compat_disable);
- return 0;
-}
-
-/**
- * Change grant compatibility mode.
- *
- * Setting tgd_grant_compat_disable prohibit any space granting to clients
- * not supporting OBD_CONNECT_GRANT_PARAM. See details above.
- *
- * \param[in] file proc file
- * \param[in] buffer string which represents mode
- * 1: disable compatibility mode
- * 0: enable compatibility mode
- * \param[in] count \a buffer length
- * \param[in] off unused for single entry
- *
- * \retval \a count on success
- * \retval negative number on error
- */
-static ssize_t
-ofd_grant_compat_disable_seq_write(struct file *file,
- const char __user *buffer,
- size_t count, loff_t *off)
-{
- struct seq_file *m = file->private_data;
- struct obd_device *obd = m->private;
- struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd;
- __s64 val;
- int rc;
-
- rc = lprocfs_str_to_s64(buffer, count, &val);
- if (rc)
- return rc;
-
- if (val < 0)
- return -EINVAL;
-
- tgd->tgd_grant_compat_disable = !!val;
-
- return count;
-}
-LPROC_SEQ_FOPS(ofd_grant_compat_disable);
-
-/**
* Show the limit of soft sync RPCs.
*
* This value defines how many IO RPCs with OBD_BRW_SOFT_SYNC flag
LPROC_SEQ_FOPS_RW_TYPE(ofd, checksum_dump);
LPROC_SEQ_FOPS_RW_TYPE(ofd, job_interval);
+LPROC_SEQ_FOPS_RO(tgt_tot_dirty);
+LPROC_SEQ_FOPS_RO(tgt_tot_granted);
+LPROC_SEQ_FOPS_RO(tgt_tot_pending);
+LPROC_SEQ_FOPS(tgt_grant_compat_disable);
+
struct lprocfs_vars lprocfs_ofd_obd_vars[] = {
{ .name = "seqs_allocated",
.fops = &ofd_seqs_fops },
{ .name = "last_id",
.fops = &ofd_last_id_fops },
{ .name = "tot_dirty",
- .fops = &ofd_tot_dirty_fops },
+ .fops = &tgt_tot_dirty_fops },
{ .name = "tot_pending",
- .fops = &ofd_tot_pending_fops },
+ .fops = &tgt_tot_pending_fops },
{ .name = "tot_granted",
- .fops = &ofd_tot_granted_fops },
+ .fops = &tgt_tot_granted_fops },
{ .name = "grant_precreate",
.fops = &ofd_grant_precreate_fops },
{ .name = "precreate_batch",
{ .name = "checksum_dump",
.fops = &ofd_checksum_dump_fops },
{ .name = "grant_compat_disable",
- .fops = &ofd_grant_compat_disable_fops },
+ .fops = &tgt_grant_compat_disable_fops },
{ .name = "client_cache_count",
.fops = &ofd_fmd_max_num_fops },
{ .name = "client_cache_seconds",
struct ofd_thread_info *info = NULL;
struct obd_device *obd;
struct tg_grants_data *tgd = &m->ofd_lut.lut_tgd;
- struct obd_statfs *osfs;
struct lu_fid fid;
struct nm_config_file *nodemap_config;
struct obd_device_target *obt;
m->ofd_raid_degraded = 0;
m->ofd_syncjournal = 0;
ofd_slc_set(m);
- tgd->tgd_grant_compat_disable = 0;
m->ofd_soft_sync_limit = OFD_SOFT_SYNC_LIMIT_DEFAULT;
- /* statfs data */
- spin_lock_init(&tgd->tgd_osfs_lock);
- tgd->tgd_osfs_age = cfs_time_shift_64(-1000);
- tgd->tgd_osfs_unstable = 0;
- tgd->tgd_statfs_inflight = 0;
- tgd->tgd_osfs_inflight = 0;
-
- /* grant data */
- spin_lock_init(&tgd->tgd_grant_lock);
- tgd->tgd_tot_dirty = 0;
- tgd->tgd_tot_granted = 0;
- tgd->tgd_tot_pending = 0;
-
m->ofd_seq_count = 0;
init_waitqueue_head(&m->ofd_inconsistency_thread.t_ctl_waitq);
INIT_LIST_HEAD(&m->ofd_inconsistency_list);
ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL,
"filter_ldlm_cb_client", &obd->obd_ldlm_client);
- dt_conf_get(env, m->ofd_osd, &m->ofd_lut.lut_dt_conf);
-
rc = tgt_init(env, &m->ofd_lut, obd, m->ofd_osd, ofd_common_slice,
OBD_FAIL_OST_ALL_REQUEST_NET,
OBD_FAIL_OST_ALL_REPLY_NET);
if (rc)
GOTO(err_free_ns, rc);
- /* populate cached statfs data */
- osfs = &ofd_info(env)->fti_u.osfs;
- rc = tgt_statfs_internal(env, &m->ofd_lut, osfs, 0, NULL);
- if (rc != 0) {
- CERROR("%s: can't get statfs data, rc %d\n", obd->obd_name, rc);
- GOTO(err_fini_lut, rc);
- }
- if (!is_power_of_2(osfs->os_bsize)) {
- CERROR("%s: blocksize (%d) is not a power of 2\n",
- obd->obd_name, osfs->os_bsize);
- GOTO(err_fini_lut, rc = -EPROTO);
- }
- tgd->tgd_blockbits = fls(osfs->os_bsize) - 1;
+ tgd->tgd_reserved_pcnt = 0;
if (DT_DEF_BRW_SIZE < (1U << tgd->tgd_blockbits))
m->ofd_brw_size = 1U << tgd->tgd_blockbits;
m->ofd_cksum_types_supported = cksum_types_supported_server();
m->ofd_precreate_batch = OFD_PRECREATE_BATCH_DEFAULT;
- if (osfs->os_bsize * osfs->os_blocks < OFD_PRECREATE_SMALL_FS)
+ if (tgd->tgd_osfs.os_bsize * tgd->tgd_osfs.os_blocks <
+ OFD_PRECREATE_SMALL_FS)
m->ofd_precreate_batch = OFD_PRECREATE_BATCH_SMALL;
rc = ofd_fs_setup(env, m, obd);
}
EXPORT_SYMBOL(osc_set_info_async);
-static int osc_reconnect(const struct lu_env *env,
- struct obd_export *exp, struct obd_device *obd,
- struct obd_uuid *cluuid,
- struct obd_connect_data *data,
- void *localdata)
+int osc_reconnect(const struct lu_env *env, struct obd_export *exp,
+ struct obd_device *obd, struct obd_uuid *cluuid,
+ struct obd_connect_data *data, void *localdata)
{
- struct client_obd *cli = &obd->u.cli;
+ struct client_obd *cli = &obd->u.cli;
- if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) {
- long lost_grant;
+ if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) {
+ long lost_grant;
long grant;
spin_lock(&cli->cl_loi_list_lock);
RETURN(0);
}
+EXPORT_SYMBOL(osc_reconnect);
-static int osc_disconnect(struct obd_export *exp)
+int osc_disconnect(struct obd_export *exp)
{
struct obd_device *obd = class_exp2obd(exp);
int rc;
osc_del_shrink_grant(&obd->u.cli);
return rc;
}
+EXPORT_SYMBOL(osc_disconnect);
int osc_ldlm_resource_invalidate(struct cfs_hash *hs, struct cfs_hash_bd *bd,
struct hlist_node *hnode, void *arg)
if (unlikely(rc))
GOTO(out, rc);
+ osfs->os_namelen = min_t(__u32, osfs->os_namelen, NAME_MAX);
+
spin_lock(&tgd->tgd_grant_lock);
spin_lock(&tgd->tgd_osfs_lock);
/* calculate how much space was written while we released the
u64 left;
u64 avail;
u64 unstable;
+ u64 reserved;
ENTRY;
assert_spin_locked(&tgd->tgd_grant_lock);
unstable = tgd->tgd_osfs_unstable; /* those might be accounted twice */
spin_unlock(&tgd->tgd_osfs_lock);
- tot_granted = tgd->tgd_tot_granted;
+ reserved = left * tgd->tgd_reserved_pcnt / 100;
+ tot_granted = tgd->tgd_tot_granted + reserved;
if (left < tot_granted) {
int mask = (left + unstable <
RETURN(rc);
}
EXPORT_SYMBOL(tgt_grant_commit_cb_add);
+
+
+/**
+ * Show estimate of total amount of dirty data on clients.
+ *
+ * Prints tgd_tot_dirty from the target's grant data (reached through
+ * obd->u.obt.obt_lut) as an unsigned decimal followed by a newline.
+ * The counter is read without taking any lock in this function, so the
+ * value shown is a point-in-time reading.
+ *
+ * \param[in] m seq_file handle; m->private is used as the obd_device
+ * and is checked with LASSERT to be non-NULL
+ * \param[in] data unused for single entry
+ *
+ * \retval 0 always, there is no failure path in this function
+ */
+int tgt_tot_dirty_seq_show(struct seq_file *m, void *data)
+{
+ struct obd_device *obd = m->private;
+ struct tg_grants_data *tgd;
+
+ LASSERT(obd != NULL);
+ tgd = &obd->u.obt.obt_lut->lut_tgd;
+ seq_printf(m, "%llu\n", tgd->tgd_tot_dirty);
+ return 0;
+}
+EXPORT_SYMBOL(tgt_tot_dirty_seq_show);
+
+/**
+ * Show total amount of space granted to clients.
+ *
+ * Prints tgd_tot_granted from the target's grant data (reached through
+ * obd->u.obt.obt_lut) as an unsigned decimal followed by a newline.
+ * The counter is read without taking any lock in this function, so the
+ * value shown is a point-in-time reading.
+ *
+ * \param[in] m seq_file handle; m->private is used as the obd_device
+ * and is checked with LASSERT to be non-NULL
+ * \param[in] data unused for single entry
+ *
+ * \retval 0 always, there is no failure path in this function
+ */
+int tgt_tot_granted_seq_show(struct seq_file *m, void *data)
+{
+ struct obd_device *obd = m->private;
+ struct tg_grants_data *tgd;
+
+ LASSERT(obd != NULL);
+ tgd = &obd->u.obt.obt_lut->lut_tgd;
+ seq_printf(m, "%llu\n", tgd->tgd_tot_granted);
+ return 0;
+}
+EXPORT_SYMBOL(tgt_tot_granted_seq_show);
+
+/**
+ * Show total amount of space used by IO in progress.
+ *
+ * Prints tgd_tot_pending from the target's grant data (reached through
+ * obd->u.obt.obt_lut) as an unsigned decimal followed by a newline.
+ * The counter is read without taking any lock in this function, so the
+ * value shown is a point-in-time reading.
+ *
+ * \param[in] m seq_file handle; m->private is used as the obd_device
+ * and is checked with LASSERT to be non-NULL
+ * \param[in] data unused for single entry
+ *
+ * \retval 0 always, there is no failure path in this function
+ */
+int tgt_tot_pending_seq_show(struct seq_file *m, void *data)
+{
+ struct obd_device *obd = m->private;
+ struct tg_grants_data *tgd;
+
+ LASSERT(obd != NULL);
+ tgd = &obd->u.obt.obt_lut->lut_tgd;
+ seq_printf(m, "%llu\n", tgd->tgd_tot_pending);
+ return 0;
+}
+EXPORT_SYMBOL(tgt_tot_pending_seq_show);
+
+/**
+ * Show if grants compatibility mode is disabled.
+ *
+ * When tgd_grant_compat_disable is set, we don't grant any space to clients
+ * not supporting OBD_CONNECT_GRANT_PARAM. Otherwise, space granted to such
+ * a client is inflated since it consumes PAGE_SIZE of grant space per
+ * block, (i.e. typically 4kB units), but the underlying file system might
+ * have a block size bigger than the page size, e.g. ZFS. See LU-2049 for
+ * details.
+ *
+ * Prints tgd_grant_compat_disable as an unsigned decimal followed by a
+ * newline. Unlike the tot_* show helpers, m->private is dereferenced here
+ * without a LASSERT on it.
+ *
+ * \param[in] m seq_file handle; m->private is used as the obd_device
+ * \param[in] data unused for single entry
+ *
+ * \retval 0 always, there is no failure path in this function
+ */
+int tgt_grant_compat_disable_seq_show(struct seq_file *m, void *data)
+{
+ struct obd_device *obd = m->private;
+ struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd;
+
+ seq_printf(m, "%u\n", tgd->tgd_grant_compat_disable);
+ return 0;
+}
+EXPORT_SYMBOL(tgt_grant_compat_disable_seq_show);
+
+/**
+ * Change grant compatibility mode.
+ *
+ * Setting tgd_grant_compat_disable prohibits any space granting to clients
+ * not supporting OBD_CONNECT_GRANT_PARAM. See the description of
+ * tgt_grant_compat_disable_seq_show() for details.
+ *
+ * The user buffer is parsed with lprocfs_str_to_s64(). The parsed value is
+ * normalized with !!val before being stored, so any positive number has
+ * the same effect as 1. The store is done without taking any lock in this
+ * function.
+ *
+ * \param[in] file proc file; file->private_data is used as the seq_file
+ * whose ->private is the obd_device
+ * \param[in] buffer string which represents mode
+ * 1: disable compatibility mode
+ * 0: enable compatibility mode
+ * \param[in] count \a buffer length
+ * \param[in] off unused for single entry
+ *
+ * \retval \a count on success
+ * \retval -EINVAL if the parsed value is negative
+ * \retval non-zero code returned by lprocfs_str_to_s64() if parsing
+ * fails (presumably a negative errno; its contract is not
+ * visible here)
+ */
+ssize_t tgt_grant_compat_disable_seq_write(struct file *file,
+ const char __user *buffer,
+ size_t count, loff_t *off)
+{
+ struct seq_file *m = file->private_data;
+ struct obd_device *obd = m->private;
+ struct tg_grants_data *tgd = &obd->u.obt.obt_lut->lut_tgd;
+ __s64 val;
+ int rc;
+
+ rc = lprocfs_str_to_s64(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ if (val < 0)
+ return -EINVAL;
+
+ tgd->tgd_grant_compat_disable = !!val;
+
+ return count;
+}
+EXPORT_SYMBOL(tgt_grant_compat_disable_seq_write);
+
struct lu_attr attr;
struct lu_fid fid;
struct dt_object *o;
+ struct tg_grants_data *tgd = &lut->lut_tgd;
+ struct obd_statfs *osfs;
int i, rc = 0;
ENTRY;
if (!obd->obd_replayable)
RETURN(0);
+ /* initialize grant and statfs data in target */
+ dt_conf_get(env, lut->lut_bottom, &lut->lut_dt_conf);
+
+ /* statfs data */
+ spin_lock_init(&tgd->tgd_osfs_lock);
+ tgd->tgd_osfs_age = cfs_time_shift_64(-1000);
+ tgd->tgd_osfs_unstable = 0;
+ tgd->tgd_statfs_inflight = 0;
+ tgd->tgd_osfs_inflight = 0;
+
+ /* grant data */
+ spin_lock_init(&tgd->tgd_grant_lock);
+ tgd->tgd_tot_dirty = 0;
+ tgd->tgd_tot_granted = 0;
+ tgd->tgd_tot_pending = 0;
+ tgd->tgd_grant_compat_disable = 0;
+
+ /* populate cached statfs data */
+ osfs = &tgt_th_info(env)->tti_u.osfs;
+ rc = tgt_statfs_internal(env, lut, osfs, 0, NULL);
+ if (rc != 0) {
+ CERROR("%s: can't get statfs data, rc %d\n", tgt_name(lut),
+ rc);
+ GOTO(out, rc);
+ }
+ if (!is_power_of_2(osfs->os_bsize)) {
+ CERROR("%s: blocksize (%d) is not a power of 2\n",
+ tgt_name(lut), osfs->os_bsize);
+ GOTO(out, rc = -EPROTO);
+ }
+ tgd->tgd_blockbits = fls(osfs->os_bsize) - 1;
+
spin_lock_init(&lut->lut_translock);
spin_lock_init(&lut->lut_client_bitmap_lock);