From: Johann Lombardi Date: Sat, 11 Jul 2015 00:23:28 +0000 (-0700) Subject: LU-2049 grant: add support for OBD_CONNECT_GRANT_PARAM X-Git-Tag: 2.8.51~70 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=bd1e41672c974b97148b65115185a57ca4b7bbde LU-2049 grant: add support for OBD_CONNECT_GRANT_PARAM Add support for grant overhead calculation on the client side. To do so, clients track usage on a per-extent basis. An extent is composed of contiguous blocks. The OST now returns to the OSC layer several parameters to consume grant more accurately: - the backend filesystem block size which is the minimal grant allocation unit; - the maximum extent size; - the extent insertion cost. Clients now pack in bulk write how much grant space was consumed for the RPC. Dirty data accounting also adopts the same scheme. Moreover, each backend OSD now reports its own set of parameters: - For ldiskfs, we usually have a 4KB block size with a maximum extent size of 32MB (theoretical limit of 128MB) and an extent insertion cost of 6 x 4KB = 24KB - For ZFS, we report a block size of 128KB, an extent size of 128 blocks (i.e. 16MB with 128KB block size) and a block insertion cost of 112KB. Besides, there is now no more generic metadata overhead reservation done inside each OSD. Instead grant space is inflated for clients that do not support the new grant parameters. That said, a tiny percentage (typically 0.76%) of the free space is still reserved inside each OSD to avoid fragmentation which might hurt performance and impact our grant calculation (e.g. extents are broken due to fragmentation). This patch also fixes several other issues: - Bulk write resent by ptlrpc after reconnection could trigger spurious error messages related to broken dirty accounting. The issue was that oa_dirty is discarded for resent requests (grant flag cleared in ost_brw_write()), so we can legitimately have grant > fed_dirty in ofd_grant_check(). 
This was fixed by resetting fed_dirty on reconnection and skipping the dirty accounting check in ofd_grant_check() in the case of ptlrpc resend. - In obd_connect_data_seqprint(), the connection flags cannot fit in a 32-bit integer. - When merging two OSC extents, an extent tax should be released in both the merged extent and in the grant accounting. Signed-off-by: Johann Lombardi Signed-off-by: Jinshan Xiong Change-Id: I9c738235583324dfae7eade034db28a8161f8ef5 Reviewed-on: http://review.whamcloud.com/7793 Reviewed-by: Andreas Dilger Tested-by: Jenkins Tested-by: Maloo Reviewed-by: Nathaniel Clark Reviewed-by: Lai Siyao Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/dt_object.h b/lustre/include/dt_object.h index 664e521..7c5aae7 100644 --- a/lustre/include/dt_object.h +++ b/lustre/include/dt_object.h @@ -75,20 +75,20 @@ typedef enum { } mntopt_t; struct dt_device_param { - unsigned ddp_max_name_len; - unsigned ddp_max_nlink; - unsigned ddp_block_shift; - mntopt_t ddp_mntopts; - unsigned ddp_max_ea_size; - int ddp_mount_type; - unsigned long long ddp_maxbytes; - /* percentage of available space to reserve for grant error margin */ - int ddp_grant_reserved; - /* per-inode space consumption */ - short ddp_inodespace; - /* per-fragment grant overhead to be used by client for grant - * calculation */ - int ddp_grant_frag; + unsigned ddp_max_name_len; + unsigned ddp_max_nlink; + unsigned ddp_symlink_max; + mntopt_t ddp_mntopts; + unsigned ddp_max_ea_size; + unsigned ddp_mount_type; + unsigned long long ddp_maxbytes; + /* per-inode space consumption */ + short ddp_inodespace; + /* maximum number of blocks in an extent */ + unsigned ddp_max_extent_blks; + /* per-extent insertion overhead to be used by client for grant + * calculation */ + unsigned ddp_extent_tax; }; /** diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h index af7c90a..ed1c0d3 100644 --- a/lustre/include/lustre/lustre_idl.h +++ b/lustre/include/lustre/lustre_idl.h 
@@ -1314,7 +1314,8 @@ struct ptlrpc_body_v2 { OBD_CONNECT_LIGHTWEIGHT | OBD_CONNECT_LVB_TYPE|\ OBD_CONNECT_LAYOUTLOCK | OBD_CONNECT_FID | \ OBD_CONNECT_PINGLESS | OBD_CONNECT_LFSCK | \ - OBD_CONNECT_BULK_MBITS) + OBD_CONNECT_BULK_MBITS | \ + OBD_CONNECT_GRANT_PARAM) #define ECHO_CONNECT_SUPPORTED (0) #define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION | OBD_CONNECT_AT | \ OBD_CONNECT_FULL20 | OBD_CONNECT_IMP_RECOV | \ @@ -1336,10 +1337,10 @@ struct obd_connect_data { __u32 ocd_index; /* LOV index to connect to */ __u32 ocd_brw_size; /* Maximum BRW size in bytes */ __u64 ocd_ibits_known; /* inode bits this client understands */ - __u8 ocd_blocksize; /* log2 of the backend filesystem blocksize */ - __u8 ocd_inodespace; /* log2 of the per-inode space consumption */ - __u16 ocd_grant_extent; /* per-extent grant overhead, in 1K blocks */ - __u32 ocd_unused; /* also fix lustre_swab_connect */ + __u8 ocd_grant_blkbits; /* log2 of the backend filesystem blocksize */ + __u8 ocd_grant_inobits; /* log2 of the per-inode space consumption */ + __u16 ocd_grant_tax_kb; /* extent insertion overhead, in 1K blocks */ + __u32 ocd_grant_max_blks;/* maximum number of blocks per extent */ __u64 ocd_transno; /* first transno from client to be replayed */ __u32 ocd_group; /* MDS group on OST */ __u32 ocd_cksum_types; /* supported checksum algorithms */ diff --git a/lustre/include/lustre_export.h b/lustre/include/lustre_export.h index 9890d70..be33ad1 100644 --- a/lustre/include/lustre_export.h +++ b/lustre/include/lustre_export.h @@ -117,7 +117,7 @@ struct filter_export_data { atomic_t fed_soft_sync_count; int fed_mod_count;/* items in fed_writing list */ __u32 fed_group; - __u8 fed_pagesize; /* log2 of client page size */ + __u8 fed_pagebits; /* log2 of client page size */ }; struct mgs_export_data { diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 087df0a..e94cd49 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -192,6 +192,8 @@ struct client_obd { unsigned 
long cl_dirty_transit; /* dirty synchronous */ unsigned long cl_avail_grant; /* bytes of credit for ost */ unsigned long cl_lost_grant; /* lost credits (trunc) */ + /* grant consumed for dirty pages */ + unsigned long cl_dirty_grant; /* since we allocate grant by blocks, we don't know how many grant will * be used to add a page into cache. As a solution, we reserve maximum @@ -206,7 +208,11 @@ struct client_obd { /* A chunk is an optimal size used by osc_extent to determine * the extent size. A chunk is max(PAGE_CACHE_SIZE, OST block size) */ int cl_chunkbits; - unsigned int cl_extent_tax; /* extent overhead, by bytes */ + /* extent insertion metadata overhead to be accounted in grant, + * in bytes */ + unsigned int cl_grant_extent_tax; + /* maximum extent size, in number of pages */ + unsigned int cl_max_extent_pages; /* keep track of objects that have lois that contain pages which * have been queued for async brw. this lock also protects the diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index f8aec41..042bc2f 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -388,6 +388,7 @@ extern char obd_jobid_var[]; #define OBD_FAIL_OSC_CP_ENQ_RACE 0x410 #define OBD_FAIL_OSC_NO_GRANT 0x411 #define OBD_FAIL_OSC_DELAY_SETTIME 0x412 +#define OBD_FAIL_OSC_CONNECT_GRANT_PARAM 0x413 #define OBD_FAIL_PTLRPC 0x500 #define OBD_FAIL_PTLRPC_ACK 0x501 diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index cf17367..097394a 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -385,6 +385,10 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, GOTO(out_md_fid, err = -ENODEV); } + /* pass client page size via ocd_grant_blkbits, the server should report + * back its backend blocksize for grant calculation purpose */ + data->ocd_grant_blkbits = PAGE_SHIFT; + data->ocd_connect_flags = OBD_CONNECT_GRANT | OBD_CONNECT_VERSION | OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE | 
OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | @@ -399,6 +403,9 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt, OBD_CONNECT_PINGLESS | OBD_CONNECT_LFSCK | OBD_CONNECT_BULK_MBITS; + if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_GRANT_PARAM)) + data->ocd_connect_flags |= OBD_CONNECT_GRANT_PARAM; + if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_CKSUM)) { /* OBD_CONNECT_CKSUM should always be set, even if checksums are * disabled by default, because it can still be enabled on the diff --git a/lustre/mdd/mdd_dir.c b/lustre/mdd/mdd_dir.c index eab1629..eeff794 100644 --- a/lustre/mdd/mdd_dir.c +++ b/lustre/mdd/mdd_dir.c @@ -1994,14 +1994,14 @@ static int mdd_create_sanity_check(const struct lu_env *env, RETURN(rc); switch (cattr->la_mode & S_IFMT) { - case S_IFLNK: { - unsigned int symlen = strlen(spec->u.sp_symname) + 1; + case S_IFLNK: { + unsigned int symlen = strlen(spec->u.sp_symname) + 1; - if (symlen > (1 << m->mdd_dt_conf.ddp_block_shift)) - RETURN(-ENAMETOOLONG); - else - RETURN(0); - } + if (symlen > m->mdd_dt_conf.ddp_symlink_max) + RETURN(-ENAMETOOLONG); + else + RETURN(0); + } case S_IFDIR: case S_IFREG: case S_IFCHR: diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index 917bd6b4..e6a6781 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -729,10 +729,12 @@ static void obd_connect_data_seqprint(struct seq_file *m, if (flags & OBD_CONNECT_GRANT_PARAM) seq_printf(m, " grant_block_size: %d\n" " grant_inode_size: %d\n" - " grant_extent_overhead: %d\n", - ocd->ocd_blocksize, - ocd->ocd_inodespace, - ocd->ocd_grant_extent); + " grant_max_extent_size: %d\n" + " grant_extent_tax: %d\n", + 1 << ocd->ocd_grant_blkbits, + 1 << ocd->ocd_grant_inobits, + ocd->ocd_grant_max_blks << ocd->ocd_grant_blkbits, + ocd->ocd_grant_tax_kb << 10); if (flags & OBD_CONNECT_TRANSNO) seq_printf(m, " first_transno: "LPX64"\n", ocd->ocd_transno); diff --git a/lustre/ofd/lproc_ofd.c 
b/lustre/ofd/lproc_ofd.c index 65b57f3..8f9b855 100644 --- a/lustre/ofd/lproc_ofd.c +++ b/lustre/ofd/lproc_ofd.c @@ -148,68 +148,6 @@ static int ofd_grant_precreate_seq_show(struct seq_file *m, void *data) LPROC_SEQ_FOPS_RO(ofd_grant_precreate); /** - * Show total amount of free space reserved for grants. - * - * \param[in] m seq_file handle - * \param[in] data unused for single entry - * - * \retval 0 on success - * \retval negative value on error - */ -static int ofd_grant_ratio_seq_show(struct seq_file *m, void *data) -{ - struct obd_device *obd = m->private; - struct ofd_device *ofd; - - LASSERT(obd != NULL); - ofd = ofd_dev(obd->obd_lu_dev); - return seq_printf(m, "%d%%\n", - (int) ofd_grant_reserved(ofd, 100)); -} - -/** - * Change amount of free space reserved for grants. - * - * \param[in] file proc file - * \param[in] buffer string which represents maximum number - * \param[in] count \a buffer length - * \param[in] off unused for single entry - * - * \retval \a count on success - * \retval negative number on error - */ -static ssize_t -ofd_grant_ratio_seq_write(struct file *file, const char __user *buffer, - size_t count, loff_t *off) -{ - struct seq_file *m = file->private_data; - struct obd_device *obd = m->private; - struct ofd_device *ofd = ofd_dev(obd->obd_lu_dev); - int val; - int rc; - - rc = lprocfs_write_helper(buffer, count, &val); - if (rc) - return rc; - - if (val > 100 || val < 0) - return -EINVAL; - - if (val == 0) - CWARN("%s: disabling grant error margin\n", obd->obd_name); - if (val > 50) - CWARN("%s: setting grant error margin >50%%, be warned that " - "a huge part of the free space is now reserved for " - "grants\n", obd->obd_name); - - spin_lock(&ofd->ofd_grant_lock); - ofd->ofd_grant_ratio = ofd_grant_ratio_conv(val); - spin_unlock(&ofd->ofd_grant_lock); - return count; -} -LPROC_SEQ_FOPS(ofd_grant_ratio); - -/** * Show number of precreates allowed in a single transaction. 
* * \param[in] m seq_file handle @@ -940,8 +878,6 @@ struct lprocfs_vars lprocfs_ofd_obd_vars[] = { .fops = &ofd_tot_granted_fops }, { .name = "grant_precreate", .fops = &ofd_grant_precreate_fops }, - { .name = "grant_ratio", - .fops = &ofd_grant_ratio_fops }, { .name = "precreate_batch", .fops = &ofd_precreate_batch_fops }, { .name = "recovery_status", diff --git a/lustre/ofd/ofd_dev.c b/lustre/ofd/ofd_dev.c index d5a880c..78b2a72 100644 --- a/lustre/ofd/ofd_dev.c +++ b/lustre/ofd/ofd_dev.c @@ -634,21 +634,26 @@ static int ofd_prepare(const struct lu_env *env, struct lu_device *pdev, static int ofd_recovery_complete(const struct lu_env *env, struct lu_device *dev) { + struct ofd_thread_info *oti = ofd_info(env); struct ofd_device *ofd = ofd_dev(dev); struct lu_device *next = &ofd->ofd_osd->dd_lu_dev; - int rc = 0, max_precreate; + int rc = 0; ENTRY; /* * Grant space for object precreation on the self export. - * This initial reserved space (i.e. 10MB for zfs and 280KB for ldiskfs) + * The initial reserved space (i.e. 10MB for zfs and 280KB for ldiskfs) * is enough to create 10k objects. More space is then acquired for * precreation in ofd_grant_create(). 
*/ - max_precreate = OST_MAX_PRECREATE * ofd->ofd_dt_conf.ddp_inodespace / 2; - ofd_grant_connect(env, dev->ld_obd->obd_self_export, max_precreate, - false); + memset(&oti->fti_ocd, 0, sizeof(oti->fti_ocd)); + oti->fti_ocd.ocd_grant = OST_MAX_PRECREATE / 2; + oti->fti_ocd.ocd_grant *= ofd->ofd_dt_conf.ddp_inodespace; + oti->fti_ocd.ocd_connect_flags = OBD_CONNECT_GRANT | + OBD_CONNECT_GRANT_PARAM; + ofd_grant_connect(env, dev->ld_obd->obd_self_export, &oti->fti_ocd, + true); rc = next->ld_ops->ldo_recovery_complete(env, next); RETURN(rc); } @@ -2849,14 +2854,6 @@ static int ofd_init0(const struct lu_env *env, struct ofd_device *m, dt_conf_get(env, m->ofd_osd, &m->ofd_dt_conf); - /* Allow at most ddp_grant_reserved% of the available filesystem space - * to be granted to clients, so that any errors in the grant overhead - * calculations do not allow granting more space to clients than can be - * written. Assumes that in aggregate the grant overhead calculations do - * not have more than ddp_grant_reserved% estimation error in them. */ - m->ofd_grant_ratio = - ofd_grant_ratio_conv(m->ofd_dt_conf.ddp_grant_reserved); - rc = tgt_init(env, &m->ofd_lut, obd, m->ofd_osd, ofd_common_slice, OBD_FAIL_OST_ALL_REQUEST_NET, OBD_FAIL_OST_ALL_REPLY_NET); diff --git a/lustre/ofd/ofd_grant.c b/lustre/ofd/ofd_grant.c index 5ef22b5..02b86c7 100644 --- a/lustre/ofd/ofd_grant.c +++ b/lustre/ofd/ofd_grant.c @@ -26,10 +26,6 @@ * Copyright (c) 2012, 2015, Intel Corporation. */ /* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ -/* * lustre/ofd/ofd_grant.c * * This file provides code related to grant space management on Object Storage @@ -39,6 +35,30 @@ * Each client node is granted an initial amount of reserved space at connect * time and gets additional space back from OST in bulk write reply. * + * We actually support three different cases: + * - The client supports the new grant parameters (i.e. 
OBD_CONNECT_GRANT_PARAM) + * which means that all grant overhead calculation happens on the client side. + * The server reports at connect time the backend filesystem block size, the + * maximum extent size as well as the extent insertion cost and it is then up + * to the osc layer to track dirty extents and consume grant accordingly + * (see osc_cache.c). In each bulk write request, the client provides how much + * grant space was consumed for this RPC. + * - The client does not support OBD_CONNECT_GRANT_PARAM and always assumes + * a backend file system block size of 4KB. We then have two cases: + * - If the block size is really 4KB, then the client can deal with grant + * allocation for partial block writes, but won't take extent insertion cost + * into account. For such clients, we inflate grant by 100% on the server + * side. It means that when 32MB of grant is held by the client, 64MB of + * grant space is actually reserved on the server. All grant counters + * provided by such a client are inflated by 100%. + * - The backend filesystem block size is bigger than 4KB, which isn't + * supported by the client. In this case, we emulate a 4KB block size and + * consume one block size on the server for each 4KB of grant returned to + * client. With a 128KB blocksize, it means that 32MB dirty pages of 4KB + * on the client will actually consume 1GB of grant on the server. + * All grant counters provided by such a client are inflated by the block + * size ratio. 
+ * + * This file handles the core logic for: * - grant allocation strategy * - maintaining per-client as well as global grant space accounting @@ -54,45 +74,59 @@ #include "ofd_internal.h" -/* At least enough to send a couple of 1MB RPCs, even if not max sized */ -#define OFD_GRANT_CHUNK (2ULL * DT_MAX_BRW_SIZE) - /* Clients typically hold 2x their max_rpcs_in_flight of grant space */ #define OFD_GRANT_SHRINK_LIMIT(exp) (2ULL * 8 * exp_max_brw_size(exp)) -static inline u64 ofd_grant_from_cli(struct obd_export *exp, - struct ofd_device *ofd, u64 val) +/* Helpers to inflate/deflate grants for clients that do not support the grant + * parameters */ +static inline u64 ofd_grant_inflate(struct ofd_device *ofd, u64 val) { - if (ofd_grant_compat(exp, ofd)) - /* clients not supporting OBD_CONNECT_GRANT_PARAM actually - * consume 4KB of grant per block, we should thus inflate - * the grant counters to reflect what was actually consumed */ + if (ofd->ofd_blockbits > COMPAT_BSIZE_SHIFT) + /* Client does not support such large block size, grant + * is thus inflated. We already significantly overestimate + * overhead, no need to add the extent tax in this case */ return val << (ofd->ofd_blockbits - COMPAT_BSIZE_SHIFT); - return val; + /* client can deal with the block size, but does not support per-extent + * grant accounting, inflate grant by 100% for such clients */ + return val << 1; } -static inline u64 ofd_grant_to_cli(struct obd_export *exp, - struct ofd_device *ofd, u64 val) +/* Companion of ofd_grant_inflate() */ +static inline u64 ofd_grant_deflate(struct ofd_device *ofd, u64 val) { - if (ofd_grant_compat(exp, ofd)) + if (ofd->ofd_blockbits > COMPAT_BSIZE_SHIFT) return val >> (ofd->ofd_blockbits - COMPAT_BSIZE_SHIFT); - return val; + return val >> 1; } +/* Grant chunk is used as a unit for grant allocation. It should be inflated + * if the client does not support the grant parameters. + * Check connection flag against \a data if not NULL. 
This is used during + * connection creation where exp->exp_connect_data isn't populated yet */ static inline u64 ofd_grant_chunk(struct obd_export *exp, - struct ofd_device *ofd) + struct ofd_device *ofd, + struct obd_connect_data *data) { + u64 chunk = exp_max_brw_size(exp); + u64 tax; + if (ofd_obd(ofd)->obd_self_export == exp) /* Grant enough space to handle a big precreate request */ return OST_MAX_PRECREATE * ofd->ofd_dt_conf.ddp_inodespace / 2; - if (ofd_grant_compat(exp, ofd)) + if ((data == NULL && !ofd_grant_param_supp(exp)) || + (data != NULL && !OCD_HAS_FLAG(data, GRANT_PARAM))) /* Try to grant enough space to send a full-size RPC */ - return exp_max_brw_size(exp) << - (ofd->ofd_blockbits - COMPAT_BSIZE_SHIFT); - - /* Try to return enough to send two full RPCs, if needed */ - return exp_max_brw_size(exp) * 2; + return ofd_grant_inflate(ofd, chunk); + + /* Try to return enough to send two full-size RPCs + * = 2 * (BRW_size + #extents_in_BRW * grant_tax) */ + tax = 1ULL << ofd->ofd_blockbits; /* block size */ + tax *= ofd->ofd_dt_conf.ddp_max_extent_blks; /* max extent size */ + tax = (chunk + tax - 1) / tax; /* #extents in a RPC */ + tax *= ofd->ofd_dt_conf.ddp_extent_tax; /* extent tax for a RPC */ + chunk = (chunk + tax) * 2; /* we said two full RPCs */ + return chunk; } /** @@ -331,14 +365,6 @@ static u64 ofd_grant_space_left(struct obd_export *exp) /* Withdraw space already granted to clients */ left -= tot_granted; - /* If the left space is below the grant threshold x available space, - * stop granting space to clients. - * The purpose of this threshold is to keep some error margin on the - * overhead estimate made by the OSD layer. If we grant all the free - * space, we have no way (grant space cannot be revoked yet) to - * adjust if the write overhead has been underestimated. 
*/ - left -= min_t(u64, left, ofd_grant_reserved(ofd, avail)); - /* Align left on block size */ left &= ~((1ULL << ofd->ofd_blockbits) - 1); @@ -352,25 +378,26 @@ static u64 ofd_grant_space_left(struct obd_export *exp) /** * Process grant information from obdo structure packed in incoming BRW + * and inflate grant counters if required. * - * Grab the dirty and seen grant announcements from the incoming obdo. + * Grab the dirty and seen grant announcements from the incoming obdo and + * inflate all grant counters passed in the request if the client does not + * support the grant parameters. * We will later calculate the client's new grant and return it. * Caller must hold ofd_grant_lock spinlock. * * \param[in] env LU environment supplying osfs storage * \param[in] exp export for which we received the request * \param[in,out] oa incoming obdo sent by the client - * */ static void ofd_grant_incoming(const struct lu_env *env, struct obd_export *exp, - struct obdo *oa) + struct obdo *oa, long chunk) { struct filter_export_data *fed; struct ofd_device *ofd = ofd_exp(exp); struct obd_device *obd = exp->exp_obd; long dirty; long dropped; - long grant_chunk; ENTRY; assert_spin_locked(&ofd->ofd_grant_lock); @@ -394,16 +421,23 @@ static void ofd_grant_incoming(const struct lu_env *env, struct obd_export *exp, if ((long long)oa->o_dirty < 0) oa->o_dirty = 0; - dirty = ofd_grant_from_cli(exp, ofd, oa->o_dirty); - dropped = ofd_grant_from_cli(exp, ofd, (u64)oa->o_dropped); - grant_chunk = ofd_grant_chunk(exp, ofd); + /* inflate grant counters if required */ + if (!ofd_grant_param_supp(exp)) { + oa->o_grant = ofd_grant_inflate(ofd, oa->o_grant); + oa->o_dirty = ofd_grant_inflate(ofd, oa->o_dirty); + oa->o_dropped = ofd_grant_inflate(ofd, (u64)oa->o_dropped); + oa->o_undirty = ofd_grant_inflate(ofd, oa->o_undirty); + } + + dirty = oa->o_dirty; + dropped = oa->o_dropped; /* Update our accounting now so that statfs takes it into account. 
* Note that fed_dirty is only approximate and can become incorrect * if RPCs arrive out-of-order. No important calculations depend * on fed_dirty however, but we must check sanity to not assert. */ - if (dirty > fed->fed_grant + 4 * grant_chunk) - dirty = fed->fed_grant + 4 * grant_chunk; + if (dirty > fed->fed_grant + 4 * chunk) + dirty = fed->fed_grant + 4 * chunk; ofd->ofd_tot_dirty += dirty - fed->fed_dirty; if (fed->fed_grant < dropped) { CDEBUG(D_CACHE, @@ -460,7 +494,7 @@ static void ofd_grant_shrink(struct obd_export *exp, struct obdo *oa, OFD_GRANT_SHRINK_LIMIT(exp)) return; - grant_shrink = ofd_grant_from_cli(exp, ofd, oa->o_grant); + grant_shrink = oa->o_grant; fed = &exp->exp_filter_data; fed->fed_grant -= grant_shrink; @@ -484,6 +518,8 @@ static void ofd_grant_shrink(struct obd_export *exp, struct obdo *oa, * larger than the minimal supported page size (i.e. 4KB). * * \param[in] exp export associated which the write request + * if NULL, then size estimate is done for server-side + * grant allocation. 
* \param[in] ofd ofd device handling the request * \param[in] rnb network buffer to estimate size of * @@ -494,25 +530,38 @@ static inline u64 ofd_grant_rnb_size(struct obd_export *exp, struct ofd_device *ofd, struct niobuf_remote *rnb) { - u64 blocksize; + u64 blksize; u64 bytes; u64 end; - if (exp && ofd_grant_compat(exp, ofd)) - blocksize = 1ULL << COMPAT_BSIZE_SHIFT; + if (exp && !ofd_grant_param_supp(exp) && + ofd->ofd_blockbits > COMPAT_BSIZE_SHIFT) + blksize = 1ULL << COMPAT_BSIZE_SHIFT; else - blocksize = 1ULL << ofd->ofd_blockbits; + blksize = 1ULL << ofd->ofd_blockbits; /* The network buffer might span several blocks, align it on block * boundaries */ - bytes = rnb->rnb_offset & (blocksize - 1); + bytes = rnb->rnb_offset & (blksize - 1); bytes += rnb->rnb_len; - end = bytes & (blocksize - 1); + end = bytes & (blksize - 1); if (end) - bytes += blocksize - end; - if (exp) - /* Apply per-export pecularities if one is given */ - bytes = ofd_grant_from_cli(exp, ofd, bytes); + bytes += blksize - end; + + if (exp == NULL || ofd_grant_param_supp(exp)) { + /* add per-extent insertion cost */ + u64 max_ext; + int nr_ext; + + max_ext = blksize * ofd->ofd_dt_conf.ddp_max_extent_blks; + nr_ext = (bytes + max_ext - 1) / max_ext; + bytes += nr_ext * ofd->ofd_dt_conf.ddp_extent_tax; + } else { + /* Inflate grant space if client does not support extent-based + * grant allocation */ + bytes = ofd_grant_inflate(ofd, (u64)bytes); + } + return bytes; } @@ -549,74 +598,103 @@ static void ofd_grant_check(const struct lu_env *env, struct obd_export *exp, unsigned long ungranted = 0; unsigned long granted = 0; int i; - int resend = 0; + bool skip = false; struct ofd_thread_info *info = ofd_info(env); ENTRY; assert_spin_locked(&ofd->ofd_grant_lock); - if ((oa->o_valid & OBD_MD_FLFLAGS) && - (oa->o_flags & OBD_FL_RECOV_RESEND)) { - resend = 1; + if (obd->obd_recovering) { + /* Replaying write. Grant info have been processed already so no + * need to do any enforcement here. 
It is worth noting that only + * bulk writes with all rnbs having OBD_BRW_FROM_GRANT can be + * replayed. If one page hasn't OBD_BRW_FROM_GRANT set, then + * the whole bulk is written synchronously */ + skip = true; + CDEBUG(D_CACHE, "Replaying write, skipping accounting\n"); + } else if ((oa->o_valid & OBD_MD_FLFLAGS) && + (oa->o_flags & OBD_FL_RECOV_RESEND)) { + /* Recoverable resend, grant info have already been processed as + * well */ + skip = true; CDEBUG(D_CACHE, "Recoverable resend arrived, skipping " "accounting\n"); + } else if (ofd_grant_param_supp(exp) && oa->o_grant_used > 0) { + /* Client supports the new grant parameters and is telling us + * how much grant space it consumed for this bulk write. + * Although all rnbs are supposed to have the OBD_BRW_FROM_GRANT + * flag set, we will scan the rnb list and looks for non-cache + * I/O in case it changes in the future */ + if (fed->fed_grant >= oa->o_grant_used) { + /* skip grant accounting for rnbs with + * OBD_BRW_FROM_GRANT and just used grant consumption + * claimed in the request */ + granted = oa->o_grant_used; + skip = true; + } else { + /* client has used more grants for this request that + * it owns ... */ + CERROR("%s: cli %s claims %lu GRANT, real grant %lu\n", + exp->exp_obd->obd_name, + exp->exp_client_uuid.uuid, + (unsigned long)oa->o_grant_used, fed->fed_grant); + + /* check whether we can fill the gap with unallocated + * grant */ + if (*left > (oa->o_grant_used - fed->fed_grant)) { + /* ouf .. we are safe for now */ + granted = fed->fed_grant; + ungranted = oa->o_grant_used - granted; + *left -= ungranted; + skip = true; + } + /* too bad, but we cannot afford to blow up our grant + * accounting. The loop below will handle each rnb in + * case by case. */ + } } for (i = 0; i < niocount; i++) { int bytes; - if (obd->obd_recovering) { - /* Replaying write. Grant info have been processed - * already so no need to do any enforcement here. 
- * It is worth noting that only bulk writes with all - * rnbs having OBD_BRW_FROM_GRANT can be replayed. - * If one page hasn't OBD_BRW_FROM_GRANT set, then - * the whole bulk is written synchronously */ - if (rnb[i].rnb_flags & OBD_BRW_FROM_GRANT) { - rnb[i].rnb_flags |= OBD_BRW_GRANTED; - continue; - } else { - CERROR("%s: cli %s is replaying OST_WRITE " - "while one rnb hasn't OBD_BRW_FROM_GRANT" - " set (0x%x)\n", exp->exp_obd->obd_name, - exp->exp_client_uuid.uuid, - rnb[i].rnb_flags); - - } - } else if ((oa->o_valid & OBD_MD_FLGRANT) && - (rnb[i].rnb_flags & OBD_BRW_FROM_GRANT)) { - if (resend) { - /* This is a recoverable resend so grant - * information have already been processed */ + if ((rnb[i].rnb_flags & OBD_BRW_FROM_GRANT)) { + if (skip) { rnb[i].rnb_flags |= OBD_BRW_GRANTED; continue; } - /* inflate consumed space if needed */ + /* compute how much grant space is actually needed for + * this rnb, inflate grant if required */ bytes = ofd_grant_rnb_size(exp, ofd, &rnb[i]); - if (fed->fed_grant < granted + bytes) { - CDEBUG(D_CACHE, "%s: cli %s/%p claims %ld+%d " - "GRANT, real grant %lu idx %d\n", - exp->exp_obd->obd_name, - exp->exp_client_uuid.uuid, exp, - granted, bytes, fed->fed_grant, i); - } else { + if (fed->fed_grant >= granted + bytes) { granted += bytes; rnb[i].rnb_flags |= OBD_BRW_GRANTED; continue; } + + CDEBUG(D_CACHE, "%s: cli %s/%p claims %ld+%d GRANT, " + "real grant %lu idx %d\n", obd->obd_name, + exp->exp_client_uuid.uuid, exp, granted, bytes, + fed->fed_grant, i); } + if (obd->obd_recovering) + CERROR("%s: cli %s is replaying OST_WRITE while one rnb" + " hasn't OBD_BRW_FROM_GRANT set (0x%x)\n", + obd->obd_name, exp->exp_client_uuid.uuid, + rnb[i].rnb_flags); + /* Consume grant space on the server. * Unlike above, ofd_grant_rnb_size() is called with exp = NULL * so that the required grant space isn't inflated. 
This is * done on purpose since the server can deal with large block * size, unlike some clients */ bytes = ofd_grant_rnb_size(NULL, ofd, &rnb[i]); - if (*left > ungranted + bytes) { + if (*left > bytes) { /* if enough space, pretend it was granted */ ungranted += bytes; + *left -= bytes; rnb[i].rnb_flags |= OBD_BRW_GRANTED; continue; } @@ -624,26 +702,25 @@ static void ofd_grant_check(const struct lu_env *env, struct obd_export *exp, /* We can't check for already-mapped blocks here (make sense * when backend filesystem does not use COW) as it requires * dropping the grant lock. - * Instead, we clear ~OBD_BRW_GRANTED and in that case we need + * Instead, we clear OBD_BRW_GRANTED and in that case we need * to go through and verify if all of the blocks not marked * BRW_GRANTED are already mapped and we can ignore this error. */ rnb[i].rnb_flags &= ~OBD_BRW_GRANTED; CDEBUG(D_CACHE,"%s: cli %s/%p idx %d no space for %d\n", - exp->exp_obd->obd_name, - exp->exp_client_uuid.uuid, exp, i, bytes); + obd->obd_name, exp->exp_client_uuid.uuid, exp, i, bytes); } /* record in o_grant_used the actual space reserved for the I/O, will be * used later in ofd_grant_commmit() */ oa->o_grant_used = granted + ungranted; + info->fti_used = granted + ungranted; + /* record space used for the I/O, will be used in ofd_grant_commmit() */ /* Now substract what the clients has used already. We don't subtract * this from the tot_granted yet, so that other client's can't grab * that space before we have actually allocated our blocks. That * happens in ofd_grant_commit() after the writes are done. 
*/ - info->fti_used = granted + ungranted; - *left -= ungranted; fed->fed_grant -= granted; fed->fed_pending += oa->o_grant_used; ofd->ofd_tot_granted += ungranted; @@ -654,8 +731,9 @@ static void ofd_grant_check(const struct lu_env *env, struct obd_export *exp, "\n", obd->obd_name, exp->exp_client_uuid.uuid, exp, granted, ungranted, fed->fed_grant, fed->fed_dirty); - if (obd->obd_recovering) - /* don't update dirty accounting during recovery */ + if (obd->obd_recovering || (oa->o_valid & OBD_MD_FLGRANT) == 0) + /* don't update dirty accounting during recovery or + * if grant information got discarded (e.g. during resend) */ RETURN_EXIT; if (fed->fed_dirty < granted) { @@ -698,12 +776,12 @@ static void ofd_grant_check(const struct lu_env *env, struct obd_export *exp, * \retval amount of grant space allocated */ static long ofd_grant_alloc(struct obd_export *exp, u64 curgrant, - u64 want, u64 left, bool conservative) + u64 want, u64 left, long chunk, + bool conservative) { struct obd_device *obd = exp->exp_obd; struct ofd_device *ofd = ofd_exp(exp); struct filter_export_data *fed = &exp->exp_filter_data; - long grant_chunk; u64 grant; ENTRY; @@ -717,12 +795,6 @@ static long ofd_grant_alloc(struct obd_export *exp, u64 curgrant, RETURN(0); } - /* client not supporting OBD_CONNECT_GRANT_PARAM works with a 4KB block - * size while the reality is different */ - curgrant = ofd_grant_from_cli(exp, ofd, curgrant); - want = ofd_grant_from_cli(exp, ofd, want); - grant_chunk = ofd_grant_chunk(exp, ofd); - /* Grant some fraction of the client's requested grant space so that * they are not always waiting for write credits (not all of it to * avoid overgranting in face of multiple RPCs in flight). This @@ -732,8 +804,8 @@ static long ofd_grant_alloc(struct obd_export *exp, u64 curgrant, * has and what we think it has, don't grant very much and let the * client consume its grant first. 
Either it just has lots of RPCs * in flight, or it was evicted and its grants will soon be used up. */ - if (curgrant >= want || curgrant >= fed->fed_grant + grant_chunk) - RETURN(0); + if (curgrant >= want || curgrant >= fed->fed_grant + chunk) + RETURN(0); if (obd->obd_recovering) conservative = false; @@ -743,16 +815,16 @@ static long ofd_grant_alloc(struct obd_export *exp, u64 curgrant, * one chunk */ left >>= 3; grant = min(want - curgrant, left); - /* round grant upt to the next block size */ + /* round grant up to the next block size */ grant = (grant + (1 << ofd->ofd_blockbits) - 1) & ~((1ULL << ofd->ofd_blockbits) - 1); if (!grant) RETURN(0); - /* Limit to ofd_grant_chunk() if not reconnect/recovery */ - if ((grant > grant_chunk) && conservative) - grant = grant_chunk; + /* Limit to grant_chunk if not reconnect/recovery */ + if ((grant > chunk) && conservative) + grant = chunk; ofd->ofd_tot_granted += grant; fed->fed_grant += grant; @@ -775,7 +847,7 @@ static long ofd_grant_alloc(struct obd_export *exp, u64 curgrant, exp, ofd->ofd_tot_dirty, ofd->ofd_tot_granted, obd->obd_num_exports); - RETURN(ofd_grant_to_cli(exp, ofd, grant)); + RETURN(grant); } /** @@ -788,27 +860,37 @@ static long ofd_grant_alloc(struct obd_export *exp, u64 curgrant, * * \param[in] env LU environment provided by the caller * \param[in] exp client's export which is (re)connecting - * \param[in] want how much grant space the client would like to get + * \param[in,out] data obd_connect_data structure sent by the client in the + * connect request * \param[in] new_conn must set to true if this is a new connection and false * for a reconnection - * - * \retval amount of grant space currently owned by the client */ -long ofd_grant_connect(const struct lu_env *env, struct obd_export *exp, - u64 want, bool new_conn) +void ofd_grant_connect(const struct lu_env *env, struct obd_export *exp, + struct obd_connect_data *data, bool new_conn) { struct ofd_device *ofd = ofd_exp(exp); struct 
filter_export_data *fed = &exp->exp_filter_data; u64 left = 0; - long grant; + u64 want; + long chunk; int from_cache; int force = 0; /* can use cached data */ /* don't grant space to client with read-only access */ - if ((exp_connect_flags(exp) & OBD_CONNECT_RDONLY) || - ofd_grant_prohibit(exp, ofd)) - return 0; + if (OCD_HAS_FLAG(data, RDONLY) || + (!OCD_HAS_FLAG(data, GRANT_PARAM) && + ofd->ofd_grant_compat_disable)) { + data->ocd_grant = 0; + data->ocd_connect_flags &= ~(OBD_CONNECT_GRANT | + OBD_CONNECT_GRANT_PARAM); + RETURN_EXIT; + } + if (OCD_HAS_FLAG(data, GRANT_PARAM)) + want = data->ocd_grant; + else + want = ofd_grant_inflate(ofd, data->ocd_grant); + chunk = ofd_grant_chunk(exp, ofd, data); refresh: ofd_grant_statfs(env, exp, force, &from_cache); @@ -819,28 +901,37 @@ refresh: left = ofd_grant_space_left(exp); /* get fresh statfs data if we are short in ungranted space */ - if (from_cache && left < 32 * ofd_grant_chunk(exp, ofd)) { + if (from_cache && left < 32 * chunk) { spin_unlock(&ofd->ofd_grant_lock); CDEBUG(D_CACHE, "fs has no space left and statfs too old\n"); force = 1; goto refresh; } - ofd_grant_alloc(exp, - ofd_grant_to_cli(exp, ofd, (u64)fed->fed_grant), - want, left, new_conn); + ofd_grant_alloc(exp, (u64)fed->fed_grant, want, left, chunk, new_conn); /* return to client its current grant */ - grant = ofd_grant_to_cli(exp, ofd, (u64)fed->fed_grant); - ofd->ofd_tot_granted_clients++; + if (OCD_HAS_FLAG(data, GRANT_PARAM)) + data->ocd_grant = fed->fed_grant; + else + /* deflate grant */ + data->ocd_grant = ofd_grant_deflate(ofd, + (u64)fed->fed_grant); + + /* reset dirty accounting */ + ofd->ofd_tot_dirty -= fed->fed_dirty; + fed->fed_dirty = 0; + + if (new_conn && OCD_HAS_FLAG(data, GRANT)) + ofd->ofd_tot_granted_clients++; spin_unlock(&ofd->ofd_grant_lock); - CDEBUG(D_CACHE, "%s: cli %s/%p ocd_grant: %ld want: "LPU64" left: " + CDEBUG(D_CACHE, "%s: cli %s/%p ocd_grant: %d want: "LPU64" left: " LPU64"\n", exp->exp_obd->obd_name, 
exp->exp_client_uuid.uuid, - exp, grant, want, left); + exp, data->ocd_grant, want, left); - return grant; + EXIT; } /** @@ -900,14 +991,15 @@ void ofd_grant_prepare_read(const struct lu_env *env, struct ofd_device *ofd = ofd_exp(exp); int do_shrink; u64 left = 0; + ENTRY; if (!oa) - return; + RETURN_EXIT; if ((oa->o_valid & OBD_MD_FLGRANT) == 0) /* The read request does not contain any grant * information */ - return; + RETURN_EXIT; if ((oa->o_valid & OBD_MD_FLFLAGS) && (oa->o_flags & OBD_FL_SHRINK_GRANT)) { @@ -935,8 +1027,9 @@ void ofd_grant_prepare_read(const struct lu_env *env, do_shrink = 0; } - /* extract incoming grant infomation provided by the client */ - ofd_grant_incoming(env, exp, oa); + /* extract incoming grant information provided by the client and + * inflate grant counters if required */ + ofd_grant_incoming(env, exp, oa, ofd_grant_chunk(exp, ofd, NULL)); /* unlike writes, we don't return grants back on reads unless a grant * shrink request was packed and we decided to turn it down. 
*/ @@ -945,7 +1038,10 @@ void ofd_grant_prepare_read(const struct lu_env *env, else oa->o_grant = 0; + if (!ofd_grant_param_supp(exp)) + oa->o_grant = ofd_grant_deflate(ofd, oa->o_grant); spin_unlock(&ofd->ofd_grant_lock); + EXIT; } /** @@ -978,6 +1074,7 @@ void ofd_grant_prepare_write(const struct lu_env *env, u64 left; int from_cache; int force = 0; /* can use cached data intially */ + long chunk = ofd_grant_chunk(exp, ofd, NULL); int rc; ENTRY; @@ -993,7 +1090,7 @@ refresh: left = ofd_grant_space_left(exp); /* Get fresh statfs data if we are short in ungranted space */ - if (from_cache && left < 32 * ofd_grant_chunk(exp, ofd)) { + if (from_cache && left < 32 * chunk) { spin_unlock(&ofd->ofd_grant_lock); CDEBUG(D_CACHE, "%s: fs has no space left and statfs too old\n", obd->obd_name); @@ -1004,7 +1101,7 @@ refresh: /* When close to free space exhaustion, trigger a sync to force * writeback cache to consume required space immediately and release as * much space as possible. */ - if (!obd->obd_recovering && force != 2 && left < OFD_GRANT_CHUNK) { + if (!obd->obd_recovering && force != 2 && left < chunk) { bool from_grant = true; int i; @@ -1026,8 +1123,9 @@ refresh: } } - /* extract incoming grant information provided by the client */ - ofd_grant_incoming(env, exp, oa); + /* extract incoming grant information provided by the client, + * and inflate grant counters if required */ + ofd_grant_incoming(env, exp, oa, chunk); /* check limit */ ofd_grant_check(env, exp, oa, rnb, niocount, &left); @@ -1045,8 +1143,12 @@ refresh: else /* grant more space back to the client if possible */ oa->o_grant = ofd_grant_alloc(exp, oa->o_grant, oa->o_undirty, - left, true); + left, chunk, true); + + if (!ofd_grant_param_supp(exp)) + oa->o_grant = ofd_grant_deflate(ofd, oa->o_grant); spin_unlock(&ofd->ofd_grant_lock); + EXIT; } /** @@ -1137,10 +1239,14 @@ long ofd_grant_create(const struct lu_env *env, struct obd_export *exp, int *nr) /* grant more space for precreate purpose if 
possible. */ wanted = OST_MAX_PRECREATE * ofd->ofd_dt_conf.ddp_inodespace / 2; if (wanted > fed->fed_grant) { + long chunk; + /* always try to book enough space to handle a large precreate * request */ + chunk = ofd_grant_chunk(exp, ofd, NULL); wanted -= fed->fed_grant; - ofd_grant_alloc(exp, fed->fed_grant, wanted, left, false); + ofd_grant_alloc(exp, fed->fed_grant, wanted, left, chunk, + false); } spin_unlock(&ofd->ofd_grant_lock); RETURN(granted); diff --git a/lustre/ofd/ofd_internal.h b/lustre/ofd/ofd_internal.h index ad868be..662f833 100644 --- a/lustre/ofd/ofd_internal.h +++ b/lustre/ofd/ofd_internal.h @@ -161,11 +161,6 @@ struct ofd_device { u64 ofd_tot_granted; /* grant used by I/Os in progress (between prepare and commit) */ u64 ofd_tot_pending; - /* free space threshold over which we stop granting space to clients - * ofd_grant_ratio is stored as a fixed-point fraction using - * OFD_GRANT_RATIO_SHIFT of the remaining free space, not in percentage - * values */ - int ofd_grant_ratio; /* number of clients using grants */ int ofd_tot_granted_clients; @@ -321,7 +316,10 @@ struct ofd_thread_info { /* Space used by the I/O, used by grant code */ unsigned long fti_used; struct ost_lvb fti_lvb; - struct lfsck_request fti_lr; + union { + struct lfsck_request fti_lr; + struct obd_connect_data fti_ocd; + }; }; extern void target_recovery_fini(struct obd_device *obd); @@ -431,17 +429,6 @@ struct ofd_object *ofd_object_find_exists(const struct lu_env *env, } /* ofd_grants.c */ -#define OFD_GRANT_RATIO_SHIFT 8 -static inline u64 ofd_grant_reserved(struct ofd_device *ofd, u64 bavail) -{ - return (bavail * ofd->ofd_grant_ratio) >> OFD_GRANT_RATIO_SHIFT; -} - -static inline int ofd_grant_ratio_conv(int percentage) -{ - return (percentage << OFD_GRANT_RATIO_SHIFT) / 100; -} - static inline int ofd_grant_param_supp(struct obd_export *exp) { return !!(exp_connect_flags(exp) & OBD_CONNECT_GRANT_PARAM); @@ -451,16 +438,6 @@ static inline int ofd_grant_param_supp(struct 
obd_export *exp) * That's 4KB=2^12 which is the biggest block size known to work whatever * the client's page size is. */ #define COMPAT_BSIZE_SHIFT 12 -static inline int ofd_grant_compat(struct obd_export *exp, - struct ofd_device *ofd) -{ - /* Clients which don't support OBD_CONNECT_GRANT_PARAM cannot handle - * a block size > page size and consume PAGE_CACHE_SIZE of grant when - * dirtying a page regardless of the block size */ - return !!(ofd_obd(ofd)->obd_self_export != exp && - ofd->ofd_blockbits > COMPAT_BSIZE_SHIFT && - !ofd_grant_param_supp(exp)); -} static inline int ofd_grant_prohibit(struct obd_export *exp, struct ofd_device *ofd) @@ -469,12 +446,13 @@ static inline int ofd_grant_prohibit(struct obd_export *exp, * clients not supporting OBD_CONNECT_GRANT_PARAM. * Otherwise, space granted to such a client is inflated since it * consumes PAGE_CACHE_SIZE of grant space per block */ - return !!(ofd_grant_compat(exp, ofd) && ofd->ofd_grant_compat_disable); + return !!(ofd_obd(ofd)->obd_self_export != exp && + !ofd_grant_param_supp(exp) && ofd->ofd_grant_compat_disable); } void ofd_grant_sanity_check(struct obd_device *obd, const char *func); -long ofd_grant_connect(const struct lu_env *env, struct obd_export *exp, - u64 want, bool new_conn); +void ofd_grant_connect(const struct lu_env *env, struct obd_export *exp, + struct obd_connect_data *data, bool new_conn); void ofd_grant_discard(struct obd_export *exp); void ofd_grant_prepare_read(const struct lu_env *env, struct obd_export *exp, struct obdo *oa); diff --git a/lustre/ofd/ofd_obd.c b/lustre/ofd/ofd_obd.c index 1be25b5..d42e1c5 100644 --- a/lustre/ofd/ofd_obd.c +++ b/lustre/ofd/ofd_obd.c @@ -173,18 +173,21 @@ static int ofd_parse_connect_data(const struct lu_env *env, else if (data->ocd_connect_flags & OBD_CONNECT_SKIP_ORPHAN) RETURN(-EPROTO); - if (ofd_grant_param_supp(exp)) { - exp->exp_filter_data.fed_pagesize = data->ocd_blocksize; - /* ocd_{blocksize,inodespace} are log2 values */ - 
data->ocd_blocksize = ofd->ofd_blockbits; - data->ocd_inodespace = ofd->ofd_dt_conf.ddp_inodespace; - /* ocd_grant_extent is in 1K blocks */ - data->ocd_grant_extent = ofd->ofd_dt_conf.ddp_grant_frag >> 10; + if (OCD_HAS_FLAG(data, GRANT_PARAM)) { + /* client is reporting its page size, for future use */ + exp->exp_filter_data.fed_pagebits = data->ocd_grant_blkbits; + data->ocd_grant_blkbits = ofd->ofd_blockbits; + /* ddp_inodespace may not be power-of-two value, eg. for ldiskfs + * it's LDISKFS_DIR_REC_LEN(20) = 28. */ + data->ocd_grant_inobits = + fls(ofd->ofd_dt_conf.ddp_inodespace - 1); + /* ocd_grant_tax_kb is in 1K byte blocks */ + data->ocd_grant_tax_kb = ofd->ofd_dt_conf.ddp_extent_tax >> 10; + data->ocd_grant_max_blks = ofd->ofd_dt_conf.ddp_max_extent_blks; } - if (data->ocd_connect_flags & OBD_CONNECT_GRANT) - data->ocd_grant = ofd_grant_connect(env, exp, data->ocd_grant, - new_connection); + if (OCD_HAS_FLAG(data, GRANT)) + ofd_grant_connect(env, exp, data, new_connection); if (data->ocd_connect_flags & OBD_CONNECT_INDEX) { struct lr_server_data *lsd = &ofd->ofd_lut.lut_lsd; @@ -508,10 +511,8 @@ static int ofd_destroy_export(struct obd_export *exp) ofd_grant_discard(exp); ofd_fmd_cleanup(exp); - if (exp_connect_flags(exp) & OBD_CONNECT_GRANT_SHRINK) { - if (ofd->ofd_tot_granted_clients > 0) - ofd->ofd_tot_granted_clients --; - } + if (exp_connect_flags(exp) & OBD_CONNECT_GRANT) + ofd->ofd_tot_granted_clients--; if (!(exp->exp_flags & OBD_OPT_FORCE)) ofd_grant_sanity_check(exp->exp_obd, __FUNCTION__); @@ -848,7 +849,8 @@ int ofd_statfs(const struct lu_env *env, struct obd_export *exp, if (ofd->ofd_raid_degraded) osfs->os_state |= OS_STATE_DEGRADED; - if (obd->obd_self_export != exp && ofd_grant_compat(exp, ofd)) { + if (obd->obd_self_export != exp && !ofd_grant_param_supp(exp) && + ofd->ofd_blockbits > COMPAT_BSIZE_SHIFT) { /* clients which don't support OBD_CONNECT_GRANT_PARAM * should not see a block size > page size, otherwise * cl_lost_grant goes mad. 
Therefore, we emulate a 4KB (=2^12) diff --git a/lustre/osc/lproc_osc.c b/lustre/osc/lproc_osc.c index 983442f..d3d5f71 100644 --- a/lustre/osc/lproc_osc.c +++ b/lustre/osc/lproc_osc.c @@ -317,6 +317,19 @@ static int osc_cur_lost_grant_bytes_seq_show(struct seq_file *m, void *v) } LPROC_SEQ_FOPS_RO(osc_cur_lost_grant_bytes); +static int osc_cur_dirty_grant_bytes_seq_show(struct seq_file *m, void *v) +{ + struct obd_device *dev = m->private; + struct client_obd *cli = &dev->u.cli; + int rc; + + spin_lock(&cli->cl_loi_list_lock); + rc = seq_printf(m, "%lu\n", cli->cl_dirty_grant); + spin_unlock(&cli->cl_loi_list_lock); + return rc; +} +LPROC_SEQ_FOPS_RO(osc_cur_dirty_grant_bytes); + static int osc_grant_shrink_interval_seq_show(struct seq_file *m, void *v) { struct obd_device *obd = m->private; @@ -628,6 +641,8 @@ struct lprocfs_vars lprocfs_osc_obd_vars[] = { .fops = &osc_cur_grant_bytes_fops }, { .name = "cur_lost_grant_bytes", .fops = &osc_cur_lost_grant_bytes_fops }, + { .name = "cur_dirty_grant_bytes", + .fops = &osc_cur_dirty_grant_bytes_fops }, { .name = "grant_shrink_interval", .fops = &osc_grant_shrink_interval_fops }, { .name = "checksums", diff --git a/lustre/osc/osc_cache.c b/lustre/osc/osc_cache.c index afa0eaa..2b15cb8 100644 --- a/lustre/osc/osc_cache.c +++ b/lustre/osc/osc_cache.c @@ -58,13 +58,16 @@ static int osc_refresh_count(const struct lu_env *env, static int osc_io_unplug_async(const struct lu_env *env, struct client_obd *cli, struct osc_object *osc); static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, - unsigned int lost_grant); + unsigned int lost_grant, unsigned int dirty_grant); static void osc_extent_tree_dump0(int level, struct osc_object *obj, const char *func, int line); #define osc_extent_tree_dump(lvl, obj) \ osc_extent_tree_dump0(lvl, obj, __func__, __LINE__) +static void osc_unreserve_grant(struct client_obd *cli, unsigned int reserved, + unsigned int unused); + /** \addtogroup osc * @{ */ @@ -497,15 +500,16 @@ 
static void osc_extent_remove(struct osc_extent *ext) /** * This function is used to merge extents to get better performance. It checks - * if @cur and @victim are contiguous at chunk level. + * if @cur and @victim are contiguous at block level. */ static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur, struct osc_extent *victim) { - struct osc_object *obj = cur->oe_obj; - pgoff_t chunk_start; - pgoff_t chunk_end; - int ppc_bits; + struct osc_object *obj = cur->oe_obj; + struct client_obd *cli = osc_cli(obj); + pgoff_t chunk_start; + pgoff_t chunk_end; + int ppc_bits; LASSERT(cur->oe_state == OES_CACHE); LASSERT(osc_object_is_locked(obj)); @@ -526,11 +530,18 @@ static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur, chunk_end + 1 != victim->oe_start >> ppc_bits) return -ERANGE; + /* overall extent size should not exceed the max supported limit + * reported by the server */ + if (cur->oe_end - cur->oe_start + 1 + + victim->oe_end - victim->oe_start + 1 > cli->cl_max_extent_pages) + return -ERANGE; + OSC_EXTENT_DUMP(D_CACHE, victim, "will be merged by %p.\n", cur); cur->oe_start = min(cur->oe_start, victim->oe_start); cur->oe_end = max(cur->oe_end, victim->oe_end); - cur->oe_grants += victim->oe_grants; + /* per-extent tax should be accounted only once for the whole extent */ + cur->oe_grants += victim->oe_grants - cli->cl_grant_extent_tax; cur->oe_nr_pages += victim->oe_nr_pages; /* only the following bits are needed to merge */ cur->oe_urgent |= victim->oe_urgent; @@ -553,6 +564,7 @@ static int osc_extent_merge(const struct lu_env *env, struct osc_extent *cur, int osc_extent_release(const struct lu_env *env, struct osc_extent *ext) { struct osc_object *obj = ext->oe_obj; + struct client_obd *cli = osc_cli(obj); int rc = 0; ENTRY; @@ -569,13 +581,19 @@ int osc_extent_release(const struct lu_env *env, struct osc_extent *ext) osc_extent_state_set(ext, OES_TRUNC); ext->oe_trunc_pending = 0; } else { + int grant = 0; + 
osc_extent_state_set(ext, OES_CACHE); osc_update_pending(obj, OBD_BRW_WRITE, ext->oe_nr_pages); /* try to merge the previous and next extent. */ - osc_extent_merge(env, ext, prev_extent(ext)); - osc_extent_merge(env, ext, next_extent(ext)); + if (osc_extent_merge(env, ext, prev_extent(ext)) == 0) + grant += cli->cl_grant_extent_tax; + if (osc_extent_merge(env, ext, next_extent(ext)) == 0) + grant += cli->cl_grant_extent_tax; + if (grant > 0) + osc_unreserve_grant(cli, 0, grant); if (ext->oe_urgent) list_move_tail(&ext->oe_link, @@ -583,7 +601,7 @@ int osc_extent_release(const struct lu_env *env, struct osc_extent *ext) } osc_object_unlock(obj); - osc_io_unplug_async(env, osc_cli(obj), obj); + osc_io_unplug_async(env, cli, obj); } osc_extent_put(env, ext); RETURN(rc); @@ -658,8 +676,8 @@ static struct osc_extent *osc_extent_find(const struct lu_env *env, } /* grants has been allocated by caller */ - LASSERTF(*grants >= chunksize + cli->cl_extent_tax, - "%u/%u/%u.\n", *grants, chunksize, cli->cl_extent_tax); + LASSERTF(*grants >= chunksize + cli->cl_grant_extent_tax, + "%u/%u/%u.\n", *grants, chunksize, cli->cl_grant_extent_tax); LASSERTF((max_end - cur->oe_start) < max_pages, EXTSTR"\n", EXTPARA(cur)); @@ -732,6 +750,13 @@ restart: continue; } + /* check whether maximum extent size will be hit */ + if ((ext_chk_end - ext_chk_start + 1 + 1) << ppc_bits > + cli->cl_max_extent_pages) { + ext = next_extent(ext); + continue; + } + /* it's required that an extent must be contiguous at chunk * level so that we know the whole extent is covered by grant * (the pages in the extent are NOT required to be contiguous). 
@@ -759,7 +784,7 @@ restart: * in a gap */ if (osc_extent_merge(env, ext, next_extent(ext)) == 0) /* we can save extent tax from next extent */ - *grants += cli->cl_extent_tax; + *grants += cli->cl_grant_extent_tax; found = osc_extent_hold(ext); } @@ -780,7 +805,7 @@ restart: } else if (conflict == NULL) { /* create a new extent */ EASSERT(osc_extent_is_overlapped(obj, cur) == 0, cur); - cur->oe_grants = chunksize + cli->cl_extent_tax; + cur->oe_grants = chunksize + cli->cl_grant_extent_tax; *grants -= cur->oe_grants; LASSERT(*grants >= 0); @@ -865,7 +890,7 @@ int osc_extent_finish(const struct lu_env *env, struct osc_extent *ext, lost_grant = PAGE_CACHE_SIZE - count; } if (ext->oe_grants > 0) - osc_free_grant(cli, nr_pages, lost_grant); + osc_free_grant(cli, nr_pages, lost_grant, ext->oe_grants); osc_extent_remove(ext); /* put the refcount for RPC */ @@ -1040,7 +1065,7 @@ static int osc_extent_truncate(struct osc_extent *ext, pgoff_t trunc_index, osc_object_unlock(obj); if (grants > 0 || nr_pages > 0) - osc_free_grant(cli, nr_pages, grants); + osc_free_grant(cli, nr_pages, grants, grants); out: cl_io_fini(env, io); @@ -1158,9 +1183,14 @@ static int osc_extent_expand(struct osc_extent *ext, pgoff_t index, GOTO(out, rc = 0); LASSERT(end_chunk + 1 == chunk); + /* try to expand this extent to cover @index */ end_index = min(ext->oe_max_end, ((chunk + 1) << ppc_bits) - 1); + /* don't go over the maximum extent size reported by server */ + if (end_index - ext->oe_start + 1 > cli->cl_max_extent_pages) + GOTO(out, rc = -ERANGE); + next = next_extent(ext); if (next != NULL && next->oe_start <= end_index) /* complex mode - overlapped with the next extent, @@ -1327,13 +1357,15 @@ static int osc_completion(const struct lu_env *env, struct osc_async_page *oap, #define OSC_DUMP_GRANT(lvl, cli, fmt, args...) 
do { \ struct client_obd *__tmp = (cli); \ - CDEBUG(lvl, "%s: grant { dirty: %lu/%lu dirty_pages: %ld/%lu " \ - "dropped: %ld avail: %ld, reserved: %ld, flight: %d }" \ - "lru {in list: %ld, left: %ld, waiters: %d }"fmt"\n", \ + CDEBUG(lvl, "%s: grant { dirty: %ld/%ld dirty_pages: %ld/%lu " \ + "dropped: %ld avail: %ld, dirty_grant: %ld, " \ + "reserved: %ld, flight: %d } lru {in list: %ld, " \ + "left: %ld, waiters: %d }" fmt "\n", \ cli_name(__tmp), \ __tmp->cl_dirty_pages, __tmp->cl_dirty_max_pages, \ atomic_long_read(&obd_dirty_pages), obd_max_dirty_pages, \ __tmp->cl_lost_grant, __tmp->cl_avail_grant, \ + __tmp->cl_dirty_grant, \ __tmp->cl_reserved_grant, __tmp->cl_w_in_flight, \ atomic_long_read(&__tmp->cl_lru_in_list), \ atomic_long_read(&__tmp->cl_lru_busy), \ @@ -1407,8 +1439,10 @@ static void __osc_unreserve_grant(struct client_obd *cli, if (unused > reserved) { cli->cl_avail_grant += reserved; cli->cl_lost_grant += unused - reserved; + cli->cl_dirty_grant -= unused - reserved; } else { cli->cl_avail_grant += unused; + cli->cl_dirty_grant += reserved - unused; } } @@ -1436,14 +1470,17 @@ static void osc_unreserve_grant(struct client_obd *cli, * See filter_grant_check() for details. 
*/ static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, - unsigned int lost_grant) + unsigned int lost_grant, unsigned int dirty_grant) { - unsigned long grant = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; + unsigned long grant; + + grant = (1 << cli->cl_chunkbits) + cli->cl_grant_extent_tax; spin_lock(&cli->cl_loi_list_lock); atomic_long_sub(nr_pages, &obd_dirty_pages); cli->cl_dirty_pages -= nr_pages; cli->cl_lost_grant += lost_grant; + cli->cl_dirty_grant -= dirty_grant; if (cli->cl_avail_grant < grant && cli->cl_lost_grant >= grant) { /* borrow some grant from truncate to avoid the case that * truncate uses up all avail grant */ @@ -1452,9 +1489,10 @@ static void osc_free_grant(struct client_obd *cli, unsigned int nr_pages, } osc_wake_cache_waiters(cli); spin_unlock(&cli->cl_loi_list_lock); - CDEBUG(D_CACHE, "lost %u grant: %lu avail: %lu dirty: %lu\n", + CDEBUG(D_CACHE, "lost %u grant: %lu avail: %lu dirty: %lu/%lu\n", lost_grant, cli->cl_lost_grant, - cli->cl_avail_grant, cli->cl_dirty_pages << PAGE_CACHE_SHIFT); + cli->cl_avail_grant, cli->cl_dirty_pages << PAGE_CACHE_SHIFT, + cli->cl_dirty_grant); } /** @@ -2334,7 +2372,7 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, if (ext != NULL && ext->oe_start <= index && ext->oe_max_end >= index) { /* one chunk plus extent overhead must be enough to write this * page */ - grants = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; + grants = (1 << cli->cl_chunkbits) + cli->cl_grant_extent_tax; if (ext->oe_end >= index) grants = 0; @@ -2371,7 +2409,7 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, } if (ext == NULL) { - tmp = (1 << cli->cl_chunkbits) + cli->cl_extent_tax; + tmp = (1 << cli->cl_chunkbits) + cli->cl_grant_extent_tax; /* try to find new extent to cover this page */ LASSERT(oio->oi_active == NULL); diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 72871ae..2ef9e455 100644 --- a/lustre/osc/osc_request.c +++ 
b/lustre/osc/osc_request.c @@ -573,7 +573,10 @@ static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, oa->o_valid |= bits; spin_lock(&cli->cl_loi_list_lock); - oa->o_dirty = cli->cl_dirty_pages << PAGE_CACHE_SHIFT; + if (OCD_HAS_FLAG(&cli->cl_import->imp_connect_data, GRANT_PARAM)) + oa->o_dirty = cli->cl_dirty_grant; + else + oa->o_dirty = cli->cl_dirty_pages << PAGE_CACHE_SHIFT; if (unlikely(cli->cl_dirty_pages - cli->cl_dirty_transit > cli->cl_dirty_max_pages)) { CERROR("dirty %lu - %lu > dirty_max %lu\n", @@ -597,11 +600,22 @@ static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, cli->cl_dirty_pages, cli->cl_dirty_max_pages); oa->o_undirty = 0; } else { - unsigned long max_in_flight = (cli->cl_max_pages_per_rpc << - PAGE_CACHE_SHIFT) * - (cli->cl_max_rpcs_in_flight + 1); - oa->o_undirty = max(cli->cl_dirty_max_pages << PAGE_CACHE_SHIFT, - max_in_flight); + unsigned long nrpages; + + nrpages = cli->cl_max_pages_per_rpc; + nrpages *= cli->cl_max_rpcs_in_flight + 1; + nrpages = max(nrpages, cli->cl_dirty_max_pages); + oa->o_undirty = nrpages << PAGE_CACHE_SHIFT; + if (OCD_HAS_FLAG(&cli->cl_import->imp_connect_data, + GRANT_PARAM)) { + int nrextents; + + /* take extent tax into account when asking for more + * grant space */ + nrextents = (nrpages + cli->cl_max_extent_pages - 1) / + cli->cl_max_extent_pages; + oa->o_undirty += nrextents * cli->cl_grant_extent_tax; + } } oa->o_grant = cli->cl_avail_grant + cli->cl_reserved_grant; oa->o_dropped = cli->cl_lost_grant; @@ -609,7 +623,6 @@ static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, spin_unlock(&cli->cl_loi_list_lock); CDEBUG(D_CACHE,"dirty: "LPU64" undirty: %u dropped %u grant: "LPU64"\n", oa->o_dirty, oa->o_undirty, oa->o_dropped, oa->o_grant); - } void osc_update_next_shrink(struct client_obd *cli) @@ -807,11 +820,15 @@ static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd) * left EVICTED state, then cl_dirty_pages must be 0 
already. */ spin_lock(&cli->cl_loi_list_lock); - if (cli->cl_import->imp_state == LUSTRE_IMP_EVICTED) - cli->cl_avail_grant = ocd->ocd_grant; - else - cli->cl_avail_grant = ocd->ocd_grant - - (cli->cl_dirty_pages << PAGE_CACHE_SHIFT); + cli->cl_avail_grant = ocd->ocd_grant; + if (cli->cl_import->imp_state != LUSTRE_IMP_EVICTED) { + cli->cl_avail_grant -= cli->cl_reserved_grant; + if (OCD_HAS_FLAG(ocd, GRANT_PARAM)) + cli->cl_avail_grant -= cli->cl_dirty_grant; + else + cli->cl_avail_grant -= + cli->cl_dirty_pages << PAGE_CACHE_SHIFT; + } if (cli->cl_avail_grant < 0) { CWARN("%s: available grant < 0: avail/ocd/dirty %ld/%u/%ld\n", @@ -822,13 +839,31 @@ static void osc_init_grant(struct client_obd *cli, struct obd_connect_data *ocd) cli->cl_avail_grant = ocd->ocd_grant; } - /* determine the appropriate chunk size used by osc_extent. */ - cli->cl_chunkbits = max_t(int, PAGE_CACHE_SHIFT, ocd->ocd_blocksize); + if (OCD_HAS_FLAG(ocd, GRANT_PARAM)) { + u64 size; + + /* overhead for each extent insertion */ + cli->cl_grant_extent_tax = ocd->ocd_grant_tax_kb << 10; + /* determine the appropriate chunk size used by osc_extent. */ + cli->cl_chunkbits = max_t(int, PAGE_CACHE_SHIFT, + ocd->ocd_grant_blkbits); + /* determine maximum extent size, in #pages */ + size = (u64)ocd->ocd_grant_max_blks << ocd->ocd_grant_blkbits; + cli->cl_max_extent_pages = size >> PAGE_CACHE_SHIFT; + if (cli->cl_max_extent_pages == 0) + cli->cl_max_extent_pages = 1; + } else { + cli->cl_grant_extent_tax = 0; + cli->cl_chunkbits = PAGE_CACHE_SHIFT; + cli->cl_max_extent_pages = DT_MAX_BRW_PAGES; + } spin_unlock(&cli->cl_loi_list_lock); CDEBUG(D_CACHE, "%s, setting cl_avail_grant: %ld cl_lost_grant: %ld." 
- "chunk bits: %d.\n", cli_name(cli), cli->cl_avail_grant, - cli->cl_lost_grant, cli->cl_chunkbits); + "chunk bits: %d cl_max_extent_pages: %d\n", + cli_name(cli), + cli->cl_avail_grant, cli->cl_lost_grant, cli->cl_chunkbits, + cli->cl_max_extent_pages); if (ocd->ocd_connect_flags & OBD_CONNECT_GRANT_SHRINK && list_empty(&cli->cl_grant_shrink_list)) @@ -1649,6 +1684,7 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, bool soft_sync = false; bool interrupted = false; int i; + int grant = 0; int rc; struct list_head rpc_list = LIST_HEAD_INIT(rpc_list); struct ost_body *body; @@ -1659,6 +1695,7 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, list_for_each_entry(ext, ext_list, oe_link) { LASSERT(ext->oe_state == OES_RPC); mem_tight |= ext->oe_memalloc; + grant += ext->oe_grants; page_count += ext->oe_nr_pages; if (obj == NULL) obj = ext->oe_obj; @@ -1715,6 +1752,9 @@ int osc_build_rpc(const struct lu_env *env, struct client_obd *cli, crattr->cra_oa = oa; cl_req_attr_set(env, osc2cl(obj), crattr); + if (cmd == OBD_BRW_WRITE) + oa->o_grant_used = grant; + sort_brw_pages(pga, page_count); rc = osc_brw_prep_request(cmd, cli, oa, page_count, pga, &req, 0); if (rc != 0) { @@ -2431,11 +2471,15 @@ static int osc_reconnect(const struct lu_env *env, if (data != NULL && (data->ocd_connect_flags & OBD_CONNECT_GRANT)) { long lost_grant; + long grant; spin_lock(&cli->cl_loi_list_lock); - data->ocd_grant = (cli->cl_avail_grant + - (cli->cl_dirty_pages << PAGE_CACHE_SHIFT)) ?: - 2 * cli_brw_size(obd); + grant = cli->cl_avail_grant + cli->cl_reserved_grant; + if (data->ocd_connect_flags & OBD_CONNECT_GRANT_PARAM) + grant += cli->cl_dirty_grant; + else + grant += cli->cl_dirty_pages << PAGE_CACHE_SHIFT; + data->ocd_grant = grant ? 
: 2 * cli_brw_size(obd); lost_grant = cli->cl_lost_grant; cli->cl_lost_grant = 0; spin_unlock(&cli->cl_loi_list_lock); diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c index 265d758..f7033e8 100644 --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -60,6 +60,7 @@ #include #include +#include #undef ENTRY /* * struct OBD_{ALLOC,FREE}*() @@ -1582,18 +1583,17 @@ static int osd_object_print(const struct lu_env *env, void *cookie, d ? d->id_ops->id_name : "plain"); } -#define GRANT_FOR_LOCAL_OIDS 32 /* 128kB for last_rcvd, quota files, ... */ - /* * Concurrency: shouldn't matter. */ int osd_statfs(const struct lu_env *env, struct dt_device *d, struct obd_statfs *sfs) { - struct osd_device *osd = osd_dt_dev(d); - struct super_block *sb = osd_sb(osd); - struct kstatfs *ksfs; - int result = 0; + struct osd_device *osd = osd_dt_dev(d); + struct super_block *sb = osd_sb(osd); + struct kstatfs *ksfs; + __u64 reserved; + int result = 0; if (unlikely(osd->od_mnt == NULL)) return -EINPROGRESS; @@ -1607,34 +1607,40 @@ int osd_statfs(const struct lu_env *env, struct dt_device *d, ksfs = &osd_oti_get(env)->oti_ksfs; } - spin_lock(&osd->od_osfs_lock); result = sb->s_op->statfs(sb->s_root, ksfs); - if (likely(result == 0)) { /* N.B. 
statfs can't really fail */ - statfs_pack(sfs, ksfs); - if (unlikely(sb->s_flags & MS_RDONLY)) - sfs->os_state = OS_STATE_READONLY; - if (LDISKFS_HAS_INCOMPAT_FEATURE(sb, - LDISKFS_FEATURE_INCOMPAT_EXTENTS)) - sfs->os_maxbytes = sb->s_maxbytes; - else - sfs->os_maxbytes = LDISKFS_SB(sb)->s_bitmap_maxbytes; - } - spin_unlock(&osd->od_osfs_lock); + if (result) + goto out; + + statfs_pack(sfs, ksfs); + if (unlikely(sb->s_flags & MS_RDONLY)) + sfs->os_state = OS_STATE_READONLY; + if (LDISKFS_HAS_INCOMPAT_FEATURE(sb, + LDISKFS_FEATURE_INCOMPAT_EXTENTS)) + sfs->os_maxbytes = sb->s_maxbytes; + else + sfs->os_maxbytes = LDISKFS_SB(sb)->s_bitmap_maxbytes; - if (unlikely(env == NULL)) - OBD_FREE_PTR(ksfs); + /* + * Reserve some space so to avoid fragmenting the filesystem too much. + * Fragmentation not only impacts performance, but can also increase + * metadata overhead significantly, causing grant calculation to be + * wrong. + * + * Reserve 0.78% of total space, at least 8MB for small filesystems. + */ + CLASSERT(OSD_STATFS_RESERVED > LDISKFS_MAX_BLOCK_SIZE); + reserved = OSD_STATFS_RESERVED >> sb->s_blocksize_bits; + if (likely(sfs->os_blocks >= reserved << OSD_STATFS_RESERVED_SHIFT)) + reserved = sfs->os_blocks >> OSD_STATFS_RESERVED_SHIFT; - /* Reserve a small amount of space for local objects like last_rcvd, - * llog, quota files, ... 
*/ - if (sfs->os_bavail <= GRANT_FOR_LOCAL_OIDS) { - sfs->os_bavail = 0; - } else { - sfs->os_bavail -= GRANT_FOR_LOCAL_OIDS; - /** Take out metadata overhead for indirect blocks */ - sfs->os_bavail -= sfs->os_bavail >> (sb->s_blocksize_bits - 3); - } + sfs->os_blocks -= reserved; + sfs->os_bfree -= min(reserved, sfs->os_bfree); + sfs->os_bavail -= min(reserved, sfs->os_bavail); - return result; +out: + if (unlikely(env == NULL)) + OBD_FREE_PTR(ksfs); + return result; } /** @@ -1663,21 +1669,23 @@ static void osd_conf_get(const struct lu_env *env, */ param->ddp_max_name_len = LDISKFS_NAME_LEN; param->ddp_max_nlink = LDISKFS_LINK_MAX; - param->ddp_block_shift = sb->s_blocksize_bits; + param->ddp_symlink_max = sb->s_blocksize; param->ddp_mount_type = LDD_MT_LDISKFS; if (LDISKFS_HAS_INCOMPAT_FEATURE(sb, LDISKFS_FEATURE_INCOMPAT_EXTENTS)) param->ddp_maxbytes = sb->s_maxbytes; else param->ddp_maxbytes = LDISKFS_SB(sb)->s_bitmap_maxbytes; - /* Overhead estimate should be fairly accurate, so we really take a tiny - * error margin which also avoids fragmenting the filesystem too much */ - param->ddp_grant_reserved = 2; /* end up to be 1.9% after conversion */ /* inode are statically allocated, so per-inode space consumption * is the space consumed by the directory entry */ param->ddp_inodespace = PER_OBJ_USAGE; - /* per-fragment overhead to be used by the client code */ - param->ddp_grant_frag = 6 * LDISKFS_BLOCK_SIZE(sb); - param->ddp_mntopts = 0; + /* EXT_INIT_MAX_LEN is the theoretical maximum extent size (32k blocks + * = 128MB) which is unlikely to be hit in real life. Report a smaller + * maximum length to not under count the actual number of extents + * needed for writing a file. 
*/ + param->ddp_max_extent_blks = EXT_INIT_MAX_LEN >> 2; + /* worst-case extent insertion metadata overhead */ + param->ddp_extent_tax = 6 * LDISKFS_BLOCK_SIZE(sb); + param->ddp_mntopts = 0; if (test_opt(sb, XATTR_USER)) param->ddp_mntopts |= MNTOPT_USERXATTR; if (test_opt(sb, POSIX_ACL)) diff --git a/lustre/osd-ldiskfs/osd_internal.h b/lustre/osd-ldiskfs/osd_internal.h index ec6fdd8..e3a8d59 100644 --- a/lustre/osd-ldiskfs/osd_internal.h +++ b/lustre/osd-ldiskfs/osd_internal.h @@ -90,6 +90,10 @@ extern struct kmem_cache *dynlock_cachep; #define ADMIN_USR "admin_quotafile_v2.usr" #define ADMIN_GRP "admin_quotafile_v2.grp" +/* Statfs space reservation for fragmentation and local objects */ +#define OSD_STATFS_RESERVED (1ULL << 23) /* 8MB */ +#define OSD_STATFS_RESERVED_SHIFT (7) /* reserve 0.78% of all space */ + struct osd_directory { struct iam_container od_container; struct iam_descr od_descr; diff --git a/lustre/osd-zfs/osd_handler.c b/lustre/osd-zfs/osd_handler.c index 4f78957..9c19163 100644 --- a/lustre/osd-zfs/osd_handler.c +++ b/lustre/osd-zfs/osd_handler.c @@ -214,9 +214,8 @@ static int osd_trans_start(const struct lu_env *env, struct dt_device *d, struct osd_device *osd = osd_dt_dev(d); /* dmu will call commit callback with error code during abort */ if (!lu_device_is_md(&d->dd_lu_dev) && rc == -ENOSPC) - CERROR("%s: failed to start transaction due to ENOSPC. " - "Metadata overhead is underestimated or " - "grant_ratio is too low.\n", osd->od_svname); + CERROR("%s: failed to start transaction due to ENOSPC" + "\n", osd->od_svname); else CERROR("%s: can't assign tx: rc = %d\n", osd->od_svname, rc); @@ -474,14 +473,13 @@ static int osd_objset_statfs(struct osd_device *osd, struct obd_statfs *osfs) * for internal files to be created/unlinked when space is tight. 
*/ CLASSERT(OSD_STATFS_RESERVED_SIZE > 0); - if (likely(osfs->os_blocks >= OSD_STATFS_RESERVED_SIZE)) + reserved = OSD_STATFS_RESERVED_SIZE >> bshift; + if (likely(osfs->os_blocks >= reserved << OSD_STATFS_RESERVED_SHIFT)) reserved = osfs->os_blocks >> OSD_STATFS_RESERVED_SHIFT; - else - reserved = OSD_STATFS_RESERVED_SIZE >> bshift; osfs->os_blocks -= reserved; - osfs->os_bfree -= MIN(reserved, osfs->os_bfree); - osfs->os_bavail -= MIN(reserved, osfs->os_bavail); + osfs->os_bfree -= min(reserved, osfs->os_bfree); + osfs->os_bavail -= min(reserved, osfs->os_bavail); /* * The availobjs value returned from dmu_objset_space() is largely @@ -563,7 +561,7 @@ static void osd_conf_get(const struct lu_env *env, */ param->ddp_max_name_len = MAXNAMELEN; param->ddp_max_nlink = 1 << 31; /* it's 8byte on a disk */ - param->ddp_block_shift = 12; /* XXX */ + param->ddp_symlink_max = PATH_MAX; param->ddp_mount_type = LDD_MT_ZFS; param->ddp_mntopts = MNTOPT_USERXATTR; @@ -574,20 +572,22 @@ static void osd_conf_get(const struct lu_env *env, /* for maxbytes, report same value as ZPL */ param->ddp_maxbytes = MAX_LFS_FILESIZE; - /* Default reserved fraction of the available space that should be kept - * for error margin. Unfortunately, there are many factors that can - * impact the overhead with zfs, so let's be very cautious for now and - * reserve 20% of the available space which is not given out as grant. - * This tunable can be changed on a live system via procfs if needed. */ - param->ddp_grant_reserved = 20; - /* inodes are dynamically allocated, so we report the per-inode space * consumption to upper layers. 
This static value is not really accurate * and we should use the same logic as in udmu_objset_statfs() to * estimate the real size consumed by an object */ param->ddp_inodespace = OSD_DNODE_EST_COUNT; - /* per-fragment overhead to be used by the client code */ - param->ddp_grant_frag = osd_blk_insert_cost(osd); + /* Although ZFS isn't an extent-based filesystem, the metadata overhead + * (i.e. 7 levels of indirect blocks, see osd_blk_insert_cost()) should + * not be accounted for every single new block insertion. + * Instead, the maximum extent size is set to the number of blocks that + * can fit into a single contiguous indirect block. There would be some + * cases where this crosses indirect blocks, but it also won't have 7 + * new levels of indirect blocks in that case either, so it will still + * have enough reserved space for the extra indirect block */ + param->ddp_max_extent_blks = + (1 << (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)); + param->ddp_extent_tax = osd_blk_insert_cost(osd); } /* diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index 3f43f61..5606d07 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -1663,8 +1663,8 @@ void lustre_swab_connect(struct obd_connect_data *ocd) __swab32s(&ocd->ocd_brw_size); /* ocd_blocksize and ocd_inodespace don't need to be swabbed because * they are 8-byte values */ - __swab16s(&ocd->ocd_grant_extent); - __swab32s(&ocd->ocd_unused); + __swab16s(&ocd->ocd_grant_tax_kb); + __swab32s(&ocd->ocd_grant_max_blks); __swab64s(&ocd->ocd_transno); __swab32s(&ocd->ocd_group); __swab32s(&ocd->ocd_cksum_types); diff --git a/lustre/ptlrpc/wiretest.c b/lustre/ptlrpc/wiretest.c index 97e32cc..6fbc0e1 100644 --- a/lustre/ptlrpc/wiretest.c +++ b/lustre/ptlrpc/wiretest.c @@ -972,22 +972,22 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct obd_connect_data, ocd_ibits_known)); LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known) == 8, "found 
%lld\n", (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known)); - LASSERTF((int)offsetof(struct obd_connect_data, ocd_blocksize) == 32, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, ocd_blocksize)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_blocksize) == 1, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_blocksize)); - LASSERTF((int)offsetof(struct obd_connect_data, ocd_inodespace) == 33, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, ocd_inodespace)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_inodespace) == 1, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_inodespace)); - LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_extent) == 34, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, ocd_grant_extent)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_extent) == 2, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_extent)); - LASSERTF((int)offsetof(struct obd_connect_data, ocd_unused) == 36, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, ocd_unused)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_unused) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_unused)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_blkbits) == 32, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant_blkbits)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_blkbits) == 1, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_blkbits)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_inobits) == 33, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant_inobits)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_inobits) == 1, "found %lld\n", + (long 
long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_inobits)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_tax_kb) == 34, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant_tax_kb)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_tax_kb) == 2, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_tax_kb)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_max_blks) == 36, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant_max_blks)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_max_blks) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_max_blks)); LASSERTF((int)offsetof(struct obd_connect_data, ocd_transno) == 40, "found %lld\n", (long long)(int)offsetof(struct obd_connect_data, ocd_transno)); LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_transno) == 8, "found %lld\n", diff --git a/lustre/utils/mount_utils_ldiskfs.c b/lustre/utils/mount_utils_ldiskfs.c index f3e9c2b..0aa9b4f 100644 --- a/lustre/utils/mount_utils_ldiskfs.c +++ b/lustre/utils/mount_utils_ldiskfs.c @@ -696,9 +696,9 @@ int ldiskfs_make_lustre(struct mkfs_opts *mop) } if (mop->mo_device_kb != 0) { - if (mop->mo_device_kb < 8096) { + if (mop->mo_device_kb < 32384) { fprintf(stderr, "%s: size of filesystem must be larger " - "than 8MB, but is set to %lldKB\n", + "than 32MB, but is set to %lldKB\n", progname, (long long)mop->mo_device_kb); return EINVAL; } diff --git a/lustre/utils/wirecheck.c b/lustre/utils/wirecheck.c index cbb62c2..41a8ae3 100644 --- a/lustre/utils/wirecheck.c +++ b/lustre/utils/wirecheck.c @@ -446,10 +446,10 @@ check_obd_connect_data(void) CHECK_MEMBER(obd_connect_data, ocd_index); CHECK_MEMBER(obd_connect_data, ocd_brw_size); CHECK_MEMBER(obd_connect_data, ocd_ibits_known); - CHECK_MEMBER(obd_connect_data, ocd_blocksize); - CHECK_MEMBER(obd_connect_data, ocd_inodespace); - 
CHECK_MEMBER(obd_connect_data, ocd_grant_extent); - CHECK_MEMBER(obd_connect_data, ocd_unused); + CHECK_MEMBER(obd_connect_data, ocd_grant_blkbits); + CHECK_MEMBER(obd_connect_data, ocd_grant_inobits); + CHECK_MEMBER(obd_connect_data, ocd_grant_tax_kb); + CHECK_MEMBER(obd_connect_data, ocd_grant_max_blks); CHECK_MEMBER(obd_connect_data, ocd_transno); CHECK_MEMBER(obd_connect_data, ocd_group); CHECK_MEMBER(obd_connect_data, ocd_cksum_types); diff --git a/lustre/utils/wiretest.c b/lustre/utils/wiretest.c index 8bcf604..79f3ea7 100644 --- a/lustre/utils/wiretest.c +++ b/lustre/utils/wiretest.c @@ -986,22 +986,22 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct obd_connect_data, ocd_ibits_known)); LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known) == 8, "found %lld\n", (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_ibits_known)); - LASSERTF((int)offsetof(struct obd_connect_data, ocd_blocksize) == 32, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, ocd_blocksize)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_blocksize) == 1, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_blocksize)); - LASSERTF((int)offsetof(struct obd_connect_data, ocd_inodespace) == 33, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, ocd_inodespace)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_inodespace) == 1, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_inodespace)); - LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_extent) == 34, "found %lld\n", - (long long)(int)offsetof(struct obd_connect_data, ocd_grant_extent)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_extent) == 2, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_extent)); - LASSERTF((int)offsetof(struct obd_connect_data, ocd_unused) == 36, "found %lld\n", - (long long)(int)offsetof(struct 
obd_connect_data, ocd_unused)); - LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_unused) == 4, "found %lld\n", - (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_unused)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_blkbits) == 32, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant_blkbits)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_blkbits) == 1, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_blkbits)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_inobits) == 33, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant_inobits)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_inobits) == 1, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_inobits)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_tax_kb) == 34, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant_tax_kb)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_tax_kb) == 2, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_tax_kb)); + LASSERTF((int)offsetof(struct obd_connect_data, ocd_grant_max_blks) == 36, "found %lld\n", + (long long)(int)offsetof(struct obd_connect_data, ocd_grant_max_blks)); + LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_grant_max_blks) == 4, "found %lld\n", + (long long)(int)sizeof(((struct obd_connect_data *)0)->ocd_grant_max_blks)); LASSERTF((int)offsetof(struct obd_connect_data, ocd_transno) == 40, "found %lld\n", (long long)(int)offsetof(struct obd_connect_data, ocd_transno)); LASSERTF((int)sizeof(((struct obd_connect_data *)0)->ocd_transno) == 8, "found %lld\n",