From d7ac66d2fddc3b2a6fb91b6421f9a15b80c8d10a Mon Sep 17 00:00:00 2001 From: Alex Zhuravlev Date: Fri, 15 Feb 2013 13:19:29 +0400 Subject: [PATCH] LU-2748 osd: allocate buffers on demand Instead of putting a lot of buffers statically within osd_thread_info, we can allocate them on first demand within this thread. We can also allocate not the maximum, but some optimal amount, and reallocate only if really needed. dr_created is not used, so it is removed. The number of blocks is calculated using the actual blocksize, not the smallest one, so there is no need to multiply by 8 in 99.9% of cases. With PTLRPC_MAX_BRW_PAGES=1024 (the default in the master branch) and regular 1MB IO, before: sizeof(struct osd_thread_info) = 82104 after: sizeof(struct osd_thread_info) = 4328 + 4K (if IO thread) This should improve threads not doing IO: all MDS threads, LDLM threads, MGS threads. Signed-off-by: Alex Zhuravlev Change-Id: Ie07780537a4598c6a888ed9be4ef0bbb0d9b3d54 Reviewed-on: http://review.whamcloud.com/5444 Tested-by: Hudson Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin Reviewed-by: Mike Pershin Tested-by: Maloo --- lustre/include/linux/lustre_fsfilt.h | 13 +++-- lustre/include/lu_object.h | 4 ++ lustre/lvfs/fsfilt_ext3.c | 64 ++++++++++-------------- lustre/obdclass/lu_object.c | 30 ++++++++++++ lustre/osd-ldiskfs/osd_handler.c | 14 +++--- lustre/osd-ldiskfs/osd_internal.h | 42 ++++++++-------- lustre/osd-ldiskfs/osd_io.c | 95 ++++++++++++++++++++++++++---------- 7 files changed, 163 insertions(+), 99 deletions(-) diff --git a/lustre/include/linux/lustre_fsfilt.h b/lustre/include/linux/lustre_fsfilt.h index 6b3e8b6..f18f13d 100644 --- a/lustre/include/linux/lustre_fsfilt.h +++ b/lustre/include/linux/lustre_fsfilt.h @@ -62,9 +62,8 @@ struct fsfilt_operations { int logs); int (* fs_commit)(struct inode *inode, void *handle,int force_sync); int (* fs_map_inode_pages)(struct inode *inode, struct page **page, - int pages, 
unsigned long *blocks, + int create, struct mutex *sem); int (* fs_write_record)(struct file *, void *, int size, loff_t *, int force_sync); int (* fs_read_record)(struct file *, void *, int size, loff_t *); @@ -148,13 +147,13 @@ static inline int fsfilt_commit(struct obd_device *obd, struct inode *inode, } static inline int fsfilt_map_inode_pages(struct obd_device *obd, - struct inode *inode, - struct page **page, int pages, - unsigned long *blocks, int *created, + struct inode *inode, + struct page **page, int pages, + unsigned long *blocks, int create, struct mutex *mutex) { return obd->obd_fsops->fs_map_inode_pages(inode, page, pages, blocks, - created, create, mutex); + create, mutex); } static inline int fsfilt_read_record(struct obd_device *obd, struct file *file, diff --git a/lustre/include/lu_object.h b/lustre/include/lu_object.h index 52fae16..5f67a06 100644 --- a/lustre/include/lu_object.h +++ b/lustre/include/lu_object.h @@ -1356,5 +1356,9 @@ struct lu_object *lu_object_anon(const struct lu_env *env, struct lu_device *dev, const struct lu_object_conf *conf); +void lu_buf_free(struct lu_buf *buf); +void lu_buf_alloc(struct lu_buf *buf, int size); +void lu_buf_realloc(struct lu_buf *buf, int size); + /** @} lu */ #endif /* __LUSTRE_LU_OBJECT_H */ diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c index 10ebac1..17a971b 100644 --- a/lustre/lvfs/fsfilt_ext3.c +++ b/lustre/lvfs/fsfilt_ext3.c @@ -195,7 +195,6 @@ static int fsfilt_ext3_commit(struct inode *inode, void *h, int force_sync) struct bpointers { unsigned long *blocks; - int *created; unsigned long start; int num; int init_num; @@ -317,8 +316,6 @@ static int ext3_ext_new_extent_cb(struct ext3_ext_base *base, CERROR("nothing to do?! 
i = %d, e_num = %u\n", i, cex->ec_len); for (; i < cex->ec_len && bp->num; i++) { - *(bp->created) = 0; - bp->created++; *(bp->blocks) = 0; bp->blocks++; bp->num--; @@ -418,20 +415,16 @@ map: for (; i < cex->ec_len && bp->num; i++) { *(bp->blocks) = cex->ec_start + i; #ifdef EXT3_EXT_CACHE_EXTENT - if (cex->ec_type == EXT3_EXT_CACHE_EXTENT) + if (cex->ec_type != EXT3_EXT_CACHE_EXTENT) #else - if ((cex->ec_len != 0) && (cex->ec_start != 0)) + if ((cex->ec_len == 0) || (cex->ec_start == 0)) #endif - { - *(bp->created) = 0; - } else { - *(bp->created) = 1; + { /* unmap any possible underlying metadata from * the block device mapping. bug 6998. */ ll_unmap_underlying_metadata(inode->i_sb, *(bp->blocks)); } - bp->created++; bp->blocks++; bp->num--; bp->start++; @@ -441,8 +434,8 @@ map: } int fsfilt_map_nblocks(struct inode *inode, unsigned long block, - unsigned long num, unsigned long *blocks, - int *created, int create) + unsigned long num, unsigned long *blocks, + int create) { struct ext3_ext_base *base = inode; struct bpointers bp; @@ -452,7 +445,6 @@ int fsfilt_map_nblocks(struct inode *inode, unsigned long block, block, block + num - 1, (unsigned) inode->i_ino); bp.blocks = blocks; - bp.created = created; bp.start = block; bp.init_num = bp.num = num; bp.create = create; @@ -465,8 +457,8 @@ int fsfilt_map_nblocks(struct inode *inode, unsigned long block, } int fsfilt_ext3_map_ext_inode_pages(struct inode *inode, struct page **page, - int pages, unsigned long *blocks, - int *created, int create) + int pages, unsigned long *blocks, + int create) { int blocks_per_page = CFS_PAGE_SIZE >> inode->i_blkbits; int rc = 0, i = 0; @@ -495,65 +487,61 @@ int fsfilt_ext3_map_ext_inode_pages(struct inode *inode, struct page **page, /* process found extent */ rc = fsfilt_map_nblocks(inode, fp->index * blocks_per_page, - clen * blocks_per_page, blocks, - created, create); + clen * blocks_per_page, blocks, + create); if (rc) GOTO(cleanup, rc); /* look for next extent */ fp = NULL; 
blocks += blocks_per_page * clen; - created += blocks_per_page * clen; } if (fp) rc = fsfilt_map_nblocks(inode, fp->index * blocks_per_page, - clen * blocks_per_page, blocks, - created, create); + clen * blocks_per_page, blocks, + create); cleanup: return rc; } extern int ext3_map_inode_page(struct inode *inode, struct page *page, - unsigned long *blocks, int *created, int create); + unsigned long *blocks, int create); int fsfilt_ext3_map_bm_inode_pages(struct inode *inode, struct page **page, - int pages, unsigned long *blocks, - int *created, int create) + int pages, unsigned long *blocks, + int create) { - int blocks_per_page = CFS_PAGE_SIZE >> inode->i_blkbits; - unsigned long *b; - int rc = 0, i, *cr; + int blocks_per_page = CFS_PAGE_SIZE >> inode->i_blkbits; + unsigned long *b; + int rc = 0, i; - for (i = 0, cr = created, b = blocks; i < pages; i++, page++) { - rc = ext3_map_inode_page(inode, *page, b, cr, create); + for (i = 0, b = blocks; i < pages; i++, page++) { + rc = ext3_map_inode_page(inode, *page, b, create); if (rc) { - CERROR("ino %lu, blk %lu cr %u create %d: rc %d\n", - inode->i_ino, *b, *cr, create, rc); + CERROR("ino %lu, blk %lu create %d: rc %d\n", + inode->i_ino, *b, create, rc); break; } b += blocks_per_page; - cr += blocks_per_page; } return rc; } int fsfilt_ext3_map_inode_pages(struct inode *inode, struct page **page, - int pages, unsigned long *blocks, - int *created, int create, - struct mutex *optional_mutex) + int pages, unsigned long *blocks, + int create, struct mutex *optional_mutex) { int rc; if (EXT3_I(inode)->i_flags & EXT3_EXTENTS_FL) { - rc = fsfilt_ext3_map_ext_inode_pages(inode, page, pages, - blocks, created, create); + rc = fsfilt_ext3_map_ext_inode_pages(inode, page, pages, + blocks, create); return rc; } if (optional_mutex != NULL) mutex_lock(optional_mutex); - rc = fsfilt_ext3_map_bm_inode_pages(inode, page, pages, blocks, - created, create); + rc = fsfilt_ext3_map_bm_inode_pages(inode, page, pages, blocks, create); if 
(optional_mutex != NULL) mutex_unlock(optional_mutex); diff --git a/lustre/obdclass/lu_object.c b/lustre/obdclass/lu_object.c index 2679063..117f607 100644 --- a/lustre/obdclass/lu_object.c +++ b/lustre/obdclass/lu_object.c @@ -2226,3 +2226,33 @@ struct lu_object *lu_object_anon(const struct lu_env *env, return o; } EXPORT_SYMBOL(lu_object_anon); + +void lu_buf_free(struct lu_buf *buf) +{ + LASSERT(buf); + if (buf->lb_buf) { + LASSERT(buf->lb_len > 0); + OBD_FREE_LARGE(buf->lb_buf, buf->lb_len); + buf->lb_buf = NULL; + buf->lb_len = 0; + } +} +EXPORT_SYMBOL(lu_buf_free); + +void lu_buf_alloc(struct lu_buf *buf, int size) +{ + LASSERT(buf); + LASSERT(buf->lb_buf == NULL); + LASSERT(buf->lb_len == 0); + OBD_ALLOC_LARGE(buf->lb_buf, size); + if (likely(buf->lb_buf)) + buf->lb_len = size; +} +EXPORT_SYMBOL(lu_buf_alloc); + +void lu_buf_realloc(struct lu_buf *buf, int size) +{ + lu_buf_free(buf); + lu_buf_alloc(buf, size); +} +EXPORT_SYMBOL(lu_buf_realloc); diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c index 7374b06..a281d32 100644 --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -5038,12 +5038,14 @@ static void *osd_key_init(const struct lu_context *ctx, static void osd_key_fini(const struct lu_context *ctx, struct lu_context_key *key, void* data) { - struct osd_thread_info *info = data; - - if (info->oti_hlock != NULL) - ldiskfs_htree_lock_free(info->oti_hlock); - OBD_FREE(info->oti_it_ea_buf, OSD_IT_EA_BUFSIZE); - OBD_FREE_PTR(info); + struct osd_thread_info *info = data; + + if (info->oti_hlock != NULL) + ldiskfs_htree_lock_free(info->oti_hlock); + OBD_FREE(info->oti_it_ea_buf, OSD_IT_EA_BUFSIZE); + lu_buf_free(&info->oti_iobuf.dr_pg_buf); + lu_buf_free(&info->oti_iobuf.dr_bl_buf); + OBD_FREE_PTR(info); } static void osd_key_exit(const struct lu_context *ctx, diff --git a/lustre/osd-ldiskfs/osd_internal.h b/lustre/osd-ldiskfs/osd_internal.h index 60cf124..1c36e18 100644 --- 
a/lustre/osd-ldiskfs/osd_internal.h +++ b/lustre/osd-ldiskfs/osd_internal.h @@ -463,20 +463,22 @@ struct osd_it_quota { #define MAX_BLOCKS_PER_PAGE (CFS_PAGE_SIZE / 512) struct osd_iobuf { - cfs_waitq_t dr_wait; - cfs_atomic_t dr_numreqs; /* number of reqs being processed */ - int dr_max_pages; - int dr_npages; - int dr_error; - int dr_frags; - unsigned int dr_ignore_quota:1; - unsigned int dr_elapsed_valid:1; /* we really did count time */ - unsigned int dr_rw:1; - struct page *dr_pages[PTLRPC_MAX_BRW_PAGES]; - unsigned long dr_blocks[PTLRPC_MAX_BRW_PAGES*MAX_BLOCKS_PER_PAGE]; - unsigned long dr_start_time; - unsigned long dr_elapsed; /* how long io took */ - struct osd_device *dr_dev; + cfs_waitq_t dr_wait; + cfs_atomic_t dr_numreqs; /* number of reqs being processed */ + int dr_max_pages; + int dr_npages; + int dr_error; + int dr_frags; + unsigned int dr_ignore_quota:1; + unsigned int dr_elapsed_valid:1; /* we really did count time */ + unsigned int dr_rw:1; + struct lu_buf dr_pg_buf; + struct page **dr_pages; + struct lu_buf dr_bl_buf; + unsigned long *dr_blocks; + unsigned long dr_start_time; + unsigned long dr_elapsed; /* how long io took */ + struct osd_device *dr_dev; unsigned int dr_init_at; /* the line iobuf was initialized */ }; @@ -564,14 +566,12 @@ struct osd_thread_info { /* old LMA for compatibility */ char oti_mdt_attrs_old[LMA_OLD_SIZE]; }; - /** 0-copy IO */ - struct osd_iobuf oti_iobuf; - struct inode oti_inode; - int oti_created[PTLRPC_MAX_BRW_PAGES]; - struct lu_env oti_obj_delete_tx_env; + /** 0-copy IO */ + struct osd_iobuf oti_iobuf; + struct inode oti_inode; #define OSD_FID_REC_SZ 32 - char oti_ldp[OSD_FID_REC_SZ]; - char oti_ldp2[OSD_FID_REC_SZ]; + char oti_ldp[OSD_FID_REC_SZ]; + char oti_ldp2[OSD_FID_REC_SZ]; /* used by quota code */ union { diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c index dac9d00..b094b50 100644 --- a/lustre/osd-ldiskfs/osd_io.c +++ b/lustre/osd-ldiskfs/osd_io.c @@ -93,17 +93,19 @@ int 
generic_error_remove_page(struct address_space *mapping, struct page *page) } #endif -static void __osd_init_iobuf(struct osd_device *d, struct osd_iobuf *iobuf, - int rw, int line) +static int __osd_init_iobuf(struct osd_device *d, struct osd_iobuf *iobuf, + int rw, int line, int pages) { + int blocks, i; + LASSERTF(iobuf->dr_elapsed_valid == 0, "iobuf %p, reqs %d, rw %d, line %d\n", iobuf, cfs_atomic_read(&iobuf->dr_numreqs), iobuf->dr_rw, iobuf->dr_init_at); + LASSERT(pages <= PTLRPC_MAX_BRW_PAGES); cfs_waitq_init(&iobuf->dr_wait); cfs_atomic_set(&iobuf->dr_numreqs, 0); - iobuf->dr_max_pages = PTLRPC_MAX_BRW_PAGES; iobuf->dr_npages = 0; iobuf->dr_error = 0; iobuf->dr_dev = d; @@ -112,8 +114,43 @@ static void __osd_init_iobuf(struct osd_device *d, struct osd_iobuf *iobuf, /* must be counted before, so assert */ iobuf->dr_rw = rw; iobuf->dr_init_at = line; + + blocks = pages * (CFS_PAGE_SIZE >> osd_sb(d)->s_blocksize_bits); + if (iobuf->dr_bl_buf.lb_len >= blocks * sizeof(iobuf->dr_blocks[0])) { + LASSERT(iobuf->dr_pg_buf.lb_len >= + pages * sizeof(iobuf->dr_pages[0])); + return 0; + } + + /* start with 1MB for 4K blocks */ + i = 256; + while (i <= PTLRPC_MAX_BRW_PAGES && i < pages) + i <<= 1; + + CDEBUG(D_OTHER, "realloc %u for %u (%u) pages\n", + (unsigned)(pages * sizeof(iobuf->dr_pages[0])), i, pages); + pages = i; + blocks = pages * (CFS_PAGE_SIZE >> osd_sb(d)->s_blocksize_bits); + iobuf->dr_max_pages = 0; + CDEBUG(D_OTHER, "realloc %u for %u blocks\n", + (unsigned)(blocks * sizeof(iobuf->dr_blocks[0])), blocks); + + lu_buf_realloc(&iobuf->dr_bl_buf, blocks * sizeof(iobuf->dr_blocks[0])); + iobuf->dr_blocks = iobuf->dr_bl_buf.lb_buf; + if (unlikely(iobuf->dr_blocks == NULL)) + return -ENOMEM; + + lu_buf_realloc(&iobuf->dr_pg_buf, pages * sizeof(iobuf->dr_pages[0])); + iobuf->dr_pages = iobuf->dr_pg_buf.lb_buf; + if (unlikely(iobuf->dr_pages == NULL)) + return -ENOMEM; + + iobuf->dr_max_pages = pages; + + return 0; } -#define osd_init_iobuf(dev,iobuf,rw) 
__osd_init_iobuf(dev, iobuf, rw, __LINE__) +#define osd_init_iobuf(dev, iobuf, rw, pages) \ + __osd_init_iobuf(dev, iobuf, rw, __LINE__, pages) static void osd_iobuf_add_page(struct osd_iobuf *iobuf, struct page *page) { @@ -550,9 +587,11 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt, LASSERT(inode); - osd_init_iobuf(osd, iobuf, 0); + rc = osd_init_iobuf(osd, iobuf, 0, npages); + if (unlikely(rc != 0)) + RETURN(rc); - isize = i_size_read(inode); + isize = i_size_read(inode); maxidx = ((isize + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT) - 1; if (osd->od_writethrough_cache) @@ -598,11 +637,10 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt, lprocfs_counter_add(osd->od_stats, LPROC_OSD_GET_PAGE, timediff); if (iobuf->dr_npages) { - rc = osd->od_fsops->fs_map_inode_pages(inode, iobuf->dr_pages, - iobuf->dr_npages, - iobuf->dr_blocks, - oti->oti_created, - 0, NULL); + rc = osd->od_fsops->fs_map_inode_pages(inode, iobuf->dr_pages, + iobuf->dr_npages, + iobuf->dr_blocks, + 0, NULL); if (likely(rc == 0)) { rc = osd_do_bio(osd, inode, iobuf); /* do IO stats for preparation reads */ @@ -754,8 +792,11 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt, LASSERT(inode); - osd_init_iobuf(osd, iobuf, 1); - isize = i_size_read(inode); + rc = osd_init_iobuf(osd, iobuf, 1, npages); + if (unlikely(rc != 0)) + RETURN(rc); + + isize = i_size_read(inode); ll_vfs_dq_init(inode); for (i = 0; i < npages; i++) { @@ -796,10 +837,9 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt, rc = -ENOSPC; } else if (iobuf->dr_npages > 0) { rc = osd->od_fsops->fs_map_inode_pages(inode, iobuf->dr_pages, - iobuf->dr_npages, - iobuf->dr_blocks, - oti->oti_created, - 1, NULL); + iobuf->dr_npages, + iobuf->dr_blocks, + 1, NULL); } else { /* no pages to write, no transno is needed */ thandle->th_local = 1; @@ -843,12 +883,14 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt, 
LASSERT(inode); - osd_init_iobuf(osd, iobuf, 0); + rc = osd_init_iobuf(osd, iobuf, 0, npages); + if (unlikely(rc != 0)) + RETURN(rc); - if (osd->od_read_cache) - cache = 1; - if (i_size_read(inode) > osd->od_readcache_max_filesize) - cache = 0; + if (osd->od_read_cache) + cache = 1; + if (i_size_read(inode) > osd->od_readcache_max_filesize) + cache = 0; cfs_gettimeofday(&start); for (i = 0; i < npages; i++) { @@ -882,11 +924,10 @@ static int osd_read_prep(const struct lu_env *env, struct dt_object *dt, lprocfs_counter_add(osd->od_stats, LPROC_OSD_GET_PAGE, timediff); if (iobuf->dr_npages) { - rc = osd->od_fsops->fs_map_inode_pages(inode, iobuf->dr_pages, - iobuf->dr_npages, - iobuf->dr_blocks, - oti->oti_created, - 0, NULL); + rc = osd->od_fsops->fs_map_inode_pages(inode, iobuf->dr_pages, + iobuf->dr_npages, + iobuf->dr_blocks, + 0, NULL); rc = osd_do_bio(osd, inode, iobuf); /* IO stats will be done in osd_bufs_put() */ -- 1.8.3.1