From 4af03add5fd555ca0a82a5251b8b49a29ddac194 Mon Sep 17 00:00:00 2001 From: adilger Date: Thu, 1 Sep 2005 18:09:27 +0000 Subject: [PATCH] Branch b1_4 Description: 2.6 OST async journal commit and locking fix to improve performance Details : The filter_direct_io()+filter_commitrw_write() journal commits for 2.6 kernels are now async as they already were in 2.4 kernels so that they can commit concurrently with the network bulk transfer. For block-allocated files the filter allocation semaphore is held to avoid filesystem fragmentation during allocation. BKL lock removed for 2.6 xattr operations where it is no longer needed. b=7116 r=alex, tested at HP --- lustre/ChangeLog | 10 +++++++++ lustre/lvfs/fsfilt_ext3.c | 18 +++++++-------- lustre/obdfilter/filter.c | 4 ++-- lustre/obdfilter/filter_io_26.c | 49 +++++++++++++++++++++++------------------ 4 files changed, 49 insertions(+), 32 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 5b082cc..e788cd4 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -9,6 +9,16 @@ Description: Running on many-way SMP OSTs can trigger oops in llcd_send() Details : A race between allocating a new llcd and re-getting the llcd_lock allowed another thread to grab newly-allocated llcd. +Severity : enhancement +Bugzilla : 7116 +Description: 2.6 OST async journal commit and locking fix to improve performance +Details : The filter_direct_io()+filter_commitrw_write() journal commits for + 2.6 kernels are now async as they already were in 2.4 kernels so + that they can commit concurrently with the network bulk transfer. + For block-allocated files the filter allocation semaphore is held + to avoid filesystem fragmentation during allocation. BKL lock + removed for 2.6 xattr operations where it is no longer needed. + 08-26-2005 Cluster File Systems, Inc. 
* version 1.4.5 * bug fixes diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c index a149191..458feb8 100644 --- a/lustre/lvfs/fsfilt_ext3.c +++ b/lustre/lvfs/fsfilt_ext3.c @@ -359,7 +359,7 @@ static int fsfilt_ext3_commit_async(struct inode *inode, void *h, LASSERT(current->journal_info == handle); - lock_kernel(); + lock_24kernel(); transaction = handle->h_transaction; journal = transaction->t_journal; tid = transaction->t_tid; @@ -368,7 +368,7 @@ static int fsfilt_ext3_commit_async(struct inode *inode, void *h, rc = journal_stop(handle); if (rc) { CERROR("error while stopping transaction: %d\n", rc); - unlock_kernel(); + unlock_24kernel(); return rc; } #if LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0) @@ -377,9 +377,9 @@ static int fsfilt_ext3_commit_async(struct inode *inode, void *h, CERROR("strange race: %lu != %lu\n", (unsigned long) tid, (unsigned long) rtid); #else - log_start_commit(journal, transaction->t_tid); + log_start_commit(journal, tid); #endif - unlock_kernel(); + unlock_24kernel(); *wait_handle = (void *) tid; CDEBUG(D_INODE, "commit async: %lu\n", (unsigned long) tid); @@ -473,11 +473,11 @@ static int fsfilt_ext3_set_md(struct inode *inode, void *handle, CWARN("setting EA on %lu/%u again... 
interesting\n", inode->i_ino, inode->i_generation); - lock_kernel(); + lock_24kernel(); rc = ext3_xattr_set_handle(handle, inode, EXT3_XATTR_INDEX_TRUSTED, XATTR_LUSTRE_MDS_LOV_EA, lmm, lmm_size, 0); - unlock_kernel(); + unlock_24kernel(); if (rc) CERROR("error adding MD data to inode %lu: rc = %d\n", @@ -491,11 +491,11 @@ static int fsfilt_ext3_get_md(struct inode *inode, void *lmm, int lmm_size) int rc; LASSERT(down_trylock(&inode->i_sem) != 0); - lock_kernel(); + lock_24kernel(); rc = ext3_xattr_get(inode, EXT3_XATTR_INDEX_TRUSTED, XATTR_LUSTRE_MDS_LOV_EA, lmm, lmm_size); - unlock_kernel(); + unlock_24kernel(); /* This gives us the MD size */ if (lmm == NULL) @@ -764,7 +764,7 @@ static int ext3_ext_new_extent_cb(struct ext3_extents_tree *tree, EXT_ASSERT(i == path->p_depth); EXT_ASSERT(path[i].p_hdr); - if (cex->ec_type == EXT3_EXT_CACHE_EXTENT) { + if (cex->ec_type == EXT3_EXT_CACHE_EXTENT) { err = EXT_CONTINUE; goto map; } diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index 946bd5e..960b8d0 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -2169,7 +2169,7 @@ static int filter_statfs(struct obd_device *obd, struct obd_statfs *osfs, osfs->os_bavail -= min(osfs->os_bavail, (filter->fo_tot_dirty + filter->fo_tot_pending + - osfs->os_bsize -1) >> blockbits); + osfs->os_bsize - 1) >> blockbits); RETURN(rc); } @@ -2204,7 +2204,7 @@ static int filter_precreate(struct obd_device *obd, struct obdo *oa, OBD_ALLOC(osfs, sizeof(*osfs)); if (osfs == NULL) RETURN(-ENOMEM); - rc = filter_statfs(obd, osfs, jiffies-HZ); + rc = filter_statfs(obd, osfs, jiffies - HZ); if (rc == 0 && osfs->os_bavail < (osfs->os_blocks >> 10)) { CDEBUG(D_HA, "OST out of space! 
avail "LPU64"\n", osfs->os_bavail<<filter->fo_sb->s_blocksize_bits); diff --git a/lustre/obdfilter/filter_io_26.c b/lustre/obdfilter/filter_io_26.c index 1db5ff5..a712188 100644 --- a/lustre/obdfilter/filter_io_26.c +++ b/lustre/obdfilter/filter_io_26.c @@ -40,8 +40,6 @@ #include #include "filter_internal.h" -#warning "implement writeback mode -bzzz" - /* 512byte block min */ #define MAX_BLOCKS_PER_PAGE (PAGE_SIZE / 512) struct dio_request { @@ -51,7 +49,6 @@ struct dio_request { int dr_max_pages; int dr_npages; int dr_error; - unsigned long dr_flag; /* indicating if there is client cache page in this rpc */ struct page **dr_pages; unsigned long *dr_blocks; spinlock_t dr_lock; @@ -424,24 +421,29 @@ int filter_direct_io(int rw, struct dentry *dchild, void *iobuf, struct dio_request *dreq = iobuf; struct inode *inode = dchild->d_inode; int blocks_per_page = PAGE_SIZE >> inode->i_blkbits; - int rc, rc2; + int rc, rc2, create; + struct semaphore *sem; ENTRY; - LASSERTF(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ, "%x\n", rw); LASSERTF(dreq->dr_npages <= dreq->dr_max_pages, "%d,%d\n", dreq->dr_npages, dreq->dr_max_pages); LASSERT(dreq->dr_npages <= OBDFILTER_CREATED_SCRATCHPAD_ENTRIES); - LASSERT(dreq->dr_npages > 0 || rw != OBD_BRW_WRITE); - - if (dreq->dr_npages == 0) - RETURN(0); + if (rw == OBD_BRW_READ) { + if (dreq->dr_npages == 0) + RETURN(0); + create = 0; + sem = NULL; + } else { + LASSERTF(rw == OBD_BRW_WRITE, "%x\n", rw); + LASSERT(dreq->dr_npages > 0); + create = 1; + sem = &obd->u.filter.fo_alloc_lock; + } remap: - rc = fsfilt_map_inode_pages(obd, inode, - dreq->dr_pages, dreq->dr_npages, - dreq->dr_blocks, - obdfilter_created_scratchpad, - rw == OBD_BRW_WRITE, NULL); + rc = fsfilt_map_inode_pages(obd, inode, dreq->dr_pages, + dreq->dr_npages, dreq->dr_blocks, + obdfilter_created_scratchpad, create, sem); if (rc == -EDQUOT) { LASSERT(rw == OBD_BRW_WRITE && @@ -471,9 +473,13 @@ remap: up(&inode->i_sem); rc2 = filter_finish_transno(exp, oti, 0); - if (rc2 != 0) - 
CERROR("can't close transaction: %d\n", rc); + if (rc2 != 0) { + CERROR("can't close transaction: %d\n", rc2); + if (rc == 0) + rc = rc2; + } + rc2 =fsfilt_commit_async(obd,inode,oti->oti_handle,wait_handle); if (rc == 0) rc = rc2; if (rc != 0) @@ -524,6 +530,7 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, struct obd_device *obd = exp->exp_obd; struct filter_obd *filter = &obd->u.filter; struct lvfs_ucred *uc = NULL; + void *wait_handle; int total_size = 0; ENTRY; @@ -595,18 +602,18 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, iattr_from_obdo(&iattr,oa,OBD_MD_FLATIME|OBD_MD_FLMTIME|OBD_MD_FLCTIME); /* filter_direct_io drops i_sem */ rc = filter_direct_io(OBD_BRW_WRITE, res->dentry, dreq, exp, &iattr, - oti, NULL); + oti, &wait_handle); if (rc == 0) - obdo_from_inode(oa, inode, - FILTER_VALID_FLAGS | OBD_MD_FLUID | OBD_MD_FLGID); - else + obdo_from_inode(oa, inode, + FILTER_VALID_FLAGS |OBD_MD_FLUID |OBD_MD_FLGID); + else obdo_from_inode(oa, inode, OBD_MD_FLUID | OBD_MD_FLGID); filter_get_quota_flag(obd, oa); fsfilt_check_slow(now, obd_timeout, "direct_io"); - err = fsfilt_commit(obd, inode, oti->oti_handle, obd_sync_filter); + err = fsfilt_commit_wait(obd, inode, wait_handle); if (err) rc = err; -- 1.8.3.1