X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fobdfilter%2Ffilter_io_26.c;h=cafb080abd6e98a1db502e76e4dd46eae2cf19ae;hb=51206e8cd42134400fa0b6259a92d7138f3dc984;hp=d6a356e5255dce3536bd27547af20855d84572c2;hpb=f95393b0d0a59cf3dc2f29cffc35dcc4cc9d7728;p=fs%2Flustre-release.git diff --git a/lustre/obdfilter/filter_io_26.c b/lustre/obdfilter/filter_io_26.c index d6a356e..cafb080 100644 --- a/lustre/obdfilter/filter_io_26.c +++ b/lustre/obdfilter/filter_io_26.c @@ -28,6 +28,8 @@ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. + * + * Copyright (c) 2011, 2012, Whamcloud, Inc. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -103,7 +105,7 @@ static void record_start_io(struct filter_iobuf *iobuf, int rw, int size, lprocfs_oh_tally(&exp->exp_nid_stats->nid_brw_stats-> hist[BRW_W_RPC_HIST], cfs_atomic_read(&filter-> \ - fo_r_in_flight)); + fo_w_in_flight)); lprocfs_oh_tally_log2(&exp->exp_nid_stats-> nid_brw_stats->hist[BRW_W_DISK_IOSIZE], size); @@ -127,7 +129,13 @@ static void record_finish_io(struct filter_iobuf *iobuf, int rw, int rc) cfs_waitq_signal(&iobuf->dr_wait); } +#ifdef HAVE_BIO_ENDIO_2ARG +#define DIO_RETURN(a) +static void dio_complete_routine(struct bio *bio, int error) +#else +#define DIO_RETURN(a) return(a) static int dio_complete_routine(struct bio *bio, unsigned int done, int error) +#endif { struct filter_iobuf *iobuf = bio->bi_private; struct bio_vec *bvl; @@ -136,14 +144,22 @@ static int dio_complete_routine(struct bio *bio, unsigned int done, int error) /* CAVEAT EMPTOR: possibly in IRQ context * DO NOT record procfs stats here!!! */ - if (bio->bi_size) /* Not complete */ - return 1; +#ifdef HAVE_BIO_ENDIO_2ARG + /* The "bi_size" check was needed for kernels < 2.6.24 in order to + * handle the case where a SCSI request error caused this callback + * to be called before all of the biovecs had been processed. + * Without this check the server thread will hang. In newer kernels + * the bio_end_io routine is never called for partial completions, + * so this check is no longer needed. */ + if (bio->bi_size) /* Not complete */ + DIO_RETURN(1); +#endif if (unlikely(iobuf == NULL)) { CERROR("***** bio->bi_private is NULL! This should never " "happen. Normally, I would crash here, but instead I " "will dump the bio contents to the console. Please " - "report this to , along " + "report this to , along " "with any interesting messages leading up to this point " "(like SCSI errors, perhaps). Because bi_private is " "NULL, I can't wake up the thread that initiated this " @@ -154,7 +170,7 @@ static int dio_complete_routine(struct bio *bio, unsigned int done, int error) bio->bi_rw, bio->bi_vcnt, bio->bi_idx, bio->bi_size, bio->bi_end_io, cfs_atomic_read(&bio->bi_cnt), bio->bi_private); - return 0; + DIO_RETURN(0); } /* the check is outside of the cycle for performance reason -bzzz */ @@ -185,7 +201,7 @@ static int dio_complete_routine(struct bio *bio, unsigned int done, int error) * deadlocking the OST. The bios are now released as soon as complete * so the pool cannot be exhausted while IOs are competing. bug 10076 */ bio_put(bio); - return 0; + DIO_RETURN(0); } static int can_be_merged(struct bio *bio, sector_t sector) @@ -356,19 +372,22 @@ int filter_do_bio(struct obd_export *exp, struct inode *inode, continue; /* added this frag OK */ if (bio != NULL) { - request_queue_t *q = + struct request_queue *q = bdev_get_queue(bio->bi_bdev); /* Dang! I have to fragment this I/O */ CDEBUG(D_INODE, "bio++ sz %d vcnt %d(%d) " - "sectors %d(%d) psg %d(%d) hsg %d(%d)\n", + "sectors %d(%d) psg %d(%d) hsg %d(%d) " + "sector %llu next %llu\n", bio->bi_size, bio->bi_vcnt, bio->bi_max_vecs, - bio->bi_size >> 9, q->max_sectors, + bio->bi_size >> 9, queue_max_sectors(q), bio_phys_segments(q, bio), - q->max_phys_segments, + queue_max_phys_segments(q), bio_hw_segments(q, bio), - q->max_hw_segments); + queue_max_hw_segments(q), + (unsigned long long)bio->bi_sector, + (unsigned long long)sector); record_start_io(iobuf, rw, bio->bi_size, exp); rc = fsfilt_send_bio(rw, obd, inode, bio); @@ -381,7 +400,7 @@ int filter_do_bio(struct obd_export *exp, struct inode *inode, } /* allocate new bio, limited by max BIO size, b=9945 */ - bio = bio_alloc(GFP_NOIO, max(BIO_MAX_PAGES, + bio = bio_alloc(GFP_NOIO, min(BIO_MAX_PAGES, (npages - page_idx) * blocks_per_page)); if (bio == NULL) { @@ -464,7 +483,7 @@ int filter_direct_io(int rw, struct dentry *dchild, struct filter_iobuf *iobuf, struct inode *inode = dchild->d_inode; int blocks_per_page = CFS_PAGE_SIZE >> inode->i_blkbits; int rc, rc2, create; - cfs_semaphore_t *sem; + cfs_mutex_t *mutex; ENTRY; LASSERTF(iobuf->dr_npages <= iobuf->dr_max_pages, "%d,%d\n", @@ -475,20 +494,25 @@ int filter_direct_io(int rw, struct dentry *dchild, struct filter_iobuf *iobuf, if (iobuf->dr_npages == 0) RETURN(0); create = 0; - sem = NULL; + mutex = NULL; } else { LASSERTF(rw == OBD_BRW_WRITE, "%x\n", rw); LASSERT(iobuf->dr_npages > 0); create = 1; - sem = &obd->u.filter.fo_alloc_lock; + mutex = &obd->u.filter.fo_alloc_lock; lquota_enforce(filter_quota_interface_ref, obd, iobuf->dr_ignore_quota); } - rc = fsfilt_map_inode_pages(obd, inode, iobuf->dr_pages, + if (rw == OBD_BRW_WRITE && + OBD_FAIL_CHECK(OBD_FAIL_OST_MAPBLK_ENOSPC)) { + rc = -ENOSPC; + } else { + rc = fsfilt_map_inode_pages(obd, inode, iobuf->dr_pages, iobuf->dr_npages, iobuf->dr_blocks, - obdfilter_created_scratchpad, create, sem); + obdfilter_created_scratchpad, create, mutex); + } if (rw == OBD_BRW_WRITE) { if (rc == 0) { @@ -503,6 +527,13 @@ int filter_direct_io(int rw, struct dentry *dchild, struct filter_iobuf *iobuf, UNLOCK_INODE_MUTEX(inode); + /* Force commit to make the just-deleted blocks + * reusable. LU-456 */ + if (rc == -ENOSPC) { + fsfilt_commit(obd, inode, oti->oti_handle, 1); + RETURN(rc); + } + rc2 = filter_finish_transno(exp, inode, oti, 0, 0); if (rc2 != 0) { CERROR("can't close transaction: %d\n", rc2); @@ -510,8 +541,11 @@ int filter_direct_io(int rw, struct dentry *dchild, struct filter_iobuf *iobuf, rc = rc2; } - rc2 = fsfilt_commit_async(obd,inode,oti->oti_handle, - wait_handle); + if (wait_handle) + rc2 = fsfilt_commit_async(obd,inode,oti->oti_handle, + wait_handle); + else + rc2 = fsfilt_commit(obd, inode, oti->oti_handle, 0); if (rc == 0) rc = rc2; if (rc != 0) @@ -572,10 +606,12 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, int i, err, cleanup_phase = 0; struct obd_device *obd = exp->exp_obd; struct filter_obd *fo = &obd->u.filter; - void *wait_handle; + void *wait_handle = NULL; int total_size = 0; unsigned int qcids[MAXQUOTAS] = { oa->o_uid, oa->o_gid }; int rec_pending[MAXQUOTAS] = { 0, 0 }, quota_pages = 0; + int sync_journal_commit = obd->u.filter.fo_syncjournal; + int retries = 0; ENTRY; LASSERT(oti != NULL); @@ -643,6 +679,10 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, (flags & (OBD_BRW_FROM_GRANT | OBD_BRW_SYNC)) == OBD_BRW_FROM_GRANT) iobuf->dr_ignore_quota = 1; + + if (!(lnb->flags & OBD_BRW_ASYNC)) { + sync_journal_commit = 1; + } } /* we try to get enough quota to write here, and let ldiskfs @@ -653,11 +693,15 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, if (rc == -ENOTCONN) GOTO(cleanup, rc); + if (OBD_FAIL_CHECK(OBD_FAIL_OST_DQACQ_NET)) + GOTO(cleanup, rc = -EINPROGRESS); + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); cleanup_phase = 2; - DQUOT_INIT(inode); + fsfilt_check_slow(obd, now, "quota init"); +retry: LOCK_INODE_MUTEX(inode); fsfilt_check_slow(obd, now, "i_mutex"); oti->oti_handle = fsfilt_brw_start(obd, objcount, &fso, niocount, res, @@ -674,6 +718,9 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, fsfilt_check_slow(obd, now, "brw_start"); + /* Locking order: i_mutex -> journal_lock -> dqptr_sem. LU-952 */ + ll_vfs_dq_init(inode); + i = OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME; /* If the inode still has SUID+SGID bits set (see filter_precreate()) @@ -715,25 +762,55 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, /* filter_direct_io drops i_mutex */ rc = filter_direct_io(OBD_BRW_WRITE, res->dentry, iobuf, exp, &iattr, - oti, &wait_handle); + oti, sync_journal_commit ? &wait_handle : NULL); + if (rc == -ENOSPC && retries++ < 3) { + CDEBUG(D_INODE, "retry after force commit, retries:%d\n", + retries); + oti->oti_handle = NULL; + fsfilt_check_slow(obd, now, "direct_io"); + goto retry; + } - obdo_from_inode(oa, inode, NULL, rc == 0 ? FILTER_VALID_FLAGS : 0 | - OBD_MD_FLUID |OBD_MD_FLGID); + obdo_from_inode(oa, inode, (rc == 0 ? FILTER_VALID_FLAGS : 0) | + OBD_MD_FLUID | OBD_MD_FLGID); lquota_getflag(filter_quota_interface_ref, obd, oa); fsfilt_check_slow(obd, now, "direct_io"); - err = fsfilt_commit_wait(obd, inode, wait_handle); + if (wait_handle) + err = fsfilt_commit_wait(obd, inode, wait_handle); + else + err = 0; + if (err) { CERROR("Failure to commit OST transaction (%d)?\n", err); - rc = err; + if (rc == 0) + rc = err; } - if (obd->obd_replayable && !rc) - LASSERTF(oti->oti_transno <= obd->obd_last_committed, - "oti_transno "LPU64" last_committed "LPU64"\n", - oti->oti_transno, obd->obd_last_committed); + /* In rare cases fsfilt_commit_wait() will wake up and return after + * the transaction has finished its work and updated j_commit_sequence + * but the commit callbacks have not been run yet. Wait here until + * that is finished so that clients requesting sync IO don't see the + * reply transno < last_committed. LU-753 */ + if (unlikely(obd->obd_replayable && !rc && wait_handle && + oti->oti_transno > obd->obd_last_committed)) { + cfs_waitq_t wq; + struct l_wait_info lwi = + LWI_TIMEOUT_INTERVAL(cfs_time_seconds(5), + (cfs_duration_t)((HZ + 4)/5), + NULL, NULL); + cfs_waitq_init(&wq); + l_wait_event(wq, + oti->oti_transno <= obd->obd_last_committed, + &lwi); + + /* commit callback isn't done after waiting for 5 secs ? */ + if (unlikely(oti->oti_transno > obd->obd_last_committed)) + CERROR("transno:"LPU64" > last_committed:"LPU64"\n", + oti->oti_transno, obd->obd_last_committed); + } fsfilt_check_slow(obd, now, "commitrw commit");