From 6590d01c2a56276422eef4d94bb26eba37e3b6bf Mon Sep 17 00:00:00 2001 From: shadow Date: Mon, 30 Jun 2008 12:29:46 +0000 Subject: [PATCH] badly clear PG_Writeback bit in ll_ap_completion can produce false positive assertion Branch HEAD b=14742 i=nikita i=johan --- lustre/ChangeLog | 7 +++++ lustre/include/linux/lustre_compat25.h | 3 +- lustre/llite/rw.c | 51 ++++++++++++++++++++++------------ lustre/osc/osc_request.c | 21 +++++++++++--- 4 files changed, 60 insertions(+), 22 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index be61455..e33723b 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -12,6 +12,13 @@ tbd Sun Microsystems, Inc. * RHEL 4 and RHEL 5/SLES 10 clients behaves differently on 'cd' to a removed cwd "./" (refer to Bugzilla 14399). +Severity : normal +Bugzilla : 14742 +Frequency : rare +Description: ASSERTION(CheckWriteback(page,cmd)) failed +Details : badly clear PG_Writeback bit in ll_ap_completion can produce false + positive assertion. + Severity : enhancement Bugzilla : 15865 Description: Update to RHEL5 kernel-2.6.18-53.1.21.el5. diff --git a/lustre/include/linux/lustre_compat25.h b/lustre/include/linux/lustre_compat25.h index 975ef22..c8a58f9 100644 --- a/lustre/include/linux/lustre_compat25.h +++ b/lustre/include/linux/lustre_compat25.h @@ -201,7 +201,8 @@ extern void __d_move(struct dentry *dentry, struct dentry *target); #endif #define CheckWriteback(page, cmd) \ - (!(!PageWriteback(page) && cmd == OBD_BRW_WRITE)) + ((!PageWriteback(page) && (cmd & OBD_BRW_READ)) || \ + (PageWriteback(page) && (cmd & OBD_BRW_WRITE))) #ifdef HAVE_PAGE_LIST static inline int mapping_has_pages(struct address_space *mapping) diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index dfe49d8..0393274 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -317,6 +317,14 @@ int ll_prepare_write(struct file *file, struct page *page, unsigned from, return rc; } +/** + * make page ready for ASYNC write + * \param data - pointer to llap cookie + * \param cmd - is OBD_BRW_* macroses + * + * \retval 0 is page successfully prepared to send + * \retval -EAGAIN is page not need to send + */ static int ll_ap_make_ready(void *data, int cmd) { struct ll_async_page *llap; @@ -326,14 +334,13 @@ static int ll_ap_make_ready(void *data, int cmd) llap = LLAP_FROM_COOKIE(data); page = llap->llap_page; - LASSERTF(!(cmd & OBD_BRW_READ), "cmd %x page %p ino %lu index %lu\n", cmd, page, - page->mapping->host->i_ino, page->index); - /* we're trying to write, but the page is locked.. come back later */ if (TryLockPage(page)) RETURN(-EAGAIN); - LASSERT(!PageWriteback(page)); + LASSERTF(!(cmd & OBD_BRW_READ) || !PageWriteback(page), + "cmd %x page %p ino %lu index %lu fl %lx\n", cmd, page, + page->mapping->host->i_ino, page->index, page->flags); /* if we left PageDirty we might get another writepage call * in the future. list walkers are bright enough @@ -344,7 +351,10 @@ static int ll_ap_make_ready(void *data, int cmd) * cli lock */ LASSERTF(!PageWriteback(page),"cmd %x page %p ino %lu index %lu\n", cmd, page, page->mapping->host->i_ino, page->index); - clear_page_dirty_for_io(page); + if(!clear_page_dirty_for_io(page)) { + unlock_page(page); + RETURN(-EAGAIN); + } /* This actually clears the dirty bit in the radix tree.*/ set_page_writeback(page); @@ -788,9 +798,8 @@ static int queue_or_sync_write(struct obd_export *exp, struct inode *inode, if (!rc && async_flags & ASYNC_READY) { unlock_page(llap->llap_page); - if (PageWriteback(llap->llap_page)) { + if (PageWriteback(llap->llap_page)) end_page_writeback(llap->llap_page); - } } if (rc == 0 && llap_write_complete(inode, llap)) @@ -960,6 +969,14 @@ int ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc) set_bit(AS_EIO, &page->mapping->flags); } + /* be carefull about clear WB. + * if WB will cleared after page lock is released - paralel IO can be + * started before ap_make_ready is finished - so we will be have page + * with PG_Writeback set from ->writepage() and completed READ which + * clear this flag */ + if ((cmd & OBD_BRW_WRITE) && PageWriteback(page)) + end_page_writeback(page); + unlock_page(page); if (cmd & OBD_BRW_WRITE) { @@ -970,9 +987,6 @@ int ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc) ll_queue_done_writing(page->mapping->host, 0); } - if (PageWriteback(page)) { - end_page_writeback(page); - } page_cache_release(page); RETURN(ret); @@ -1780,19 +1794,22 @@ int ll_writepage(struct page *page) rc = queue_or_sync_write(exp, inode, llap, CFS_PAGE_SIZE, ASYNC_READY | ASYNC_URGENT); } - if (rc) - page_cache_release(page); -out: if (rc) { - if (!lli->lli_async_rc) - lli->lli_async_rc = rc; /* re-dirty page on error so it retries write */ - if (PageWriteback(page)) { + if (PageWriteback(page)) end_page_writeback(page); - } + /* resend page only for not started IO*/ if (!PageError(page)) ll_redirty_page(page); + + page_cache_release(page); + } +out: + if (rc) { + if (!lli->lli_async_rc) + lli->lli_async_rc = rc; + /* resend page only for not started IO*/ unlock_page(page); } RETURN(rc); diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 679e78f..5e37f69 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -2171,6 +2171,17 @@ out: /* the loi lock is held across this function but it's allowed to release * and reacquire it during its work */ +/** + * prepare pages for ASYNC io and put pages in send queue. + * + * \param cli - + * \param loi - + * \param cmd - OBD_BRW_* macroses + * \param lop - pending pages + * + * \return zero if pages successfully add to send queue. + * \return not zere if error occurring. + */ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, int cmd, struct loi_oap_pages *lop) { @@ -2245,12 +2256,14 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, /* * Page submitted for IO has to be locked. Either by * ->ap_make_ready() or by higher layers. - * - * XXX nikita: this assertion should be adjusted when lustre - * starts using PG_writeback for pages being written out. */ #if defined(__KERNEL__) && defined(__linux__) - LASSERT(PageLocked(oap->oap_page)); + if(!(PageLocked(oap->oap_page) && + (CheckWriteback(oap->oap_page, cmd) || oap->oap_oig !=NULL))) { + CDEBUG(D_PAGE, "page %p lost wb %lx/%x\n", + oap->oap_page, (long)oap->oap_page->flags, oap->oap_async_flags); + LBUG(); + } #endif /* If there is a gap at the start of this page, it can't merge * with any previous page, so we'll hand the network a -- 1.8.3.1