From b08e26058917c2852af0a1fcb349deb63ecd36d5 Mon Sep 17 00:00:00 2001 From: nic Date: Tue, 2 Nov 2004 17:39:02 +0000 Subject: [PATCH] land b1_2_bug2248 on b1_4 - return async write errors to application if possible (2248) --- lustre/ChangeLog | 6 ++++ lustre/include/linux/lustre_lite.h | 3 ++ lustre/include/linux/lustre_net.h | 2 ++ lustre/include/linux/obd.h | 11 +++++++ lustre/llite/file.c | 23 ++++++++++++- lustre/llite/rw24.c | 10 ++++-- lustre/llite/rw26.c | 12 ++++--- lustre/lov/lov_obd.c | 16 +++++++++ lustre/osc/osc_request.c | 43 ++++++++++++++++++------- lustre/portals/include/linux/portals_compat25.h | 11 +++++-- lustre/portals/libcfs/debug.c | 5 ++- lustre/ptlrpc/client.c | 16 ++++++++- lustre/tests/recovery-small.sh | 15 +++++++++ 13 files changed, 147 insertions(+), 26 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index dc192d1..95e485a 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -1,8 +1,13 @@ tbd Cluster File Systems, Inc. * version 1.3.4 * bug fixes + - flock/lockf fixes + - don't use EXT3 constants in llite code (5094) + - return async write errors to application if possible (2248) * miscellania - reorganization of lov code + - single portals codebase + - Infiniband NAL tbd Cluster File Systems, Inc. * version 1.2.8 @@ -27,6 +32,7 @@ tbd Cluster File Systems, Inc. - add software watchdogs to catch hung threads quickly (4941) - make lustrefs init script start after nfs is mounted - fix CWARN/ERROR duplication (4930) + - add /proc/sys/portal/memused (bytes allocated by PORTALS_ALLOC) - print NAL number in %x format (4645) 2004-10-07 Cluster File Systems, Inc. 
diff --git a/lustre/include/linux/lustre_lite.h b/lustre/include/linux/lustre_lite.h index 69b8d51..90c94bb 100644 --- a/lustre/include/linux/lustre_lite.h +++ b/lustre/include/linux/lustre_lite.h @@ -92,6 +92,9 @@ struct ll_inode_info { struct list_head lli_close_item; + /* for writepage() only to communicate to fsync */ + int lli_async_rc; + struct file_operations *ll_save_ifop; struct file_operations *ll_save_ffop; struct file_operations *ll_save_wfop; diff --git a/lustre/include/linux/lustre_net.h b/lustre/include/linux/lustre_net.h index 30a2780..335cbc5 100644 --- a/lustre/include/linux/lustre_net.h +++ b/lustre/include/linux/lustre_net.h @@ -638,6 +638,8 @@ void ptlrpc_prep_bulk_page(struct ptlrpc_bulk_desc *desc, void ptlrpc_retain_replayable_request(struct ptlrpc_request *req, struct obd_import *imp); __u64 ptlrpc_next_xid(void); +__u64 ptlrpc_sample_next_xid(void); +__u64 ptlrpc_req_xid(struct ptlrpc_request *request); /* ptlrpc/service.c */ void ptlrpc_save_lock (struct ptlrpc_request *req, diff --git a/lustre/include/linux/obd.h b/lustre/include/linux/obd.h index 7f01b9b..141d5a2 100644 --- a/lustre/include/linux/obd.h +++ b/lustre/include/linux/obd.h @@ -44,6 +44,12 @@ struct loi_oap_pages { struct list_head lop_pending_group; }; +struct osc_async_rc { + int ar_rc; /* first async write error recorded and not yet cleared */ + int ar_force_sync; /* set on a write error; osc_enter_cache() then returns -EDQUOT */ + __u64 ar_min_xid; /* ptlrpc_sample_next_xid() at error time; xids are __u64, int would truncate */ +}; + struct lov_oinfo { /* per-stripe data structure */ __u64 loi_id; /* object ID on the target OST */ __u64 loi_gr; /* object group on the target OST */ @@ -63,6 +69,8 @@ struct lov_oinfo { /* per-stripe data structure */ __u64 loi_rss; /* recently seen size */ __u64 loi_mtime; /* recently seen mtime */ __u64 loi_blocks; /* recently seen blocks */ + + struct osc_async_rc loi_ar; }; static inline void loi_init(struct lov_oinfo *loi) @@ -263,6 +271,9 @@ struct client_obd { struct mdc_rpc_lock *cl_rpc_lock; struct mdc_rpc_lock *cl_setattr_lock; struct osc_creator cl_oscc; + + /* also protected by the poorly named _loi_list_lock lock above */ + struct 
osc_async_rc cl_ar; }; /* Like a client, with some hangers-on. Keep mc_client_obd first so that we diff --git a/lustre/llite/file.c b/lustre/llite/file.c index 5bd0eab..c06de8c 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -78,6 +78,8 @@ int ll_mdc_close(struct obd_export *mdc_exp, struct inode *inode, RETURN(rc); } +int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm); + /* While this returns an error code, fput() the caller does not, so we need * to make every effort to clean up all of our state here. Also, applications * rarely check close errors and even if an error is returned they will not @@ -87,6 +89,8 @@ int ll_file_release(struct inode *inode, struct file *file) { struct ll_file_data *fd; struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_inode_info *lli = ll_i2info(inode); + struct lov_stripe_md *lsm = lli->lli_smd; int rc; ENTRY; @@ -101,6 +105,10 @@ int ll_file_release(struct inode *inode, struct file *file) fd = (struct ll_file_data *)file->private_data; LASSERT(fd != NULL); + if (lsm) + lov_test_and_clear_async_rc(lsm); + lli->lli_async_rc = 0; + rc = ll_mdc_close(sbi->ll_mdc_exp, inode, file); RETURN(rc); } @@ -1138,7 +1146,8 @@ loff_t ll_file_seek(struct file *file, loff_t offset, int origin) int ll_fsync(struct file *file, struct dentry *dentry, int data) { struct inode *inode = dentry->d_inode; - struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; + struct ll_inode_info *lli = ll_i2info(inode); + struct lov_stripe_md *lsm = lli->lli_smd; struct ll_fid fid; struct ptlrpc_request *req; int rc, err; @@ -1152,6 +1161,18 @@ int ll_fsync(struct file *file, struct dentry *dentry, int data) * that IO to finish before calling the osc and mdc sync methods */ rc = filemap_fdatawait(inode->i_mapping); + /* catch async errors that were recorded back when async writeback + * failed for pages in this mapping. 
*/ + err = lli->lli_async_rc; + lli->lli_async_rc = 0; + if (rc == 0) + rc = err; + if (lsm) { + err = lov_test_and_clear_async_rc(lsm); + if (rc == 0) + rc = err; + } + ll_inode2fid(&fid, inode); err = mdc_sync(ll_i2sbi(inode)->ll_mdc_exp, &fid, &req); if (!rc) diff --git a/lustre/llite/rw24.c b/lustre/llite/rw24.c index 9c4e2a2..736caf3 100644 --- a/lustre/llite/rw24.c +++ b/lustre/llite/rw24.c @@ -52,6 +52,7 @@ static int ll_writepage_24(struct page *page) { struct inode *inode = page->mapping->host; + struct ll_inode_info *lli = ll_i2info(inode); struct obd_export *exp; struct ll_async_page *llap; int rc = 0; @@ -71,12 +72,12 @@ static int ll_writepage_24(struct page *page) page_cache_get(page); if (llap->llap_write_queued) { LL_CDEBUG_PAGE(D_PAGE, page, "marking urgent\n"); - rc = obd_set_async_flags(exp, ll_i2info(inode)->lli_smd, NULL, + rc = obd_set_async_flags(exp, lli->lli_smd, NULL, llap->llap_cookie, ASYNC_READY | ASYNC_URGENT); } else { llap->llap_write_queued = 1; - rc = obd_queue_async_io(exp, ll_i2info(inode)->lli_smd, NULL, + rc = obd_queue_async_io(exp, lli->lli_smd, NULL, llap->llap_cookie, OBD_BRW_WRITE, 0, 0, 0, ASYNC_READY | ASYNC_URGENT); if (rc == 0) @@ -87,8 +88,11 @@ static int ll_writepage_24(struct page *page) if (rc) page_cache_release(page); out: - if (rc) + if (rc) { + if (!lli->lli_async_rc) + lli->lli_async_rc = rc; unlock_page(page); + } RETURN(rc); } diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c index b585b09..bade134 100644 --- a/lustre/llite/rw26.c +++ b/lustre/llite/rw26.c @@ -54,6 +54,7 @@ static int ll_writepage_26(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; + struct ll_inode_info *lli = ll_i2info(inode); struct obd_export *exp; struct ll_async_page *llap; int rc; @@ -73,12 +74,12 @@ static int ll_writepage_26(struct page *page, struct writeback_control *wbc) page_cache_get(page); if (llap->llap_write_queued) { LL_CDEBUG_PAGE(D_PAGE, page, "marking urgent\n"); - rc = 
obd_set_async_flags(exp, ll_i2info(inode)->lli_smd, NULL, + rc = obd_set_async_flags(exp, lli->lli_smd, NULL, llap->llap_cookie, ASYNC_READY | ASYNC_URGENT); } else { llap->llap_write_queued = 1; - rc = obd_queue_async_io(exp, ll_i2info(inode)->lli_smd, NULL, + rc = obd_queue_async_io(exp, lli->lli_smd, NULL, llap->llap_cookie, OBD_BRW_WRITE, 0, 0, 0, ASYNC_READY | ASYNC_URGENT); if (rc == 0) @@ -89,10 +90,13 @@ static int ll_writepage_26(struct page *page, struct writeback_control *wbc) if (rc) page_cache_release(page); out: - if (rc) + if (rc) { + if (!lli->lli_async_rc) + lli->lli_async_rc = rc; unlock_page(page); - else + } else { set_page_writeback(page); + } RETURN(rc); } diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index 5ca8bf3..8afa23d 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -1662,6 +1662,22 @@ static int lov_set_info(struct obd_export *exp, obd_count keylen, #undef KEY_IS } +int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm) +{ + struct lov_oinfo *loi; + int i, rc = 0; + ENTRY; + + for (i = 0, loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; + i++, loi++) { + if (loi->loi_ar.ar_rc && !rc) + rc = loi->loi_ar.ar_rc; + loi->loi_ar.ar_rc = 0; + } + RETURN(rc); +} +EXPORT_SYMBOL(lov_test_and_clear_async_rc); + #if 0 struct lov_multi_wait { struct ldlm_lock *lock; diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 9e7f5e6..ff1dca9 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -1180,6 +1180,28 @@ unlock: spin_unlock(&oap->oap_cli->cl_loi_list_lock); } +/* this is trying to propagate async writeback errors back up to the + * application. As an async write fails we record the error code for later if + * the app does an fsync. As long as errors persist we force future rpcs to be + * sync so that the app can get a sync error and break the cycle of queueing + * pages for which writeback will fail. 
*/ +static void osc_process_ar(struct osc_async_rc *ar, struct ptlrpc_request *req, + int rc) +{ + if (rc) { + if (!ar->ar_rc) + ar->ar_rc = rc; + + ar->ar_force_sync = 1; + ar->ar_min_xid = ptlrpc_sample_next_xid(); + return; + + } + + if (ar->ar_force_sync && (ptlrpc_req_xid(req) >= ar->ar_min_xid)) + ar->ar_force_sync = 0; +} + /* this must be called holding the loi list lock to give coverage to exit_cache, * async_flag maintenance, and oap_request */ static void osc_ap_completion(struct client_obd *cli, struct obdo *oa, @@ -1190,6 +1212,12 @@ static void osc_ap_completion(struct client_obd *cli, struct obdo *oa, oap->oap_interrupted = 0; if (oap->oap_request != NULL) { + if (sent && oap->oap_cmd == OBD_BRW_WRITE) { + osc_process_ar(&cli->cl_ar, oap->oap_request, rc); + osc_process_ar(&oap->oap_loi->loi_ar, + oap->oap_request, rc); + } + ptlrpc_req_finished(oap->oap_request); oap->oap_request = NULL; } @@ -1220,7 +1248,6 @@ static int brw_interpret_oap(struct ptlrpc_request *request, struct list_head *pos, *n; ENTRY; - rc = osc_brw_fini_request(request, aa->aa_oa, aa->aa_requested_nob, aa->aa_nio_count, aa->aa_page_count, aa->aa_pga, rc); @@ -1228,15 +1255,6 @@ static int brw_interpret_oap(struct ptlrpc_request *request, CDEBUG(D_INODE, "request %p aa %p rc %d\n", request, aa, rc); cli = aa->aa_cli; - /* in failout recovery we ignore writeback failure and want - * to just tell llite to unlock the page and continue */ - if (request->rq_reqmsg->opc == OST_WRITE && - (cli->cl_import == NULL || cli->cl_import->imp_invalid)) { - CDEBUG(D_INODE, "flipping to rc 0 imp %p inv %d\n", - cli->cl_import, - cli->cl_import ? cli->cl_import->imp_invalid : -1); - rc = 0; - } spin_lock(&cli->cl_loi_list_lock); @@ -1725,7 +1743,10 @@ static int osc_enter_cache(struct client_obd *cli, struct lov_oinfo *loi, cli->cl_dirty, cli->cl_dirty_max, cli->cl_lost_grant, cli->cl_avail_grant); - if (cli->cl_dirty_max < PAGE_SIZE) + /* force the caller to try sync io. 
this can jump the list + * of queued writes and create a discontiguous rpc stream */ + if (cli->cl_dirty_max < PAGE_SIZE || cli->cl_ar.ar_force_sync || + loi->loi_ar.ar_force_sync) return(-EDQUOT); /* Hopefully normal case - cache space and write credits available */ diff --git a/lustre/portals/include/linux/portals_compat25.h b/lustre/portals/include/linux/portals_compat25.h index e4831aa..fa2709e 100644 --- a/lustre/portals/include/linux/portals_compat25.h +++ b/lustre/portals/include/linux/portals_compat25.h @@ -60,11 +60,16 @@ #endif #if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20)) -# define THREAD_NAME(comm, len, fmt, a...) \ - snprintf(comm, len, fmt "|%d", ## a, current->thread.extern_pid) +#define UML_PID(tsk) ((tsk)->thread.extern_pid) #elif defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) +#define UML_PID(tsk) ((tsk)->thread.mode.tt.extern_pid) +#else +#define UML_PID(tsk) ((tsk)->pid) +#endif + +#if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0)) # define THREAD_NAME(comm, len, fmt, a...) \ - snprintf(comm, len,fmt"|%d", ## a,current->thread.mode.tt.extern_pid) + snprintf(comm, len,fmt"|%d", ## a, UML_PID(current)) #else # define THREAD_NAME(comm, len, fmt, a...) 
\ snprintf(comm, len, fmt, ## a) diff --git a/lustre/portals/libcfs/debug.c b/lustre/portals/libcfs/debug.c index 1e81801..8b4b335 100644 --- a/lustre/portals/libcfs/debug.c +++ b/lustre/portals/libcfs/debug.c @@ -297,13 +297,12 @@ char *portals_id2str(int nal, ptl_process_id_t id, char *str) #ifdef __KERNEL__ - void portals_debug_dumpstack(struct task_struct *tsk) { #if defined(__arch_um__) if (tsk != NULL) - CWARN("stack dump for process %d requested; I'll wake up gdb.\n", - tsk->pid); + CWARN("stack dump for pid %d (%d) requested; wake up gdb.\n", + tsk->pid, UML_PID(tsk)); asm("int $3"); #elif defined(HAVE_SHOW_TASK) /* this is exported by lustre kernel version 42 */ diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index b82c5ce..2bd4f07 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -1097,6 +1097,12 @@ void ptlrpc_req_finished(struct ptlrpc_request *request) __ptlrpc_req_finished(request, 0); } +__u64 ptlrpc_req_xid(struct ptlrpc_request *request) +{ + return request->rq_xid; +} +EXPORT_SYMBOL(ptlrpc_req_xid); + /* Disengage the client's reply buffer from the network * NB does _NOT_ unregister any client-side bulk. * IDEMPOTENT, but _not_ safe against concurrent callers. 
@@ -1664,4 +1670,12 @@ __u64 ptlrpc_next_xid(void) return tmp; } - +__u64 ptlrpc_sample_next_xid(void) +{ + __u64 tmp; + spin_lock(&ptlrpc_last_xid_lock); + tmp = ptlrpc_last_xid + 1; + spin_unlock(&ptlrpc_last_xid_lock); + return tmp; +} +EXPORT_SYMBOL(ptlrpc_sample_next_xid); diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index db2b19f..6865e6c 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -363,4 +363,19 @@ test_20b() { # bug 2986 - ldlm_handle_enqueue error during open } run_test 20b "ldlm_handle_enqueue error (should return error)" +test_21() { # bug 3267 - eviction fails writeback but app doesn't see it + mkdir -p $DIR/$tdir + cancel_lru_locks OSC + multiop $DIR/$tdir/$tfile Owyw_yc & + MULTI_PID=$! + usleep 500 +# OBD_FAIL_PTLRPC_BULK_PUT_NET|OBD_FAIL_ONCE + sysctl -w lustre.fail_loc=0x80000503 + kill -USR1 $MULTI_PID + wait $MULTI_PID + rc=$? + [ $rc -eq 0 ] && error "multiop didn't fail fsync: rc $rc" || true +} +run_test 21 "fsync error (should return error)" + $CLEANUP -- 1.8.3.1