tbd Cluster File Systems, Inc. <info@clusterfs.com>
* version 1.3.4
* bug fixes
+ - flock/lockf fixes
+ - don't use EXT3 constants in llite code (5094)
+ - return async write errors to application if possible (2248)
* miscellanea
- reorganization of lov code
+ - single portals codebase
+ - Infiniband NAL
tbd Cluster File Systems, Inc. <info@clusterfs.com>
* version 1.2.8
- add software watchdogs to catch hung threads quickly (4941)
- make lustrefs init script start after nfs is mounted
- fix CWARN/ERROR duplication (4930)
+ - add /proc/sys/portal/memused (bytes allocated by PORTALS_ALLOC)
- print NAL number in %x format (4645)
2004-10-07 Cluster File Systems, Inc. <info@clusterfs.com>
struct list_head lli_close_item;
+ /* set by writepage() to pass async write errors back to fsync */
+ int lli_async_rc;
+
struct file_operations *ll_save_ifop;
struct file_operations *ll_save_ffop;
struct file_operations *ll_save_wfop;
void ptlrpc_retain_replayable_request(struct ptlrpc_request *req,
struct obd_import *imp);
__u64 ptlrpc_next_xid(void);
+__u64 ptlrpc_sample_next_xid(void);
+__u64 ptlrpc_req_xid(struct ptlrpc_request *request);
/* ptlrpc/service.c */
void ptlrpc_save_lock (struct ptlrpc_request *req,
struct list_head lop_pending_group;
};
+struct osc_async_rc {
+ int ar_rc; /* first async write error seen; cleared by fsync/close */
+ int ar_force_sync; /* force sync RPCs until a newer write succeeds */
+ __u64 ar_min_xid; /* xid a successful write must reach to clear ar_force_sync */
+};
+
struct lov_oinfo { /* per-stripe data structure */
__u64 loi_id; /* object ID on the target OST */
__u64 loi_gr; /* object group on the target OST */
__u64 loi_rss; /* recently seen size */
__u64 loi_mtime; /* recently seen mtime */
__u64 loi_blocks; /* recently seen blocks */
+
+ struct osc_async_rc loi_ar;
};
static inline void loi_init(struct lov_oinfo *loi)
struct mdc_rpc_lock *cl_rpc_lock;
struct mdc_rpc_lock *cl_setattr_lock;
struct osc_creator cl_oscc;
+
+ /* also protected by the poorly named cl_loi_list_lock above */
+ struct osc_async_rc cl_ar;
};
/* Like a client, with some hangers-on. Keep mc_client_obd first so that we
RETURN(rc);
}
+int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm);
+
/* While this returns an error code, the caller fput() does not, so we need
* to make every effort to clean up all of our state here. Also, applications
* rarely check close errors and even if an error is returned they will not
{
struct ll_file_data *fd;
struct ll_sb_info *sbi = ll_i2sbi(inode);
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct lov_stripe_md *lsm = lli->lli_smd;
int rc;
ENTRY;
fd = (struct ll_file_data *)file->private_data;
LASSERT(fd != NULL);
+ if (lsm)
+ lov_test_and_clear_async_rc(lsm);
+ lli->lli_async_rc = 0;
+
rc = ll_mdc_close(sbi->ll_mdc_exp, inode, file);
RETURN(rc);
}
int ll_fsync(struct file *file, struct dentry *dentry, int data)
{
struct inode *inode = dentry->d_inode;
- struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd;
+ struct ll_inode_info *lli = ll_i2info(inode);
+ struct lov_stripe_md *lsm = lli->lli_smd;
struct ll_fid fid;
struct ptlrpc_request *req;
int rc, err;
* that IO to finish before calling the osc and mdc sync methods */
rc = filemap_fdatawait(inode->i_mapping);
+ /* catch async errors recorded earlier, when async writeback failed
+ * for pages in this mapping. */
+ err = lli->lli_async_rc;
+ lli->lli_async_rc = 0;
+ if (rc == 0)
+ rc = err;
+ if (lsm) {
+ err = lov_test_and_clear_async_rc(lsm);
+ if (rc == 0)
+ rc = err;
+ }
+
ll_inode2fid(&fid, inode);
err = mdc_sync(ll_i2sbi(inode)->ll_mdc_exp, &fid, &req);
if (!rc)
static int ll_writepage_24(struct page *page)
{
struct inode *inode = page->mapping->host;
+ struct ll_inode_info *lli = ll_i2info(inode);
struct obd_export *exp;
struct ll_async_page *llap;
int rc = 0;
page_cache_get(page);
if (llap->llap_write_queued) {
LL_CDEBUG_PAGE(D_PAGE, page, "marking urgent\n");
- rc = obd_set_async_flags(exp, ll_i2info(inode)->lli_smd, NULL,
+ rc = obd_set_async_flags(exp, lli->lli_smd, NULL,
llap->llap_cookie,
ASYNC_READY | ASYNC_URGENT);
} else {
llap->llap_write_queued = 1;
- rc = obd_queue_async_io(exp, ll_i2info(inode)->lli_smd, NULL,
+ rc = obd_queue_async_io(exp, lli->lli_smd, NULL,
llap->llap_cookie, OBD_BRW_WRITE, 0, 0,
0, ASYNC_READY | ASYNC_URGENT);
if (rc == 0)
if (rc)
page_cache_release(page);
out:
- if (rc)
+ if (rc) {
+ if (!lli->lli_async_rc)
+ lli->lli_async_rc = rc;
unlock_page(page);
+ }
RETURN(rc);
}
static int ll_writepage_26(struct page *page, struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
+ struct ll_inode_info *lli = ll_i2info(inode);
struct obd_export *exp;
struct ll_async_page *llap;
int rc;
page_cache_get(page);
if (llap->llap_write_queued) {
LL_CDEBUG_PAGE(D_PAGE, page, "marking urgent\n");
- rc = obd_set_async_flags(exp, ll_i2info(inode)->lli_smd, NULL,
+ rc = obd_set_async_flags(exp, lli->lli_smd, NULL,
llap->llap_cookie,
ASYNC_READY | ASYNC_URGENT);
} else {
llap->llap_write_queued = 1;
- rc = obd_queue_async_io(exp, ll_i2info(inode)->lli_smd, NULL,
+ rc = obd_queue_async_io(exp, lli->lli_smd, NULL,
llap->llap_cookie, OBD_BRW_WRITE, 0, 0,
0, ASYNC_READY | ASYNC_URGENT);
if (rc == 0)
if (rc)
page_cache_release(page);
out:
- if (rc)
+ if (rc) {
+ if (!lli->lli_async_rc)
+ lli->lli_async_rc = rc;
unlock_page(page);
- else
+ } else {
set_page_writeback(page);
+ }
RETURN(rc);
}
#undef KEY_IS
}
+/* return the first async write error recorded on any stripe, clearing them all */
+int lov_test_and_clear_async_rc(struct lov_stripe_md *lsm)
+{
+ struct lov_oinfo *loi;
+ int i, rc = 0;
+ ENTRY;
+
+ for (i = 0, loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count;
+ i++, loi++) {
+ if (loi->loi_ar.ar_rc && !rc)
+ rc = loi->loi_ar.ar_rc;
+ loi->loi_ar.ar_rc = 0;
+ }
+ RETURN(rc);
+}
+EXPORT_SYMBOL(lov_test_and_clear_async_rc);
+
#if 0
struct lov_multi_wait {
struct ldlm_lock *lock;
spin_unlock(&oap->oap_cli->cl_loi_list_lock);
}
+/* This tries to propagate async writeback errors back up to the
+ * application.  When an async write fails we record the error code so that a
+ * later fsync can return it.  As long as errors persist we force future RPCs
+ * to be sync, so that the app gets a synchronous error and the cycle of
+ * queueing pages whose writeback will fail is broken. */
+static void osc_process_ar(struct osc_async_rc *ar, struct ptlrpc_request *req,
+ int rc)
+{
+ if (rc) {
+ if (!ar->ar_rc)
+ ar->ar_rc = rc;
+
+ ar->ar_force_sync = 1;
+ ar->ar_min_xid = ptlrpc_sample_next_xid();
+ return;
+ }
+
+ if (ar->ar_force_sync && (ptlrpc_req_xid(req) >= ar->ar_min_xid))
+ ar->ar_force_sync = 0;
+}
+
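For illustration only, a minimal userspace sketch of the force-sync state machine that osc_process_ar() implements above. The names async_rc, process_ar() and next_xid() are hypothetical stand-ins; next_xid() plays the role of ptlrpc_sample_next_xid()/ptlrpc_req_xid() from the ptlrpc layer.

#include <stdio.h>

/* stand-in for struct osc_async_rc */
struct async_rc {
        int rc;                         /* first error seen */
        int force_sync;                 /* force sync I/O while set */
        unsigned long long min_xid;     /* xid a good write must reach */
};

static unsigned long long last_xid;

/* stand-in for ptlrpc_sample_next_xid(): peek at the next xid, do not consume it */
static unsigned long long next_xid(void)
{
        return last_xid + 1;
}

/* mirrors osc_process_ar(): remember the first failure, force sync I/O,
 * and relax only once a write issued after the failure completes OK */
static void process_ar(struct async_rc *ar, unsigned long long req_xid, int rc)
{
        if (rc) {
                if (!ar->rc)
                        ar->rc = rc;
                ar->force_sync = 1;
                ar->min_xid = next_xid();
                return;
        }
        if (ar->force_sync && req_xid >= ar->min_xid)
                ar->force_sync = 0;
}

int main(void)
{
        struct async_rc ar = { 0, 0, 0 };

        process_ar(&ar, ++last_xid, -5);        /* async write fails (-EIO) */
        printf("rc=%d force_sync=%d\n", ar.rc, ar.force_sync);  /* rc=-5 force_sync=1 */

        process_ar(&ar, ++last_xid, 0);         /* a newer write succeeds */
        printf("rc=%d force_sync=%d\n", ar.rc, ar.force_sync);  /* rc=-5 force_sync=0 */
        return 0;
}

Note that rc stays set until fsync or close fetches it, which is what lov_test_and_clear_async_rc() and lli_async_rc provide in the patch.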
/* this must be called holding the loi list lock to give coverage to exit_cache,
* async_flag maintenance, and oap_request */
static void osc_ap_completion(struct client_obd *cli, struct obdo *oa,
oap->oap_interrupted = 0;
if (oap->oap_request != NULL) {
+ if (sent && oap->oap_cmd == OBD_BRW_WRITE) {
+ osc_process_ar(&cli->cl_ar, oap->oap_request, rc);
+ osc_process_ar(&oap->oap_loi->loi_ar,
+ oap->oap_request, rc);
+ }
+
ptlrpc_req_finished(oap->oap_request);
oap->oap_request = NULL;
}
struct list_head *pos, *n;
ENTRY;
-
rc = osc_brw_fini_request(request, aa->aa_oa, aa->aa_requested_nob,
aa->aa_nio_count, aa->aa_page_count,
aa->aa_pga, rc);
CDEBUG(D_INODE, "request %p aa %p rc %d\n", request, aa, rc);
cli = aa->aa_cli;
- /* in failout recovery we ignore writeback failure and want
- * to just tell llite to unlock the page and continue */
- if (request->rq_reqmsg->opc == OST_WRITE &&
- (cli->cl_import == NULL || cli->cl_import->imp_invalid)) {
- CDEBUG(D_INODE, "flipping to rc 0 imp %p inv %d\n",
- cli->cl_import,
- cli->cl_import ? cli->cl_import->imp_invalid : -1);
- rc = 0;
- }
spin_lock(&cli->cl_loi_list_lock);
cli->cl_dirty, cli->cl_dirty_max, cli->cl_lost_grant,
cli->cl_avail_grant);
- if (cli->cl_dirty_max < PAGE_SIZE)
+ /* force the caller to fall back to sync I/O; this can jump the list
+ * of queued writes and create a discontiguous RPC stream */
+ if (cli->cl_dirty_max < PAGE_SIZE || cli->cl_ar.ar_force_sync ||
+ loi->loi_ar.ar_force_sync)
return(-EDQUOT);
/* Hopefully normal case - cache space and write credits available */
#endif
#if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,20))
-# define THREAD_NAME(comm, len, fmt, a...) \
- snprintf(comm, len, fmt "|%d", ## a, current->thread.extern_pid)
+#define UML_PID(tsk) ((tsk)->thread.extern_pid)
#elif defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
+#define UML_PID(tsk) ((tsk)->thread.mode.tt.extern_pid)
+#else
+#define UML_PID(tsk) ((tsk)->pid)
+#endif
+
+#if defined(__arch_um__) && (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,0))
# define THREAD_NAME(comm, len, fmt, a...) \
- snprintf(comm, len,fmt"|%d", ## a,current->thread.mode.tt.extern_pid)
+ snprintf(comm, len, fmt "|%d", ## a, UML_PID(current))
#else
# define THREAD_NAME(comm, len, fmt, a...) \
snprintf(comm, len, fmt, ## a)
#ifdef __KERNEL__
-
void portals_debug_dumpstack(struct task_struct *tsk)
{
#if defined(__arch_um__)
if (tsk != NULL)
- CWARN("stack dump for process %d requested; I'll wake up gdb.\n",
- tsk->pid);
+ CWARN("stack dump for pid %d (%d) requested; wake up gdb.\n",
+ tsk->pid, UML_PID(tsk));
asm("int $3");
#elif defined(HAVE_SHOW_TASK)
/* this is exported by lustre kernel version 42 */
__ptlrpc_req_finished(request, 0);
}
+__u64 ptlrpc_req_xid(struct ptlrpc_request *request)
+{
+ return request->rq_xid;
+}
+EXPORT_SYMBOL(ptlrpc_req_xid);
+
/* Disengage the client's reply buffer from the network
* NB does _NOT_ unregister any client-side bulk.
* IDEMPOTENT, but _not_ safe against concurrent callers.
return tmp;
}
-
+/* peek at the xid the next request will be assigned, without consuming it */
+__u64 ptlrpc_sample_next_xid(void)
+{
+ __u64 tmp;
+ spin_lock(&ptlrpc_last_xid_lock);
+ tmp = ptlrpc_last_xid + 1;
+ spin_unlock(&ptlrpc_last_xid_lock);
+ return tmp;
+}
+EXPORT_SYMBOL(ptlrpc_sample_next_xid);
}
run_test 20b "ldlm_handle_enqueue error (should return error)"
+test_21() { # bug 3267 - eviction fails writeback but app doesn't see it
+ mkdir -p $DIR/$tdir
+ cancel_lru_locks OSC
+ multiop $DIR/$tdir/$tfile Owyw_yc &
+ MULTI_PID=$!
+ usleep 500
+# OBD_FAIL_PTLRPC_BULK_PUT_NET|OBD_FAIL_ONCE
+ sysctl -w lustre.fail_loc=0x80000503
+ kill -USR1 $MULTI_PID
+ wait $MULTI_PID
+ rc=$?
+ [ $rc -eq 0 ] && error "multiop didn't fail fsync: rc $rc" || true
+}
+run_test 21 "fsync error (should return error)"
+
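For reference, a minimal sketch of the application-visible behaviour test 21 exercises (the test drives the same open/write/fsync sequence through multiop): once asynchronous writeback has failed, a later fsync() returns the recorded error instead of silently succeeding. The path below is only a placeholder for a file on a Lustre mount ($DIR/$tdir/$tfile in the test).

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        /* placeholder path; the test uses $DIR/$tdir/$tfile */
        int fd = open("/mnt/lustre/d21/f21", O_WRONLY | O_CREAT, 0644);
        if (fd < 0) {
                perror("open");
                return 1;
        }

        if (write(fd, "data", 4) != 4)
                perror("write");

        /* if async writeback of the dirty pages failed (in the test this is
         * provoked by the fail_loc above), the error has been stashed in
         * lli_async_rc/loi_ar and fsync() reports it instead of returning 0 */
        if (fsync(fd) < 0)
                fprintf(stderr, "fsync: %s\n", strerror(errno));

        close(fd);
        return 0;
}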
$CLEANUP