Description: aborting recovery hang on MDS
Details : don't throttle destroy RPCs for the MDT.
+Severity : enhancement
+Bugzilla : 16919
+Description: Don't sync journal after every i/o
+Details : Implement write RPC replay to allow server replies for write RPCs
+ before data is on disk.
+
-------------------------------------------------------------------------------
2008-12-31 Sun Microsystems, Inc.
* version 1.8.0
#define OBD_BRW_DROP 0x80 /* drop the page after IO */
#define OBD_BRW_NOQUOTA 0x100
#define OBD_BRW_SRVLOCK 0x200 /* Client holds no lock over this page */
+#define OBD_BRW_ASYNC 0x400 /* Server may delay commit to disk */
#define OBD_OBJECT_EOF 0xffffffffffffffffULL
int fo_fmd_max_num; /* per exp filter_mod_data */
int fo_fmd_max_age; /* jiffies to fmd expiry */
+ int fo_syncjournal; /* sync journal on writes */
struct llog_commit_master *fo_lcm;
};
struct ptlrpc_service *ost_create_service;
struct ptlrpc_service *ost_io_service;
struct semaphore ost_health_sem;
+ int ost_sync_on_lock_cancel;
};
struct echo_client_obd {
#define OBD_RECOVERY_FACTOR (3) /* times obd_timeout */
/* Change recovery-small 26b time if you change this */
#define PING_INTERVAL max(obd_timeout / 4, 1U)
+/* a bit more than maximal journal commit time in seconds */
+#define PING_INTERVAL_SHORT 7
/* Client may skip 1 ping; we must wait at least 2.5. But for multiple
* failover targets the client only pings one server at a time, and pings
* can be lost on a loaded network. Since eviction has serious consequences,
struct obd_io_group *oig;
struct ll_sb_info *sbi = ll_i2sbi(inode);
int rc, noquot = llap->llap_ignore_quota ? OBD_BRW_NOQUOTA : 0;
+ int brwflags = OBD_BRW_ASYNC;
ENTRY;
/* _make_ready only sees llap once we've unlocked the page */
llap->llap_write_queued = 1;
rc = obd_queue_async_io(exp, ll_i2info(inode)->lli_smd, NULL,
llap->llap_cookie, OBD_BRW_WRITE | noquot,
- 0, 0, 0, async_flags);
+ 0, 0, brwflags, async_flags);
if (rc == 0) {
LL_CDEBUG_PAGE(D_PAGE, llap->llap_page, "write queued\n");
llap_write_pending(inode, llap);
filter->fo_readcache_max_filesize = FILTER_MAX_CACHE_SIZE;
filter->fo_fmd_max_num = FILTER_FMD_MAX_NUM_DEFAULT;
filter->fo_fmd_max_age = FILTER_FMD_MAX_AGE_DEFAULT;
+ filter->fo_syncjournal = 0; /* Don't sync journals on i/o by default */
rc = filter_prep(obd);
if (rc)
ENTRY;
if (oa->o_id > filter_last_id(&obd->u.filter, oa->o_gr)) {
- CERROR("recreate objid "LPU64" > last id "LPU64"\n",
- oa->o_id, filter_last_id(&obd->u.filter, oa->o_gr));
- RETURN(-EINVAL);
- }
-
- if ((oa->o_valid & OBD_MD_FLFLAGS) == 0) {
- oa->o_valid |= OBD_MD_FLFLAGS;
- oa->o_flags = OBD_FL_RECREATE_OBJS;
+ if (!obd->obd_recovering ||
+ oa->o_id > filter_last_id(&obd->u.filter, oa->o_gr) +
+ OST_MAX_PRECREATE) {
+ CERROR("recreate objid "LPU64" > last id "LPU64"\n",
+ oa->o_id, filter_last_id(&obd->u.filter,
+ oa->o_gr));
+ RETURN(-EINVAL);
+ }
+ diff = oa->o_id - filter_last_id(&obd->u.filter, oa->o_gr);
} else {
- oa->o_flags |= OBD_FL_RECREATE_OBJS;
+ if ((oa->o_valid & OBD_MD_FLFLAGS) == 0) {
+ oa->o_valid |= OBD_MD_FLFLAGS;
+ oa->o_flags = OBD_FL_RECREATE_OBJS;
+ } else {
+ oa->o_flags |= OBD_FL_RECREATE_OBJS;
+ }
}
down(&obd->u.filter.fo_create_lock);
cleanup_phase = 2;
if (dentry->d_inode == NULL) {
- CERROR("%s: trying to BRW to non-existent file "LPU64"\n",
- obd->obd_name, obj->ioo_id);
- GOTO(cleanup, rc = -ENOENT);
+ if (exp->exp_obd->obd_recovering) {
+ struct obdo *noa = oa;
+
+ if (oa == NULL) {
+ OBDO_ALLOC(noa);
+ if (noa == NULL)
+ GOTO(recreate_out, rc = -ENOMEM);
+ noa->o_id = obj->ioo_id;
+ noa->o_valid = OBD_MD_FLID;
+ }
+
+ if (filter_recreate(exp->exp_obd, noa) == 0) {
+ f_dput(dentry);
+ dentry = filter_fid2dentry(exp->exp_obd, NULL,
+ obj->ioo_gr,
+ obj->ioo_id);
+ }
+ if (oa == NULL)
+ OBDO_FREE(noa);
+ }
+ recreate_out:
+ if (IS_ERR(dentry) || dentry->d_inode == NULL) {
+ CERROR("%s: BRW to missing obj "LPU64"/"LPU64":rc %d\n",
+ exp->exp_obd->obd_name,
+ obj->ioo_id, obj->ioo_gr,
+ IS_ERR(dentry) ? (int)PTR_ERR(dentry) : -ENOENT);
+ if (IS_ERR(dentry))
+ cleanup_phase = 1;
+ GOTO(cleanup, rc = -ENOENT);
+ }
}
rc = filter_map_remote_to_local(objcount, obj, nb, pages, res);
rc = rc2;
}
- rc2 =fsfilt_commit_async(obd,inode,oti->oti_handle,wait_handle);
+ if (wait_handle)
+ rc2 = fsfilt_commit_async(obd, inode, oti->oti_handle,
+ wait_handle);
+ else
+ rc2 = fsfilt_commit(obd, inode, oti->oti_handle, 0);
if (rc == 0)
rc = rc2;
if (rc != 0)
int i, err, cleanup_phase = 0;
struct obd_device *obd = exp->exp_obd;
struct filter_obd *fo = &obd->u.filter;
- void *wait_handle;
+ void *wait_handle = NULL;
int total_size = 0;
int rec_pending = 0;
unsigned int qcids[MAXQUOTAS] = {0, 0};
+ int sync_journal_commit = obd->u.filter.fo_syncjournal;
ENTRY;
LASSERT(oti != NULL);
(lnb->flags & (OBD_BRW_FROM_GRANT | OBD_BRW_SYNC)) ==
OBD_BRW_FROM_GRANT)
iobuf->dr_ignore_quota = 1;
+
+ if (!(lnb->flags & OBD_BRW_ASYNC)) {
+ sync_journal_commit = 1;
+ }
}
push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL);
/* filter_direct_io drops i_mutex */
rc = filter_direct_io(OBD_BRW_WRITE, res->dentry, iobuf, exp, &iattr,
- oti, &wait_handle);
+ oti, sync_journal_commit ? &wait_handle : NULL);
if (rc == 0)
obdo_from_inode(oa, inode,
FILTER_VALID_FLAGS |OBD_MD_FLUID |OBD_MD_FLGID);
fsfilt_check_slow(obd, now, "direct_io");
- err = fsfilt_commit_wait(obd, inode, wait_handle);
+ if (wait_handle)
+ err = fsfilt_commit_wait(obd, inode, wait_handle);
+ else
+ err = 0;
+
if (err) {
CERROR("Failure to commit OST transaction (%d)?\n", err);
- rc = err;
+ if (rc == 0)
+ rc = err;
}
- if (obd->obd_replayable && !rc)
+ if (obd->obd_replayable && !rc && wait_handle)
LASSERTF(oti->oti_transno <= obd->obd_last_committed,
"oti_transno "LPU64" last_committed "LPU64"\n",
oti->oti_transno, obd->obd_last_committed);
return count;
}
+/* lprocfs read handler for obdfilter.*.sync_journal: report whether the
+ * filter forces a journal commit on every write RPC (0 = async replies
+ * allowed, non-zero = sync). */
+int lprocfs_filter_rd_syncjournal(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ struct obd_device *obd = data;
+ int rc;
+
+ rc = snprintf(page, count, "%u\n", obd->u.filter.fo_syncjournal);
+ return rc;
+}
+
+/* lprocfs write handler for obdfilter.*.sync_journal: accept an integer and
+ * normalize it to 0/1 in fo_syncjournal. Negative values are rejected. */
+int lprocfs_filter_wr_syncjournal(struct file *file, const char *buffer,
+ unsigned long count, void *data)
+{
+ struct obd_device *obd = data;
+ int val;
+ int rc;
+
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+
+ if (val < 0)
+ return -EINVAL;
+
+ /* !!val collapses any positive value to 1 */
+ obd->u.filter.fo_syncjournal = !!val;
+ return count;
+}
+
static struct lprocfs_vars lprocfs_filter_obd_vars[] = {
{ "uuid", lprocfs_rd_uuid, 0, 0 },
{ "blocksize", lprocfs_rd_blksize, 0, 0 },
lprocfs_obd_wr_stale_export_age, 0},
{ "flush_stale_exports", 0, lprocfs_obd_wr_flush_stale_exports, 0 },
#endif
+ { "sync_journal", lprocfs_filter_rd_syncjournal,
+ lprocfs_filter_wr_syncjournal, 0 },
{ 0 }
};
static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2)
{
if (p1->flag != p2->flag) {
- unsigned mask = ~OBD_BRW_FROM_GRANT;
+ unsigned mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_ASYNC);
/* warn if we try to combine flags that we don't know to be
* safe to combine */
}
OBDO_FREE(aa->aa_oa);
} else { /* from async_internal() */
- int i;
+ obd_count i;
for (i = 0; i < aa->aa_page_count; i++)
osc_release_write_grant(aa->aa_cli, aa->aa_ppga[i], 1);
}
client_obd_list_unlock(&cli->cl_loi_list_lock);
osc_release_ppga(aa->aa_ppga, aa->aa_page_count);
+
RETURN(rc);
}
#include "ost_internal.h"
#ifdef LPROCFS
+/* Human-readable names for the sync-on-lock-cancel policy; indexed by the
+ * NEVER/BLOCKING/ALWAYS_SYNC_ON_CANCEL enum values. */
+static char *sync_on_cancel_states[] = {"never",
+ "blocking",
+ "always" };
+
+/* lprocfs read handler: print the current sync_on_lock_cancel policy as its
+ * state name rather than the raw integer. */
+int lprocfs_ost_rd_ost_sync_on_lock_cancel(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ struct obd_device *obd = data;
+ int rc;
+
+ rc = snprintf(page, count, "%s\n",
+ sync_on_cancel_states[obd->u.ost.ost_sync_on_lock_cancel]);
+ return rc;
+}
+
+/* lprocfs write handler: set the sync_on_lock_cancel policy either by state
+ * name ("never"/"blocking"/"always") or by numeric value 0-2.
+ * NOTE(review): 'buffer' is read with memcmp() before lprocfs_write_helper()
+ * performs its copy -- confirm 'buffer' is already a kernel-space copy at
+ * this call site, and that comparing strlen(state) bytes cannot over-read a
+ * 'count'-byte buffer shorter than the state name. */
+int lprocfs_ost_wr_ost_sync_on_lock_cancel(struct file *file,
+ const char *buffer,
+ unsigned long count, void *data)
+{
+ struct obd_device *obd = data;
+ int val = -1;
+ int i;
+
+ /* First try to match a symbolic state name as a prefix of the input. */
+ for (i = 0 ; i < NUM_SYNC_ON_CANCEL_STATES; i++) {
+ if (memcmp(buffer, sync_on_cancel_states[i],
+ strlen(sync_on_cancel_states[i])) == 0) {
+ val = i;
+ break;
+ }
+ }
+ /* Fall back to parsing a plain integer. */
+ if (val == -1) {
+ int rc;
+ rc = lprocfs_write_helper(buffer, count, &val);
+ if (rc)
+ return rc;
+ }
+
+ if (val < 0 || val > 2)
+ return -EINVAL;
+
+ obd->u.ost.ost_sync_on_lock_cancel = val;
+ return count;
+}
+
static struct lprocfs_vars lprocfs_ost_obd_vars[] = {
{ "uuid", lprocfs_rd_uuid, 0, 0 },
+ { "sync_on_lock_cancel", lprocfs_ost_rd_ost_sync_on_lock_cancel,
+ lprocfs_ost_wr_ost_sync_on_lock_cancel, 0 },
{ 0 }
};
}
#endif
+/* Ensure that data and metadata are synced to the disk when lock is cancelled
+ * (if requested).
+ *
+ * Wraps ldlm_server_blocking_ast(): when a PW/GROUP lock is being cancelled
+ * and the OST's sync_on_lock_cancel policy requires it (ALWAYS, or BLOCKING
+ * with a blocking callback pending), sync the lock's extent to disk first.
+ * The sync is best-effort -- failures (including obdo allocation failure)
+ * are logged but do not block the cancel. */
+int ost_blocking_ast(struct ldlm_lock *lock,
+ struct ldlm_lock_desc *desc,
+ void *data, int flag)
+{
+ struct obd_device *obd = lock->l_export->exp_obd;
+ if (flag == LDLM_CB_CANCELING &&
+ (lock->l_granted_mode & (LCK_PW|LCK_GROUP)) &&
+ (obd->u.ost.ost_sync_on_lock_cancel == ALWAYS_SYNC_ON_CANCEL ||
+ (obd->u.ost.ost_sync_on_lock_cancel == BLOCKING_SYNC_ON_CANCEL &&
+ lock->l_flags & LDLM_FL_CBPENDING))) {
+ struct obdo *oa;
+ int rc;
+
+ OBDO_ALLOC(oa);
+ if (oa == NULL) {
+ /* Allocation failure must not oops or stall the
+ * cancel; skip the sync. */
+ CERROR("cannot allocate obdo, sync on lock "
+ "cancel skipped\n");
+ } else {
+ /* Resource name[0] is the object id for OST
+ * extent locks. */
+ oa->o_id = lock->l_resource->lr_name.name[0];
+ oa->o_valid = OBD_MD_FLID;
+
+ rc = obd_sync(lock->l_export, oa, NULL,
+ lock->l_policy_data.l_extent.start,
+ lock->l_policy_data.l_extent.end);
+ if (rc)
+ CERROR("Error %d syncing data on lock "
+ "cancel\n", rc);
+
+ OBDO_FREE(oa);
+ }
+ }
+
+ return ldlm_server_blocking_ast(lock, desc, data, flag);
+}
+
static int ost_filter_recovery_request(struct ptlrpc_request *req,
struct obd_device *obd, int *process)
{
CDEBUG(D_INODE, "enqueue\n");
OBD_FAIL_RETURN(OBD_FAIL_LDLM_ENQUEUE, 0);
rc = ldlm_handle_enqueue(req, ldlm_server_completion_ast,
- ldlm_server_blocking_ast,
+ ost_blocking_ast,
ldlm_server_glimpse_ast);
fail = OBD_FAIL_OST_LDLM_REPLY_NET;
break;
sema_init(&ost->ost_health_sem, 1);
+ /* Always sync on lock cancel */
+ ost->ost_sync_on_lock_cancel = ALWAYS_SYNC_ON_CANCEL;
+
if (oss_num_threads) {
/* If oss_num_threads is set, it is the min and the max. */
if (oss_num_threads > OSS_THREADS_MAX)
}
#endif
+/* Policy values for ost_sync_on_lock_cancel: when to flush a lock's extent
+ * to disk as part of lock cancellation. NUM_SYNC_ON_CANCEL_STATES is the
+ * count of valid states, used for bounds checking. */
+enum {
+ NEVER_SYNC_ON_CANCEL = 0,
+ BLOCKING_SYNC_ON_CANCEL = 1,
+ ALWAYS_SYNC_ON_CANCEL = 2,
+ NUM_SYNC_ON_CANCEL_STATES
+};
+
#endif /* OST_INTERNAL_H */
desc->bd_nob += len;
+ cfs_page_pin(page);
ptlrpc_add_bulk_page(desc, page, pageoffset, len);
}
void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc)
{
+ int i;
ENTRY;
LASSERT(desc != NULL);
else
class_import_put(desc->bd_import);
+ for (i = 0; i < desc->bd_iov_count ; i++)
+ cfs_page_unpin(desc->bd_iov[i].kiov_page);
+
OBD_FREE(desc, offsetof(struct ptlrpc_bulk_desc,
bd_iov[desc->bd_max_iov]));
EXIT;
imp->imp_peer_committed_transno =
lustre_msg_get_last_committed(req->rq_repmsg);
ptlrpc_free_committed(imp);
+
+ if (req->rq_transno > imp->imp_peer_committed_transno)
+ ptlrpc_pinger_sending_on_import(imp);
spin_unlock(&imp->imp_lock);
}
ENTRY;
LASSERT(req->rq_import->imp_state == LUSTRE_IMP_REPLAY);
- /* Not handling automatic bulk replay yet (or ever?) */
- LASSERT(req->rq_bulk == NULL);
CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
aa = ptlrpc_req_async_args(req);
/* XXX Registering the same xid on retried bulk makes my head
* explode trying to understand how the original request's bulk
- * might interfere with the retried request -eeb */
- LASSERTF (!desc->bd_registered || req->rq_xid != desc->bd_last_xid,
- "registered: %d rq_xid: "LPU64" bd_last_xid: "LPU64"\n",
- desc->bd_registered, req->rq_xid, desc->bd_last_xid);
+ * might interfere with the retried request -eeb
+ * On the other hand replaying with the same xid is fine, since
+ * we are guaranteed old request have completed. -green */
+ LASSERTF(!(desc->bd_registered &&
+ req->rq_send_state != LUSTRE_IMP_REPLAY) ||
+ req->rq_xid != desc->bd_last_xid,
+ "registered: %d rq_xid: "LPU64" bd_last_xid: "LPU64"\n",
+ desc->bd_registered, req->rq_xid, desc->bd_last_xid);
desc->bd_registered = 1;
desc->bd_last_xid = req->rq_xid;
}
EXPORT_SYMBOL(ptlrpc_ping);
-void ptlrpc_update_next_ping(struct obd_import *imp)
+void ptlrpc_update_next_ping(struct obd_import *imp, int soon)
{
#ifdef ENABLE_PINGER
- int time = PING_INTERVAL;
+ int time = soon ? PING_INTERVAL_SHORT : PING_INTERVAL;
if (imp->imp_state == LUSTRE_IMP_DISCON) {
int dtime = max_t(int, CONNECTION_SWITCH_MIN,
AT_OFF ? 0 :
if (cfs_time_after(imp->imp_next_ping,
cfs_time_add(this_ping,
cfs_time_seconds(PING_INTERVAL))))
- ptlrpc_update_next_ping(imp);
+ ptlrpc_update_next_ping(imp, 0);
}
mutex_up(&pinger_sem);
/* update memory usage info */
void ptlrpc_pinger_sending_on_import(struct obd_import *imp)
{
- ptlrpc_update_next_ping(imp);
+ ptlrpc_update_next_ping(imp, 0);
+}
+
+void ptlrpc_pinger_commit_expected(struct obd_import *imp)
+{
+ ptlrpc_update_next_ping(imp, 1);
}
int ptlrpc_pinger_add_import(struct obd_import *imp)
/* if we add to pinger we want recovery on this import */
imp->imp_obd->obd_no_recov = 0;
- ptlrpc_update_next_ping(imp);
+ ptlrpc_update_next_ping(imp, 0);
/* XXX sort, blah blah */
list_add_tail(&imp->imp_pinger_chain, &pinger_imports);
class_import_get(imp);
{
#ifdef ENABLE_PINGER
mutex_down(&pinger_sem);
- ptlrpc_update_next_ping(imp);
+ ptlrpc_update_next_ping(imp, 0);
if (pinger_args.pd_set == NULL &&
time_before(imp->imp_next_ping, pinger_args.pd_next_ping)) {
CDEBUG(D_HA, "set next ping to "CFS_TIME_T"(cur "CFS_TIME_T")\n",
int ptlrpc_start_pinger(void);
int ptlrpc_stop_pinger(void);
void ptlrpc_pinger_sending_on_import(struct obd_import *imp);
+void ptlrpc_pinger_commit_expected(struct obd_import *imp);
void ptlrpc_pinger_wake_up(void);
void ptlrpc_ping_import_soon(struct obd_import *imp);
#ifdef __KERNEL__
}
run_test 73c "open(O_CREAT), unlink, replay, reconnect at last_replay, close"
+# Verify write replay: with journal sync disabled, write data, fail/recover
+# the OST, and check that the replayed writes produce identical file content.
+test_80() {
+ # Disable journal sync so write replies are sent before data hits disk
+ do_facet ost1 "lctl set_param -n obdfilter.${ost1_svc}.sync_journal 0"
+
+ replay_barrier ost1
+ lfs setstripe -i 0 -c 1 $DIR/$tfile
+ dd if=/dev/urandom of=$DIR/$tfile bs=1024k count=8 || error "Cannot write"
+ cksum=`md5sum $DIR/$tfile | awk '{print $1}'`
+ # Drop client locks so the read after failover goes to the server
+ cancel_lru_locks osc
+ fail ost1
+ dd if=$DIR/$tfile of=/dev/null bs=1024k count=8 || error "Cannot read"
+ cksum2=`md5sum $DIR/$tfile | awk '{print $1}'`
+ if [ $cksum != $cksum2 ] ; then
+ error "New checksum $cksum2 does not match original $cksum"
+ fi
+}
+
+test_80b() {
+ do_facet ost1 "lctl set_param -n obdfilter.${ost1_svc}.sync_journal 0"
+
+ replay_barrier ost1
+ lfs setstripe -i 0 -c 1 $DIR/$tfile
+ dd if=/dev/urandom of=$DIR/$tfile bs=1024k count=8 || error "Cannot write"
+ sleep 1 # Give it a chance to flush dirty data
+ echo TESTTEST | dd of=$DIR/$tfile bs=1 count=8 seek=64
+ cksum=`md5sum $DIR/$tfile | awk '{print $1}'`
+ cancel_lru_locks osc
+ fail ost1
+ dd if=$DIR/$tfile of=/dev/null bs=1024k count=8 || error "Cannot read"
+ cksum2=`md5sum $DIR/$tfile | awk '{print $1}'`
+ if [ $cksum != $cksum2 ] ; then
+ error "New checksum $cksum2 does not match original $cksum"
+ fi
+}
+run_test 80b "write replay with changed data (checksum resend)"
+
equals_msg `basename $0`: test complete, cleaning up
check_and_cleanup_lustre
[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true