From e464b9bbac8e0e900777a4e8a655e156fb901c5a Mon Sep 17 00:00:00 2001 From: Alex Zhuravlev Date: Wed, 14 Nov 2012 23:12:11 +0400 Subject: [PATCH] LU-2130 osp: wait until all the requests are processed There is a window between osp_sync_interpret() and osp_sync_request_commit_cb() where opd_syn_rpc_in_progress is not zero while opd_syn_rpc_in_flight can be zero. The assertion in osp_sync_thread() can hit this window, or osp_sync_request_commit_cb() can pin a request at the point where osp_sync_thread() has already stopped the processing - this would be a deadlock. With this patch osp_sync_thread() waits until all the requests are processed, periodically checking the list of committed requests. Signed-off-by: Alex Zhuravlev Change-Id: I409feb0bbb681e41bff1d41cb1232ef5ef1cbf37 Reviewed-on: http://review.whamcloud.com/4581 Tested-by: Hudson Tested-by: Maloo Reviewed-by: Mike Pershin Reviewed-by: Oleg Drokin --- lustre/osp/osp_sync.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/lustre/osp/osp_sync.c b/lustre/osp/osp_sync.c index 2f9cbd2..4df325a 100644 --- a/lustre/osp/osp_sync.c +++ b/lustre/osp/osp_sync.c @@ -316,12 +316,24 @@ int osp_sync_gap(const struct lu_env *env, struct osp_device *d, static void osp_sync_request_commit_cb(struct ptlrpc_request *req) { struct osp_device *d = req->rq_cb_data; + struct obd_import *imp = req->rq_import; CDEBUG(D_HA, "commit req %p, transno "LPU64"\n", req, req->rq_transno); if (unlikely(req->rq_transno == 0)) return; + if (unlikely(req->rq_transno > imp->imp_peer_committed_transno)) { + /* this request was aborted by the shutdown procedure, + * not committed by the peer. we should preserve llog + * record */ + cfs_spin_lock(&d->opd_syn_lock); + d->opd_syn_rpc_in_progress--; + cfs_spin_unlock(&d->opd_syn_lock); + cfs_waitq_signal(&d->opd_syn_waitq); + return; + } + /* XXX: what if request isn't committed for very long? 
*/ LASSERT(d); LASSERT(req->rq_svc_thread == (void *) OSP_JOB_MAGIC); @@ -865,7 +877,13 @@ static int osp_sync_thread(void *_arg) d->opd_syn_changes, d->opd_syn_rpc_in_progress, d->opd_syn_rpc_in_flight); - osp_sync_process_committed(&env, d); + /* wait till all the requests are completed */ + while (d->opd_syn_rpc_in_progress > 0) { + osp_sync_process_committed(&env, d); + l_wait_event(d->opd_syn_waitq, + d->opd_syn_rpc_in_progress == 0, + &lwi); + } llog_cat_close(&env, llh); rc = llog_cleanup(&env, ctxt); @@ -874,13 +892,6 @@ static int osp_sync_thread(void *_arg) out: thread->t_flags = SVC_STOPPED; - /* - * there might be a race between osp sync thread sending RPCs and - * import invalidation. this can result in RPCs being in ptlrpcd - * till this point. for safete reason let's wait till they are done - */ - l_wait_event(d->opd_syn_waitq, d->opd_syn_rpc_in_flight == 0, &lwi); - cfs_waitq_signal(&thread->t_ctl_waitq); LASSERTF(d->opd_syn_rpc_in_progress == 0, "%s: %d %d %sempty\n", -- 1.8.3.1