Whamcloud - gitweb
LU-2130 osp: wait until all the requests are processed
author Alex Zhuravlev <alexey.zhuravlev@intel.com>
Wed, 14 Nov 2012 19:12:11 +0000 (23:12 +0400)
committer Oleg Drokin <green@whamcloud.com>
Sun, 18 Nov 2012 20:33:55 +0000 (15:33 -0500)
There is a window between osp_sync_interpret() and
osp_sync_request_commit_cb() in which opd_syn_rpc_in_progress
is non-zero while opd_syn_rpc_in_flight can already be zero.
The assertion in osp_sync_thread() can hit this window, or
osp_sync_request_commit_cb() can pin a request at a point
where osp_sync_thread() has already stopped processing,
which would result in a deadlock.

With this patch, osp_sync_thread() waits until all
the requests are processed, checking the list of committed
requests from time to time.

Signed-off-by: Alex Zhuravlev <alexey.zhuravlev@intel.com>
Change-Id: I409feb0bbb681e41bff1d41cb1232ef5ef1cbf37
Reviewed-on: http://review.whamcloud.com/4581
Tested-by: Hudson
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Mike Pershin <tappro@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/osp/osp_sync.c

index 2f9cbd2..4df325a 100644 (file)
@@ -316,12 +316,24 @@ int osp_sync_gap(const struct lu_env *env, struct osp_device *d,
 static void osp_sync_request_commit_cb(struct ptlrpc_request *req)
 {
        struct osp_device *d = req->rq_cb_data;
+       struct obd_import *imp = req->rq_import;
 
        CDEBUG(D_HA, "commit req %p, transno "LPU64"\n", req, req->rq_transno);
 
        if (unlikely(req->rq_transno == 0))
                return;
 
+       if (unlikely(req->rq_transno > imp->imp_peer_committed_transno)) {
+               /* this request was aborted by the shutdown procedure,
+                * not committed by the peer.  we should preserve llog
+                * record */
+               cfs_spin_lock(&d->opd_syn_lock);
+               d->opd_syn_rpc_in_progress--;
+               cfs_spin_unlock(&d->opd_syn_lock);
+               cfs_waitq_signal(&d->opd_syn_waitq);
+               return;
+       }
+
        /* XXX: what if request isn't committed for very long? */
        LASSERT(d);
        LASSERT(req->rq_svc_thread == (void *) OSP_JOB_MAGIC);
@@ -865,7 +877,13 @@ static int osp_sync_thread(void *_arg)
                 d->opd_syn_changes, d->opd_syn_rpc_in_progress,
                 d->opd_syn_rpc_in_flight);
 
-       osp_sync_process_committed(&env, d);
+       /* wait till all the requests are completed */
+       while (d->opd_syn_rpc_in_progress > 0) {
+               osp_sync_process_committed(&env, d);
+               l_wait_event(d->opd_syn_waitq,
+                            d->opd_syn_rpc_in_progress == 0,
+                            &lwi);
+       }
 
        llog_cat_close(&env, llh);
        rc = llog_cleanup(&env, ctxt);
@@ -874,13 +892,6 @@ static int osp_sync_thread(void *_arg)
 out:
        thread->t_flags = SVC_STOPPED;
 
-       /*
-        * there might be a race between osp sync thread sending RPCs and
-        * import invalidation. this can result in RPCs being in ptlrpcd
-        * till this point. for safete reason let's wait till they are done
-        */
-       l_wait_event(d->opd_syn_waitq, d->opd_syn_rpc_in_flight == 0, &lwi);
-
        cfs_waitq_signal(&thread->t_ctl_waitq);
        LASSERTF(d->opd_syn_rpc_in_progress == 0,
                 "%s: %d %d %sempty\n",