Whamcloud - gitweb
LU-5581 ldlm: evict clients returning errors on ASTs 52/11752/5
authorAlexey Lyashkov <alexey_lyashkov@xyratex.com>
Sun, 2 Nov 2014 16:53:48 +0000 (11:53 -0500)
committerOleg Drokin <oleg.drokin@intel.com>
Thu, 4 Dec 2014 17:53:02 +0000 (17:53 +0000)
When a client returns an error other then EINVAL replying to blocking
ast, it is unsafe to cancel the lock on server side only, because the
client may continue its I/O assuming it still owns the lock while the
real lock may be granted already to another client.

In only valid error case when client replied to AST with EINVAL cancel
the lock and return ERESTART, evict the client in any other error
case.

Xyratex-bug-id: MRP-2041
Signed-off-by: Alexey Lyashkov <alexey_lyashkov@xyratex.com>
Change-Id: Ibce60ce3b2c24ba388155ac49cba8f20388893e7
Reviewed-on: http://review.whamcloud.com/11752
Tested-by: Jenkins
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: James Simmons <uja.ornl@gmail.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
lustre/ldlm/ldlm_lockd.c
lustre/tests/recovery-small.sh
lustre/tests/test-framework.sh

index 2f49844..3a569c9 100644 (file)
@@ -629,57 +629,57 @@ static int ldlm_handle_ast_error(struct ldlm_lock *lock,
                                  struct ptlrpc_request *req, int rc,
                                  const char *ast_type)
 {
                                  struct ptlrpc_request *req, int rc,
                                  const char *ast_type)
 {
-        lnet_process_id_t peer = req->rq_import->imp_connection->c_peer;
-
-        if (rc == -ETIMEDOUT || rc == -EINTR || rc == -ENOTCONN) {
-                LASSERT(lock->l_export);
-                if (lock->l_export->exp_libclient) {
-                        LDLM_DEBUG(lock, "%s AST to liblustre client (nid %s)"
-                                   " timeout, just cancelling lock", ast_type,
-                                   libcfs_nid2str(peer.nid));
-                        ldlm_lock_cancel(lock);
-                        rc = -ERESTART;
-               } else if (ldlm_is_cancel(lock)) {
-                        LDLM_DEBUG(lock, "%s AST timeout from nid %s, but "
-                                   "cancel was received (AST reply lost?)",
-                                   ast_type, libcfs_nid2str(peer.nid));
-                        ldlm_lock_cancel(lock);
-                        rc = -ERESTART;
-                } else {
-                        ldlm_del_waiting_lock(lock);
-                        ldlm_failed_ast(lock, rc, ast_type);
-                }
-        } else if (rc) {
-                if (rc == -EINVAL) {
-                        struct ldlm_resource *res = lock->l_resource;
-                        LDLM_DEBUG(lock, "client (nid %s) returned %d"
-                               " from %s AST - normal race",
-                               libcfs_nid2str(peer.nid),
-                               req->rq_repmsg ?
-                               lustre_msg_get_status(req->rq_repmsg) : -1,
-                               ast_type);
-                        if (res) {
-                                /* update lvbo to return proper attributes.
-                                 * see bug 23174 */
-                                ldlm_resource_getref(res);
-                                ldlm_res_lvbo_update(res, NULL, 1);
-                                ldlm_resource_putref(res);
-                        }
+       lnet_process_id_t peer = req->rq_import->imp_connection->c_peer;
 
 
-                } else {
-                       LDLM_ERROR(lock, "client (nid %s) returned %d: rc = %d "
-                                  "from %s AST", libcfs_nid2str(peer.nid),
+       if (!req->rq_replied || (rc && rc != -EINVAL)) {
+               if (lock->l_export && lock->l_export->exp_libclient) {
+                       LDLM_DEBUG(lock, "%s AST to liblustre client (nid %s)"
+                                  " timeout, just cancelling lock", ast_type,
+                                  libcfs_nid2str(peer.nid));
+                       ldlm_lock_cancel(lock);
+                       rc = -ERESTART;
+               } else if (ldlm_is_cancel(lock)) {
+                       LDLM_DEBUG(lock, "%s AST timeout from nid %s, but "
+                                  "cancel was received (AST reply lost?)",
+                                  ast_type, libcfs_nid2str(peer.nid));
+                       ldlm_lock_cancel(lock);
+                       rc = -ERESTART;
+               } else {
+                       LDLM_ERROR(lock, "client (nid %s) %s %s AST "
+                                  "(req status %d rc %d), evict it",
+                                  libcfs_nid2str(peer.nid),
+                                  req->rq_replied ? "returned error from" :
+                                  "failed to reply to",
+                                  ast_type,
                                   (req->rq_repmsg != NULL) ?
                                   lustre_msg_get_status(req->rq_repmsg) : 0,
                                   (req->rq_repmsg != NULL) ?
                                   lustre_msg_get_status(req->rq_repmsg) : 0,
-                                  rc, ast_type);
-                }
-                ldlm_lock_cancel(lock);
-                /* Server-side AST functions are called from ldlm_reprocess_all,
-                 * which needs to be told to please restart its reprocessing. */
-                rc = -ERESTART;
-        }
+                                  rc);
+                       ldlm_failed_ast(lock, rc, ast_type);
+               }
+               return rc;
+       }
 
 
-        return rc;
+       if (rc == -EINVAL) {
+               struct ldlm_resource *res = lock->l_resource;
+
+               LDLM_DEBUG(lock, "client (nid %s) returned %d"
+                          " from %s AST - normal race",
+                          libcfs_nid2str(peer.nid),
+                          req->rq_repmsg ?
+                          lustre_msg_get_status(req->rq_repmsg) : -1,
+                          ast_type);
+               if (res) {
+                       /* update lvbo to return proper attributes.
+                        * see bug 23174 */
+                       ldlm_resource_getref(res);
+                       ldlm_res_lvbo_update(res, NULL, 1);
+                       ldlm_resource_putref(res);
+               }
+               ldlm_lock_cancel(lock);
+               rc = -ERESTART;
+       }
+
+       return rc;
 }
 
 static int ldlm_cb_interpret(const struct lu_env *env,
 }
 
 static int ldlm_cb_interpret(const struct lu_env *env,
@@ -2165,8 +2165,11 @@ static int ldlm_callback_handler(struct ptlrpc_request *req)
 
        switch (lustre_msg_get_opc(req->rq_reqmsg)) {
        case LDLM_BL_CALLBACK:
 
        switch (lustre_msg_get_opc(req->rq_reqmsg)) {
        case LDLM_BL_CALLBACK:
-               if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET))
+               if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_BL_CALLBACK_NET)) {
+                       if (cfs_fail_err)
+                               ldlm_callback_reply(req, -(int)cfs_fail_err);
                        RETURN(0);
                        RETURN(0);
+               }
                break;
        case LDLM_CP_CALLBACK:
                if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CP_CALLBACK_NET))
                break;
        case LDLM_CP_CALLBACK:
                if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CP_CALLBACK_NET))
index 13a0f3e..0573e30 100755 (executable)
@@ -196,6 +196,41 @@ test_10b() {
 }
 run_test 10b "re-send BL AST"
 
 }
 run_test 10b "re-send BL AST"
 
+test_10d() {
+       local before=$(date +%s)
+       local evict
+       # sleep 1 is to make sure that BEFORE is not equal to EVICTED below
+       sleep 1
+       rm -f $TMP/$tfile
+       echo -n ", world" | dd of=$TMP/$tfile bs=1c seek=5
+
+       mount_client $MOUNT2
+
+       $LFS setstripe -i 0 -c 1 $DIR1/$tfile
+       echo -n hello > $DIR1/$tfile
+
+       stat $DIR2/$tfile >& /dev/null
+       $LCTL set_param fail_err=71
+       drop_bl_callback_once "echo -n \\\", world\\\" >> $DIR2/$tfile"
+
+       client_reconnect
+
+       cmp $DIR1/$tfile $DIR2/$tfile || error "file contents differ"
+       cmp $DIR1/$tfile $TMP/$tfile || error "wrong content found"
+
+       evict=$(do_facet client $LCTL get_param osc.$FSNAME-OST0000*.state | \
+               tr -d '\-\[\] ' | \
+         awk -F"[ [,]" '/EVICTED$/ { if (mx<$1) {mx=$1;} } END { print mx }')
+
+       [[ $evict -gt $before ]] ||
+               (do_facet client $LCTL get_param osc.$FSNAME-OST0000*.state;
+                   error "no eviction: $evict before:$before")
+
+       rm $TMP/$tfile
+       umount_client $MOUNT2
+}
+run_test 10d "test failed blocking ast"
+
 #bug 2460
 # wake up a thread waiting for completion after eviction
 test_11(){
 #bug 2460
 # wake up a thread waiting for completion after eviction
 test_11(){
index 18ebafa..43d0b93 100755 (executable)
@@ -4474,12 +4474,13 @@ drop_ldlm_cancel() {
 }
 
 drop_bl_callback_once() {
 }
 
 drop_bl_callback_once() {
-       rc=0
+       local rc=0
        do_facet client lctl set_param ldlm.namespaces.*.early_lock_cancel=0
 #define OBD_FAIL_LDLM_BL_CALLBACK_NET                  0x305
        do_facet client lctl set_param fail_loc=0x80000305
        do_facet client "$@" || rc=$?
        do_facet client lctl set_param fail_loc=0
        do_facet client lctl set_param ldlm.namespaces.*.early_lock_cancel=0
 #define OBD_FAIL_LDLM_BL_CALLBACK_NET                  0x305
        do_facet client lctl set_param fail_loc=0x80000305
        do_facet client "$@" || rc=$?
        do_facet client lctl set_param fail_loc=0
+       do_facet client lctl set_param fail_val=0
        do_facet client lctl set_param ldlm.namespaces.*.early_lock_cancel=1
        return $rc
 }
        do_facet client lctl set_param ldlm.namespaces.*.early_lock_cancel=1
        return $rc
 }
@@ -4491,6 +4492,7 @@ drop_bl_callback() {
        do_facet client lctl set_param fail_loc=0x305
        do_facet client "$@" || rc=$?
        do_facet client lctl set_param fail_loc=0
        do_facet client lctl set_param fail_loc=0x305
        do_facet client "$@" || rc=$?
        do_facet client lctl set_param fail_loc=0
+       do_facet client lctl set_param fail_val=0
        do_facet client lctl set_param ldlm.namespaces.*.early_lock_cancel=1
        return $rc
 }
        do_facet client lctl set_param ldlm.namespaces.*.early_lock_cancel=1
        return $rc
 }