Whamcloud - gitweb
LU-1565 ptlrpc: resend CANCEL rpc
author Vitaly Fertman <vitaly_fertman@xyratex.com>
Tue, 6 Nov 2012 19:09:47 +0000 (23:09 +0400)
committer Oleg Drokin <green@whamcloud.com>
Thu, 29 Nov 2012 00:28:53 +0000 (19:28 -0500)
it is better to deliver CANCEL rpc to server reliably in the case of:
    RPC timeout, re-connect, CANCEL resend
because server may have sent BL AST and is waiting for this CANCEL.
this avoids possible idle time on server and later client evictions.

A CANCEL always has an up-to-date lock handle, or else neither the enqueue
nor the cancel is replayed on recovery, with the exception of the case of:
    BL AST is sent; recovery starts, lock is re-enqueued, BL AST comes
    to client, cancel is created, recovery ends (lock handle has
    changed), CANCEL is sent, its reply gets ESTALE as the lock handle is
    not updated in the RPC.
this case is left unfixed and still may result in lock callback
timeout and client eviction, but this race window is much much shorter
than the target case being fixed by this fix.

Also remove lock cancelling from client_common_put_super() as it is
done later in client_disconnect_export().

Change-Id: I1bfe70444299d93c3fb348b737cb9721ea63eda3
Signed-off-by: Vitaly Fertman <vitaly_fertman@xyratex.com>
Reviewed-by: Alexey Lyashkov <alexey_lyashkov@xyratex.com>
Reviewed-by: Andrew Perepechko <Andrew_Perepechko@xyratex.com>
Xyratex-bug-id: MRP-477
Reviewed-on: http://review.whamcloud.com/3189
Tested-by: Hudson
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Mike Pershin <tappro@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/obd_support.h
lustre/ldlm/ldlm_lockd.c
lustre/ldlm/ldlm_request.c
lustre/llite/llite_lib.c
lustre/ptlrpc/service.c
lustre/tests/recovery-small.sh

index d7bf1ac..56485d5 100644 (file)
@@ -366,6 +366,7 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type,
 #define OBD_FAIL_PTLRPC_FINISH_REPLAY    0x514
 #define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2  0x515
 #define OBD_FAIL_PTLRPC_DELAY_IMP_FULL   0x516
+#define OBD_FAIL_PTLRPC_CANCEL_RESEND    0x517
 
 #define OBD_FAIL_OBD_PING_NET            0x600
 #define OBD_FAIL_OBD_LOG_CANCEL_NET      0x601
index fabb223..52df532 100644 (file)
@@ -2209,7 +2209,8 @@ static int ldlm_cancel_handler(struct ptlrpc_request *req)
         case LDLM_CANCEL:
                 req_capsule_set(&req->rq_pill, &RQF_LDLM_CANCEL);
                 CDEBUG(D_INODE, "cancel\n");
-               if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_NET))
+               if (CFS_FAIL_CHECK(OBD_FAIL_LDLM_CANCEL_NET) ||
+                   CFS_FAIL_CHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))
                        RETURN(0);
                 rc = ldlm_handle_cancel(req);
                 if (rc)
index 81c2a15..1435d0a 100644 (file)
@@ -1151,8 +1151,6 @@ int ldlm_cli_cancel_req(struct obd_export *exp, cfs_list_t *cancels,
                         ptlrpc_request_free(req);
                         GOTO(out, rc);
                 }
-                req->rq_no_resend = 1;
-                req->rq_no_delay = 1;
 
                 req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL;
                 req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL;
index baddd60..5ee8fb1 100644 (file)
@@ -672,8 +672,6 @@ void client_common_put_super(struct super_block *sb)
         }
 #endif
 
-        obd_cancel_unused(sbi->ll_dt_exp, NULL, 0, NULL);
-
         ll_close_thread_shutdown(sbi->ll_lcq);
 
         cl_sb_fini(sb);
index 168cd4b..97fead8 100644 (file)
@@ -1598,10 +1598,20 @@ static int ptlrpc_server_request_add(struct ptlrpc_service_part *svcpt,
 static int ptlrpc_server_allow_high(struct ptlrpc_service_part *svcpt,
                                    int force)
 {
+       int running = svcpt->scp_nthrs_running;
+
        if (force)
                return 1;
 
-       if (svcpt->scp_nreqs_active >= svcpt->scp_nthrs_running - 1)
+       if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL &&
+                    CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) {
+               /* leave just 1 thread for normal RPCs */
+               running = PTLRPC_NTHRS_INIT;
+               if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL)
+                       running += 1;
+       }
+
+       if (svcpt->scp_nreqs_active >= running - 1)
                return 0;
 
        if (svcpt->scp_nhreqs_active == 0)
@@ -1630,15 +1640,24 @@ static int ptlrpc_server_high_pending(struct ptlrpc_service_part *svcpt,
 static int ptlrpc_server_allow_normal(struct ptlrpc_service_part *svcpt,
                                      int force)
 {
+       int running = svcpt->scp_nthrs_running;
 #ifndef __KERNEL__
        if (1) /* always allow to handle normal request for liblustre */
                return 1;
 #endif
+       if (unlikely(svcpt->scp_service->srv_req_portal == MDS_REQUEST_PORTAL &&
+                    CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CANCEL_RESEND))) {
+               /* leave just 1 thread for normal RPCs */
+               running = PTLRPC_NTHRS_INIT;
+               if (svcpt->scp_service->srv_ops.so_hpreq_handler != NULL)
+                       running += 1;
+       }
+
        if (force ||
-           svcpt->scp_nreqs_active < svcpt->scp_nthrs_running - 2)
+           svcpt->scp_nreqs_active < running - 2)
                return 1;
 
-       if (svcpt->scp_nreqs_active >= svcpt->scp_nthrs_running - 1)
+       if (svcpt->scp_nreqs_active >= running - 1)
                return 0;
 
        return svcpt->scp_nhreqs_active > 0 ||
index bd261b3..804e055 100755 (executable)
@@ -446,6 +446,36 @@ test_19b() {
 }
 run_test 19b "test expired_lock_main on ost (2867)"
 
+test_19c() {
+       local BEFORE=`date +%s`
+
+       mount_client $DIR2
+       $LCTL set_param ldlm.namespaces.*.early_lock_cancel=0
+
+       mkdir -p $DIR1/$tfile
+       stat $DIR1/$tfile
+
+#define OBD_FAIL_PTLRPC_CANCEL_RESEND 0x516
+       do_facet mds $LCTL set_param fail_loc=0x80000516
+
+       touch $DIR2/$tfile/file1 &
+       PID1=$!
+       # let touch to get blocked on the server
+       sleep 2
+
+       wait $PID1
+       $LCTL set_param ldlm.namespaces.*.early_lock_cancel=1
+       umount_client $DIR2
+
+       # let the client reconnect
+       sleep 5
+       EVICT=$(do_facet client $LCTL get_param mdc.$FSNAME-MDT*.state |
+          awk -F"[ [,]" '/EVICTED]$/ { if (mx<$4) {mx=$4;} } END { print mx }')
+
+       [ -z "$EVICT" ] || [[ $EVICT -le $BEFORE ]] || error "eviction happened"
+}
+run_test 19c "check reconnect and lock resend do not trigger expired_lock_main"
+
 test_20a() {   # bug 2983 - ldlm_handle_enqueue cleanup
        remote_ost_nodsh && skip "remote OST with nodsh" && return 0