Whamcloud - gitweb
LU-1039 ptlrpc: handle bulk IO errors correctly.
author Alexey Lyashkov <alexey_lyashkov@xyratex.com>
Thu, 26 Jan 2012 12:39:12 +0000 (16:39 +0400)
committer Oleg Drokin <green@whamcloud.com>
Fri, 2 Mar 2012 17:05:12 +0000 (12:05 -0500)
Don't panic (LBUG) on a failed bulk transfer; fail the request with -EIO instead.
Correctly handle a bulk transfer that is reordered to arrive after the reply.

LustreError: 12927:0:(client.c:1696:ptlrpc_check_set()) LBUG
Pid: 12927, comm: ptlrpcd-brw

Call Trace:
[<ffffffffa083c865>] libcfs_debug_dumpstack+0x55/0x80 [libcfs]
[<ffffffffa083cea5>] lbug_with_loc+0x75/0xe0 [libcfs]
[<ffffffffa0e2edd6>] ptlrpc_check_set+0x1a66/0x1b90 [ptlrpc]
[<ffffffffa0e61006>] ? ptlrpcd_check+0x46/0x290 [ptlrpc]
[<ffffffffa0e61006>] ? ptlrpcd_check+0x46/0x290 [ptlrpc]
[<ffffffffa0e611b8>] ptlrpcd_check+0x1f8/0x290 [ptlrpc]
[<ffffffffa0e6157b>] ptlrpcd+0x32b/0x3b0 [ptlrpc]
[<ffffffff81056388>] ? finish_task_switch+0x48/0x110
[<ffffffff8105f500>] ? default_wake_function+0x0/0x20
[<ffffffffa0e61250>] ? ptlrpcd+0x0/0x3b0 [ptlrpc]
[<ffffffff8100c2ca>] child_rip+0xa/0x20
[<ffffffff81500b70>] ? _spin_unlock_irq+0x30/0x40
[<ffffffff8100bc10>] ? restore_args+0x0/0x30
[<ffffffffa0e61250>] ? ptlrpcd+0x0/0x3b0 [ptlrpc]
[<ffffffff8100c2c0>] ? child_rip+0x0/0x20

Xyratex-bug-id: MRP-303
Change-Id: Ibb19c33e92dbecf5d029c7e6c567f65fb764f444
Signed-off-by: Alexey Lyashkov <alexey_lyashkov@xyratex.com>
Signed-off-by: Oleg Drokin <green@whamcloud.com>
Reviewed-on: http://review.whamcloud.com/2023
Tested-by: Hudson
Tested-by: Maloo <whamcloud.maloo@gmail.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Reviewed-by: Johann Lombardi <johann@whamcloud.com>
lustre/include/obd_support.h
lustre/ost/ost_handler.c
lustre/ptlrpc/client.c
lustre/ptlrpc/events.c
lustre/tests/sanity.sh

index 0b29238..db9e46f 100644 (file)
@@ -359,6 +359,7 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type,
 #define OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT  0x512
 #define OBD_FAIL_PTLRPC_DROP_REQ_OPC     0x513
 #define OBD_FAIL_PTLRPC_FINISH_REPLAY    0x514
+#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2  0x515
 
 #define OBD_FAIL_OBD_PING_NET            0x600
 #define OBD_FAIL_OBD_LOG_CANCEL_NET      0x601
index 4cd4b07..cbe543b 100644 (file)
@@ -783,7 +783,8 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
         /* Check if client was evicted while we were doing i/o before touching
            network */
         if (rc == 0) {
-                rc = target_bulk_io(exp, desc, &lwi);
+                if (likely(!CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)))
+                        rc = target_bulk_io(exp, desc, &lwi);
                 no_reply = rc != 0;
         }
 
@@ -800,7 +801,7 @@ out_lock:
 out_tls:
         ost_tls_put(req);
 out_bulk:
-        if (desc)
+        if (desc && !CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2))
                 ptlrpc_free_bulk(desc);
 out:
         LASSERT(rc <= 0);
@@ -824,6 +825,20 @@ out:
                       exp->exp_connection->c_remote_uuid.uuid,
                       libcfs_id2str(req->rq_peer));
         }
+        /* send a bulk after reply to simulate a network delay or reordering
+         * by a router */
+        if (unlikely(CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2))) {
+                cfs_waitq_t              waitq;
+                struct l_wait_info       lwi1;
+
+                CDEBUG(D_INFO, "reorder BULK\n");
+                cfs_waitq_init(&waitq);
+
+                lwi1 = LWI_TIMEOUT_INTR(cfs_time_seconds(3), NULL, NULL, NULL);
+                l_wait_event(waitq, 0, &lwi1);
+                rc = target_bulk_io(exp, desc, &lwi);
+                ptlrpc_free_bulk(desc);
+        }
 
         RETURN(rc);
 }
index 0a2ab28..30c7cfe 100644 (file)
@@ -1690,7 +1690,7 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
                          * process the reply. Similarly if the RPC returned
                          * an error, and therefore the bulk will never arrive.
                          */
-                        if (req->rq_bulk == NULL || req->rq_status != 0) {
+                        if (req->rq_bulk == NULL || req->rq_status < 0) {
                                 ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
                                 GOTO(interpret, req->rq_status);
                         }
@@ -1708,7 +1708,7 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
                          * was good after getting the REPLY for her GET or
                          * the ACK for her PUT. */
                         DEBUG_REQ(D_ERROR, req, "bulk transfer failed");
-                        LBUG();
+                        req->rq_status = -EIO;
                 }
 
                 ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
index 5c37070..6cb8e8e 100644 (file)
@@ -189,6 +189,12 @@ void client_bulk_callback (lnet_event_t *ev)
                  ev->type == LNET_EVENT_UNLINK);
         LASSERT (ev->unlinked);
 
+        if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB, CFS_FAIL_ONCE))
+                ev->status = -EIO;
+
+        if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2,CFS_FAIL_ONCE))
+                ev->status = -EIO;
+
         CDEBUG((ev->status == 0) ? D_NET : D_ERROR,
                "event type %d, status %d, desc %p\n",
                ev->type, ev->status, desc);
index 7aa65ed..fae6a60 100644 (file)
@@ -8586,6 +8586,26 @@ test_223 () {
 }
 run_test 223 "osc reenqueue if without AGL lock granted ======================="
 
+test_224a() { # LU-1039, MRP-303
+        #define OBD_FAIL_PTLRPC_CLIENT_BULK_CB   0x508
+        $LCTL set_param fail_loc=0x508
+        dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1 conv=fsync
+        $LCTL set_param fail_loc=0
+        df $DIR
+}
+run_test 224a "Don't panic on bulk IO failure"
+
+test_224b() { # LU-1039, MRP-303
+        dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1
+        cancel_lru_locks osc
+        #define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2   0x515
+        $LCTL set_param fail_loc=0x515
+        dd of=/dev/null if=$DIR/$tfile bs=4096 count=1
+        $LCTL set_param fail_loc=0
+        df $DIR
+}
+run_test 224b "Don't panic on bulk IO failure"
+
 #
 # tests that do cleanup/setup should be run at the end
 #