Whamcloud - gitweb
LU-1039 ptlrpc: handle bulk IO errors correctly.
authorEmoly Liu <emoly.liu@intel.com>
Fri, 9 Nov 2012 06:51:29 +0000 (14:51 +0800)
committerOleg Drokin <green@whamcloud.com>
Wed, 21 Nov 2012 05:37:09 +0000 (00:37 -0500)
Don't panic on incorrect bulk transfer,
correctly handle a bulk request reorder.

LustreError: 12927:0:(client.c:1696:ptlrpc_check_set())LBUG
Pid: 12927, comm: ptlrpcd-brw

Call Trace:
[<ffffffffa083c865>] libcfs_debug_dumpstack+0x55/0x80 [libcfs]
[<ffffffffa083cea5>] lbug_with_loc+0x75/0xe0 [libcfs]
[<ffffffffa0e2edd6>] ptlrpc_check_set+0x1a66/0x1b90 [ptlrpc]
[<ffffffffa0e61006>] ? ptlrpcd_check+0x46/0x290 [ptlrpc]
[<ffffffffa0e61006>] ? ptlrpcd_check+0x46/0x290 [ptlrpc]
[<ffffffffa0e611b8>] ptlrpcd_check+0x1f8/0x290 [ptlrpc]
[<ffffffffa0e6157b>] ptlrpcd+0x32b/0x3b0 [ptlrpc]
[<ffffffff81056388>] ? finish_task_switch+0x48/0x110
[<ffffffff8105f500>] ? default_wake_function+0x0/0x20
[<ffffffffa0e61250>] ? ptlrpcd+0x0/0x3b0 [ptlrpc]
[<ffffffff8100c2ca>] child_rip+0xa/0x20
[<ffffffff81500b70>] ? _spin_unlock_irq+0x30/0x40
[<ffffffff8100bc10>] ? restore_args+0x0/0x30
[<ffffffffa0e61250>] ? ptlrpcd+0x0/0x3b0 [ptlrpc]
[<ffffffff8100c2c0>] ? child_rip+0x0/0x20

port of master patch c9590221dc43dd5e7a7ede389f0a7d9cf566e5bf

Xyratex-bug-id: MRP-303
Signed-off-by: Alexey Lyashkov <alexey_lyashkov@xyratex.com>
Signed-off-by: Oleg Drokin <green@whamcloud.com>
Signed-off-by: Liu Ying <emoly.liu@intel.com>
Change-Id: Ie6d6c0f024605b2c6a7302cb5c424610b098f3e2
Reviewed-on: http://review.whamcloud.com/4499
Tested-by: Hudson
Tested-by: Maloo <whamcloud.maloo@gmail.com>
lustre/include/obd_support.h
lustre/ost/ost_handler.c
lustre/ptlrpc/client.c
lustre/ptlrpc/events.c
lustre/tests/sanity.sh

index 87557b4..c6996d8 100644 (file)
@@ -348,6 +348,7 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type,
 #define OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT  0x512
 #define OBD_FAIL_PTLRPC_DROP_REQ_OPC     0x513
 #define OBD_FAIL_PTLRPC_FINISH_REPLAY    0x514
+#define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2  0x515
 
 #define OBD_FAIL_OBD_PING_NET            0x600
 #define OBD_FAIL_OBD_LOG_CANCEL_NET      0x601
index 1541a12..e31e254 100644 (file)
@@ -784,7 +784,8 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
         /* Check if client was evicted while we were doing i/o before touching
            network */
         if (rc == 0) {
-                rc = target_bulk_io(exp, desc, &lwi);
+                if (likely(!CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2)))
+                        rc = target_bulk_io(exp, desc, &lwi);
                 no_reply = rc != 0;
         }
 
@@ -801,7 +802,7 @@ out_lock:
 out_tls:
         ost_tls_put(req);
 out_bulk:
-        if (desc)
+        if (desc && !CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2))
                 ptlrpc_free_bulk(desc);
 out:
         LASSERT(rc <= 0);
@@ -824,6 +825,20 @@ out:
                               obd_uuid2str(&exp->exp_client_uuid),
                               obd_export_nid2str(exp), rc);
         }
+        /* send a bulk after reply to simulate a network delay or reordering
+         * by a router */
+        if (unlikely(CFS_FAIL_PRECHECK(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2))) {
+                cfs_waitq_t              waitq;
+                struct l_wait_info       lwi1;
+
+                CDEBUG(D_INFO, "reorder BULK\n");
+                cfs_waitq_init(&waitq);
+
+                lwi1 = LWI_TIMEOUT_INTR(cfs_time_seconds(3), NULL, NULL, NULL);
+                l_wait_event(waitq, 0, &lwi1);
+                rc = target_bulk_io(exp, desc, &lwi);
+                ptlrpc_free_bulk(desc);
+        }
 
         RETURN(rc);
 }
index 4f65709..c233cf4 100644 (file)
@@ -1682,7 +1682,7 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
                          * process the reply. Similarly if the RPC returned
                          * an error, and therefore the bulk will never arrive.
                          */
-                        if (req->rq_bulk == NULL || req->rq_status != 0) {
+                        if (req->rq_bulk == NULL || req->rq_status < 0) {
                                 ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
                                 GOTO(interpret, req->rq_status);
                         }
@@ -1700,7 +1700,7 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
                          * was good after getting the REPLY for her GET or
                          * the ACK for her PUT. */
                         DEBUG_REQ(D_ERROR, req, "bulk transfer failed");
-                        LBUG();
+                        req->rq_status = -EIO;
                 }
 
                 ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET);
index 5c37070..6cb8e8e 100644 (file)
@@ -189,6 +189,12 @@ void client_bulk_callback (lnet_event_t *ev)
                  ev->type == LNET_EVENT_UNLINK);
         LASSERT (ev->unlinked);
 
+        if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB, CFS_FAIL_ONCE))
+                ev->status = -EIO;
+
+        if (CFS_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_CLIENT_BULK_CB2,CFS_FAIL_ONCE))
+                ev->status = -EIO;
+
         CDEBUG((ev->status == 0) ? D_NET : D_ERROR,
                "event type %d, status %d, desc %p\n",
                ev->type, ev->status, desc);
index 47330e6..bc1c1b9 100644 (file)
@@ -8513,6 +8513,26 @@ test_221() {
 }
 run_test 221 "make sure fault and truncate race to not cause OOM"
 
+test_222a() { # LU-1039, MRP-303
+        #define OBD_FAIL_PTLRPC_CLIENT_BULK_CB   0x508
+        $LCTL set_param fail_loc=0x508
+        dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1 conv=fsync
+        $LCTL set_param fail_loc=0
+        df $DIR
+}
+run_test 222a "Don't panic on bulk IO failure"
+
+test_222b() { # LU-1039, MRP-303
+        dd if=/dev/zero of=$DIR/$tfile bs=4096 count=1
+        cancel_lru_locks osc
+        #define OBD_FAIL_PTLRPC_CLIENT_BULK_CB2   0x515
+        $LCTL set_param fail_loc=0x515
+        dd of=/dev/null if=$DIR/$tfile bs=4096 count=1
+        $LCTL set_param fail_loc=0
+        df $DIR
+}
+run_test 222b "Don't panic on bulk IO failure"
+
 #
 # tests that do cleanup/setup should be run at the end
 #