Whamcloud - gitweb
Branch b1_6
author johann <johann>
Mon, 9 Feb 2009 15:18:15 +0000 (15:18 +0000)
committer johann <johann>
Mon, 9 Feb 2009 15:18:15 +0000 (15:18 +0000)
b=18374
i=johann (patch from Tappro)
i=shadow

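Return only valid cookies from lov_llog_origin_add(), leaving an empty
(zeroed) slot for each stripe whose llog_add() call failed.
Previously a negative llog_add() return value was added to the running
cookie offset, so later cookies were written at the wrong position.
This fixes a memory corruption causing random oops.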

lustre/include/obd_support.h
lustre/lov/lov_log.c
lustre/tests/replay-single.sh

index b31ebf6..f7df162 100644 (file)
@@ -170,6 +170,7 @@ extern unsigned int obd_alloc_fail_rate;
 #define OBD_FAIL_MDS_CLOSE_NET_REP       0x13b
 #define OBD_FAIL_MDS_BLOCK_QUOTA_REQ     0x13c
 #define OBD_FAIL_MDS_DROP_QUOTA_REQ      0x13d
+#define OBD_FAIL_MDS_FAIL_LOV_LOG_ADD    0x140
 
 #define OBD_FAIL_OST                     0x200
 #define OBD_FAIL_OST_CONNECT_NET         0x201
index 59d2e9d..d4964b0 100644 (file)
@@ -74,7 +74,7 @@ static int lov_llog_origin_add(struct llog_ctxt *ctxt, struct llog_rec_hdr *rec,
 {
         struct obd_device *obd = ctxt->loc_obd;
         struct lov_obd *lov = &obd->u.lov;
-        int i, rc = 0;
+        int i, rc = 0, cookies = 0;
         ENTRY;
 
         LASSERTF(logcookies && numcookies >= lsm->lsm_stripe_count, 
@@ -105,12 +105,25 @@ static int lov_llog_origin_add(struct llog_ctxt *ctxt, struct llog_rec_hdr *rec,
                         break;
                 }
 
-                rc += llog_add(cctxt, rec, NULL, logcookies + rc,
-                                numcookies - rc);
+                /* inject error in llog_add() below */
+                if (OBD_FAIL_CHECK(OBD_FAIL_MDS_FAIL_LOV_LOG_ADD)) {
+                        llog_ctxt_put(cctxt);
+                        cctxt = NULL;
+                }
+                rc = llog_add(cctxt, rec, NULL, logcookies + cookies,
+                              numcookies - cookies);
                 llog_ctxt_put(cctxt);
+                if (rc < 0) {
+                        CERROR("Can't add llog (rc = %d) for stripe %i\n",
+                               rc, cookies);
+                        memset(logcookies + cookies, 0, sizeof(struct llog_cookie));
+                        rc = 1; /* skip this cookie */
+                }
+                /* Note that rc is always 1 if llog_add was successful */
+                cookies += rc;
         }
 
-        RETURN(rc);
+        RETURN(cookies);
 }
 
 static int lov_llog_origin_connect(struct llog_ctxt *ctxt,
index 3a4e2bd..4609817 100755 (executable)
@@ -1821,6 +1821,27 @@ test_80b() {
 }
 run_test 80b "write replay with changed data (checksum resend)"
 
+test_81a() {
+    mkdir -p $DIR/$tdir
+    createmany -o $DIR/$tdir/$tfile- 10 || return 1
+#define OBD_FAIL_MDS_FAIL_LOV_LOG_ADD       0x140
+    do_facet mds "lctl set_param fail_loc=0x80000140"
+    unlinkmany $DIR/$tdir/$tfile- 10 || return 2
+}
+run_test 81a "fail log_add during unlink recovery"
+
+test_81b() {
+    mkdir -p $DIR/$tdir
+    createmany -o $DIR/$tdir/$tfile- 10 || return 1
+    replay_barrier mds
+    unlinkmany $DIR/$tdir/$tfile- 10 || return 2
+#define OBD_FAIL_MDS_FAIL_LOV_LOG_ADD       0x140
+    do_facet mds "lctl set_param fail_loc=0x80000140"
+    fail mds
+}
+run_test 81b "fail log_add during unlink recovery"
+
+
 equals_msg `basename $0`: test complete, cleaning up
 check_and_cleanup_lustre
 [ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true