Whamcloud - gitweb
LU-11418 osd-zfs: call stop_cb if transaction start fail 48/33248/4
authorLai Siyao <lai.siyao@intel.com>
Thu, 30 Aug 2018 06:11:42 +0000 (14:11 +0800)
committerOleg Drokin <green@whamcloud.com>
Tue, 23 Oct 2018 05:18:44 +0000 (05:18 +0000)
osd_trans_stop() should call osd_trans_stop_cb() if the transaction
was not successfully started.

Improve debug messages for distributed transactions.

Add sanity test 416 to cover a transaction start failure.

Get rid of ot_write_commit, which only gated a fail_loc check in
osd_trans_start(); that check now lives in osd_write_commit().

Signed-off-by: Lai Siyao <lai.siyao@whamcloud.com>
Change-Id: I35da81ebd2c9e97c12ae52bd4faed60393cd67d6
Reviewed-on: https://review.whamcloud.com/33248
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: Jenkins
Reviewed-by: Alex Zhuravlev <bzzz@whamcloud.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
lustre/include/obd_support.h
lustre/osd-ldiskfs/osd_handler.c
lustre/osd-zfs/osd_handler.c
lustre/osd-zfs/osd_internal.h
lustre/osd-zfs/osd_io.c
lustre/target/update_trans.c
lustre/tests/sanity.sh

index d18c609..e9dd33e 100644 (file)
@@ -275,6 +275,8 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_OSD_NO_OI_ENTRY                       0x198
 #define OBD_FAIL_OSD_INDEX_CRASH                       0x199
 
+#define OBD_FAIL_OSD_TXN_START                         0x19a
+
 #define OBD_FAIL_OFD_SET_OID                           0x1e0
 
 #define OBD_FAIL_OST                     0x200
index 90af137..86c6fbf 100644 (file)
@@ -1894,6 +1894,9 @@ static int osd_trans_start(const struct lu_env *env, struct dt_device *d,
                        oh->ot_credits = osd_transaction_size(dev);
        }
 
+       if (OBD_FAIL_CHECK(OBD_FAIL_OSD_TXN_START))
+               GOTO(out, rc = -EIO);
+
        /*
         * XXX temporary stuff. Some abstraction layer should
         * be used.
index 3a156b6..4792b89 100644 (file)
@@ -194,8 +194,10 @@ static int osd_trans_cb_add(struct thandle *th, struct dt_txn_commit_cb *dcb)
 static int osd_trans_start(const struct lu_env *env, struct dt_device *d,
                           struct thandle *th)
 {
-       struct osd_thandle      *oh;
-       int                     rc;
+       struct osd_device *osd = osd_dt_dev(d);
+       struct osd_thandle *oh;
+       int rc;
+
        ENTRY;
 
        oh = container_of0(th, struct osd_thandle, ot_super);
@@ -203,17 +205,19 @@ static int osd_trans_start(const struct lu_env *env, struct dt_device *d,
        LASSERT(oh->ot_tx);
 
        rc = dt_txn_hook_start(env, d, th);
-       if (rc != 0)
+       if (rc != 0) {
+               CERROR("%s: dt_txn_hook_start failed: rc = %d\n",
+                       osd->od_svname, rc);
                RETURN(rc);
+       }
 
-       if (oh->ot_write_commit && OBD_FAIL_CHECK(OBD_FAIL_OST_MAPBLK_ENOSPC))
+       if (OBD_FAIL_CHECK(OBD_FAIL_OSD_TXN_START))
                /* Unlike ldiskfs, ZFS checks for available space and returns
                 * -ENOSPC when assigning txg */
-               RETURN(-ENOSPC);
+               RETURN(-EIO);
 
        rc = -dmu_tx_assign(oh->ot_tx, TXG_WAIT);
        if (unlikely(rc != 0)) {
-               struct osd_device *osd = osd_dt_dev(d);
                /* dmu will call commit callback with error code during abort */
                if (!lu_device_is_md(&d->dd_lu_dev) && rc == -ENOSPC)
                        CERROR("%s: failed to start transaction due to ENOSPC"
@@ -292,6 +296,8 @@ static int osd_trans_stop(const struct lu_env *env, struct dt_device *dt,
 
        if (oh->ot_assigned == 0) {
                LASSERT(oh->ot_tx);
+               CDEBUG(D_OTHER, "%s: transaction is aborted\n", osd->od_svname);
+               osd_trans_stop_cb(oh, th->th_result);
                dmu_tx_abort(oh->ot_tx);
                osd_object_sa_dirty_rele(env, oh);
                osd_unlinked_list_emptify(env, osd, &unlinked, false);
index d0656d3..3b53401 100644 (file)
@@ -279,8 +279,7 @@ struct osd_thandle {
        struct list_head         ot_sa_list;
        dmu_tx_t                *ot_tx;
        struct lquota_trans      ot_quota_trans;
-       __u32                    ot_write_commit:1,
-                                ot_assigned:1;
+       __u32                    ot_assigned:1;
 };
 
 #define OSD_OI_NAME_SIZE        24
index 1d9de36..a8bbd6b 100644 (file)
@@ -692,8 +692,6 @@ static int osd_declare_write_commit(const struct lu_env *env,
                space += osd_roundup2blocksz(size, offset, blksz);
        }
 
-       oh->ot_write_commit = 1; /* used in osd_trans_start() for fail_loc */
-
        /* backend zfs filesystem might be configured to store multiple data
         * copies */
        space  *= osd->od_os->os_copies;
@@ -833,6 +831,9 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt,
            osd->od_readcache_max_filesize)
                drop_cache = 1;
 
+       if (OBD_FAIL_CHECK(OBD_FAIL_OST_MAPBLK_ENOSPC))
+               RETURN(-ENOSPC);
+
        /* LU-8791: take oo_guard to avoid the deadlock that changing block
         * size and assigning arcbuf take place at the same time.
         *
index 154da72..b8150fa 100644 (file)
@@ -82,9 +82,11 @@ static void top_multiple_thandle_dump(struct top_multiple_thandle *tmt,
        list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) {
                struct sub_thandle_cookie *stc;
 
-               CDEBUG(mask, "st %p obd %s committed %d stopped %d sub_th %p\n",
+               CDEBUG(mask, "st %p obd %s committed %d started %d stopped %d "
+                      "result %d sub_th %p\n",
                       st, st->st_dt->dd_lu_dev.ld_obd->obd_name,
-                      st->st_committed, st->st_stopped, st->st_sub_th);
+                      st->st_committed, st->st_started, st->st_stopped,
+                      st->st_result, st->st_sub_th);
 
                list_for_each_entry(stc, &st->st_cookie_list, stc_list) {
                        CDEBUG(mask, " cookie "DFID".%u\n",
@@ -1018,6 +1020,8 @@ stop_master_trans:
                        sub_trans_commit_cb_internal(tmt,
                                                master_st->st_sub_th, rc);
                if (rc < 0) {
+                       CERROR("%s: stop trans failed: rc = %d\n",
+                              master_dev->dd_lu_dev.ld_obd->obd_name, rc);
                        th->th_result = rc;
                        GOTO(stop_other_trans, rc);
                } else if (tur != NULL && tur->tur_update_records != NULL) {
@@ -1055,6 +1059,9 @@ stop_master_trans:
 
                        rc = sub_updates_write(env, lur, st);
                        if (rc < 0) {
+                               CERROR("%s: write updates failed: rc = %d\n",
+                                      st->st_dt->dd_lu_dev.ld_obd->obd_name,
+                                      rc);
                                th->th_result = rc;
                                break;
                        }
@@ -1074,8 +1081,12 @@ stop_other_trans:
                st->st_sub_th->th_result = th->th_result;
                rc = dt_trans_stop(env, st->st_sub_th->th_dev,
                                   st->st_sub_th);
-               if (unlikely(rc < 0 && th->th_result == 0))
-                       th->th_result = rc;
+               if (rc < 0) {
+                       CERROR("%s: stop trans failed: rc = %d\n",
+                              st->st_dt->dd_lu_dev.ld_obd->obd_name, rc);
+                       if (th->th_result == 0)
+                               th->th_result = rc;
+               }
        }
 
        rc = top_trans_wait_result(top_th);
index c55e8a7..6dd2c7c 100755 (executable)
@@ -19170,6 +19170,20 @@ test_415() {
 }
 run_test 415 "lock revoke is not missing"
 
+
+test_416() {
+       [ $(lustre_version_code mds1) -lt $(version_code 2.11.55) ] &&
+               skip "Need server version at least 2.11.55"
+
+       # define OBD_FAIL_OSD_TXN_START    0x19a
+       do_facet mds1 lctl set_param fail_loc=0x19a
+
+       lfs mkdir -c $MDSCOUNT $DIR/$tdir
+
+       true
+}
+run_test 416 "transaction start failure won't cause system hung"
+
 prep_801() {
        [[ $(lustre_version_code mds1) -lt $(version_code 2.9.55) ]] ||
        [[ $(lustre_version_code ost1) -lt $(version_code 2.9.55) ]] &&