From: Lai Siyao Date: Thu, 30 Aug 2018 06:11:42 +0000 (+0800) Subject: LU-11418 osd-zfs: call stop_cb if transaction start fail X-Git-Tag: 2.12.0-RC1~163 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=d614ff561cb6bd9a9b35310c99dd991c55797d97 LU-11418 osd-zfs: call stop_cb if transaction start fail osd_trans_stop() should call osd_trans_stop_cb() if the transaction was not successfully started. Improve debug messages for distributed transactions. Add sanity test 416 for this. Get rid of ot_write_commit, which is useless. Signed-off-by: Lai Siyao Change-Id: I35da81ebd2c9e97c12ae52bd4faed60393cd67d6 Reviewed-on: https://review.whamcloud.com/33248 Reviewed-by: Andreas Dilger Tested-by: Jenkins Reviewed-by: Alex Zhuravlev Tested-by: Maloo Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index d18c609..e9dd33e 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -275,6 +275,8 @@ extern char obd_jobid_var[]; #define OBD_FAIL_OSD_NO_OI_ENTRY 0x198 #define OBD_FAIL_OSD_INDEX_CRASH 0x199 +#define OBD_FAIL_OSD_TXN_START 0x19a + #define OBD_FAIL_OFD_SET_OID 0x1e0 #define OBD_FAIL_OST 0x200 diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c index 90af137..86c6fbf 100644 --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -1894,6 +1894,9 @@ static int osd_trans_start(const struct lu_env *env, struct dt_device *d, oh->ot_credits = osd_transaction_size(dev); } + if (OBD_FAIL_CHECK(OBD_FAIL_OSD_TXN_START)) + GOTO(out, rc = -EIO); + /* * XXX temporary stuff. Some abstraction layer should * be used. 
diff --git a/lustre/osd-zfs/osd_handler.c b/lustre/osd-zfs/osd_handler.c index 3a156b6..4792b89 100644 --- a/lustre/osd-zfs/osd_handler.c +++ b/lustre/osd-zfs/osd_handler.c @@ -194,8 +194,10 @@ static int osd_trans_cb_add(struct thandle *th, struct dt_txn_commit_cb *dcb) static int osd_trans_start(const struct lu_env *env, struct dt_device *d, struct thandle *th) { - struct osd_thandle *oh; - int rc; + struct osd_device *osd = osd_dt_dev(d); + struct osd_thandle *oh; + int rc; + ENTRY; oh = container_of0(th, struct osd_thandle, ot_super); @@ -203,17 +205,19 @@ static int osd_trans_start(const struct lu_env *env, struct dt_device *d, LASSERT(oh->ot_tx); rc = dt_txn_hook_start(env, d, th); - if (rc != 0) + if (rc != 0) { + CERROR("%s: dt_txn_hook_start failed: rc = %d\n", + osd->od_svname, rc); RETURN(rc); + } - if (oh->ot_write_commit && OBD_FAIL_CHECK(OBD_FAIL_OST_MAPBLK_ENOSPC)) + if (OBD_FAIL_CHECK(OBD_FAIL_OSD_TXN_START)) /* Unlike ldiskfs, ZFS checks for available space and returns * -ENOSPC when assigning txg */ - RETURN(-ENOSPC); + RETURN(-EIO); rc = -dmu_tx_assign(oh->ot_tx, TXG_WAIT); if (unlikely(rc != 0)) { - struct osd_device *osd = osd_dt_dev(d); /* dmu will call commit callback with error code during abort */ if (!lu_device_is_md(&d->dd_lu_dev) && rc == -ENOSPC) CERROR("%s: failed to start transaction due to ENOSPC" @@ -292,6 +296,8 @@ static int osd_trans_stop(const struct lu_env *env, struct dt_device *dt, if (oh->ot_assigned == 0) { LASSERT(oh->ot_tx); + CDEBUG(D_OTHER, "%s: transaction is aborted\n", osd->od_svname); + osd_trans_stop_cb(oh, th->th_result); dmu_tx_abort(oh->ot_tx); osd_object_sa_dirty_rele(env, oh); osd_unlinked_list_emptify(env, osd, &unlinked, false); diff --git a/lustre/osd-zfs/osd_internal.h b/lustre/osd-zfs/osd_internal.h index d0656d3..3b53401 100644 --- a/lustre/osd-zfs/osd_internal.h +++ b/lustre/osd-zfs/osd_internal.h @@ -279,8 +279,7 @@ struct osd_thandle { struct list_head ot_sa_list; dmu_tx_t *ot_tx; struct lquota_trans 
ot_quota_trans; - __u32 ot_write_commit:1, - ot_assigned:1; + __u32 ot_assigned:1; }; #define OSD_OI_NAME_SIZE 24 diff --git a/lustre/osd-zfs/osd_io.c b/lustre/osd-zfs/osd_io.c index 1d9de36..a8bbd6b 100644 --- a/lustre/osd-zfs/osd_io.c +++ b/lustre/osd-zfs/osd_io.c @@ -692,8 +692,6 @@ static int osd_declare_write_commit(const struct lu_env *env, space += osd_roundup2blocksz(size, offset, blksz); } - oh->ot_write_commit = 1; /* used in osd_trans_start() for fail_loc */ - /* backend zfs filesystem might be configured to store multiple data * copies */ space *= osd->od_os->os_copies; @@ -833,6 +831,9 @@ static int osd_write_commit(const struct lu_env *env, struct dt_object *dt, osd->od_readcache_max_filesize) drop_cache = 1; + if (OBD_FAIL_CHECK(OBD_FAIL_OST_MAPBLK_ENOSPC)) + RETURN(-ENOSPC); + /* LU-8791: take oo_guard to avoid the deadlock that changing block * size and assigning arcbuf take place at the same time. * diff --git a/lustre/target/update_trans.c b/lustre/target/update_trans.c index 154da72..b8150fa 100644 --- a/lustre/target/update_trans.c +++ b/lustre/target/update_trans.c @@ -82,9 +82,11 @@ static void top_multiple_thandle_dump(struct top_multiple_thandle *tmt, list_for_each_entry(st, &tmt->tmt_sub_thandle_list, st_sub_list) { struct sub_thandle_cookie *stc; - CDEBUG(mask, "st %p obd %s committed %d stopped %d sub_th %p\n", + CDEBUG(mask, "st %p obd %s committed %d started %d stopped %d " + "result %d sub_th %p\n", st, st->st_dt->dd_lu_dev.ld_obd->obd_name, - st->st_committed, st->st_stopped, st->st_sub_th); + st->st_committed, st->st_started, st->st_stopped, + st->st_result, st->st_sub_th); list_for_each_entry(stc, &st->st_cookie_list, stc_list) { CDEBUG(mask, " cookie "DFID".%u\n", @@ -1018,6 +1020,8 @@ stop_master_trans: sub_trans_commit_cb_internal(tmt, master_st->st_sub_th, rc); if (rc < 0) { + CERROR("%s: stop trans failed: rc = %d\n", + master_dev->dd_lu_dev.ld_obd->obd_name, rc); th->th_result = rc; GOTO(stop_other_trans, rc); } else if (tur 
!= NULL && tur->tur_update_records != NULL) { @@ -1055,6 +1059,9 @@ stop_master_trans: rc = sub_updates_write(env, lur, st); if (rc < 0) { + CERROR("%s: write updates failed: rc = %d\n", + st->st_dt->dd_lu_dev.ld_obd->obd_name, + rc); th->th_result = rc; break; } @@ -1074,8 +1081,12 @@ stop_other_trans: st->st_sub_th->th_result = th->th_result; rc = dt_trans_stop(env, st->st_sub_th->th_dev, st->st_sub_th); - if (unlikely(rc < 0 && th->th_result == 0)) - th->th_result = rc; + if (rc < 0) { + CERROR("%s: stop trans failed: rc = %d\n", + st->st_dt->dd_lu_dev.ld_obd->obd_name, rc); + if (th->th_result == 0) + th->th_result = rc; + } } rc = top_trans_wait_result(top_th); diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index c55e8a7..6dd2c7c 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -19170,6 +19170,20 @@ test_415() { } run_test 415 "lock revoke is not missing" + +test_416() { + [ $(lustre_version_code mds1) -lt $(version_code 2.11.55) ] && + skip "Need server version at least 2.11.55" + + # define OBD_FAIL_OSD_TXN_START 0x19a + do_facet mds1 lctl set_param fail_loc=0x19a + + lfs mkdir -c $MDSCOUNT $DIR/$tdir + + true +} +run_test 416 "transaction start failure won't cause system hung" + prep_801() { [[ $(lustre_version_code mds1) -lt $(version_code 2.9.55) ]] || [[ $(lustre_version_code ost1) -lt $(version_code 2.9.55) ]] &&