From 8bd04b4e57663e02e9384727eca4df84d99edb3c Mon Sep 17 00:00:00 2001 From: Alexey Zhuravlev Date: Mon, 9 Sep 2019 17:00:05 +0300 Subject: [PATCH] LU-12722 target: disable recovery for local clients When a client is running on a server node, the local services can't rely on that client in the context of recovery - such a client dies with the node and can't replay its requests and state, so the restarting server has to wait until recovery expires, which doesn't make any sense. So the servers should recognize local clients and exclude them from recovery (i.e. don't make them part of last_rcvd). For the purpose of local testing, a special mount option "localrecov" has been added to {MDS|OST}_MOUNT_OPTS in tests/cfg/local.sh to preserve local testing when everything is running within a single node. Signed-off-by: Alexey Zhuravlev Change-Id: I4cb906c44c1192933f7d77dc782160e426e9efde Reviewed-on: https://review.whamcloud.com/36025 Reviewed-by: Andreas Dilger Tested-by: jenkins Tested-by: Maloo Reviewed-by: Li Xi Reviewed-by: Oleg Drokin --- lustre/include/lu_target.h | 4 +- lustre/include/lustre_disk.h | 1 + lustre/include/lustre_export.h | 4 +- lustre/mdt/mdt_handler.c | 7 +++- lustre/mdt/mdt_lproc.c | 29 ++++++++++++++ lustre/obdclass/lprocfs_status_server.c | 1 + lustre/obdclass/obd_mount.c | 3 ++ lustre/ofd/ofd_dev.c | 23 +++++++---- lustre/target/tgt_lastrcvd.c | 29 +++++++++++++- lustre/tests/cfg/local.sh | 6 ++- lustre/tests/conf-sanity.sh | 2 +- lustre/tests/recovery-small.sh | 67 +++++++++++++++++++++++++++++++-- lustre/tests/sanity-lfsck.sh | 4 +- lustre/tests/test-framework.sh | 20 +++++++++- 14 files changed, 177 insertions(+), 23 deletions(-) diff --git a/lustre/include/lu_target.h b/lustre/include/lu_target.h index 10377ba..5aa9aa2 100644 --- a/lustre/include/lu_target.h +++ b/lustre/include/lu_target.h @@ -171,7 +171,9 @@ struct lu_target { unsigned int lut_syncjournal:1, lut_sync_lock_cancel:2, /* e.g. 
OST node */ - lut_no_reconstruct:1; + lut_no_reconstruct:1, + /* enforce recovery for local clients */ + lut_local_recovery:1; /** last_rcvd file */ struct dt_object *lut_last_rcvd; /* transaction callbacks */ diff --git a/lustre/include/lustre_disk.h b/lustre/include/lustre_disk.h index 385cc1e9..18d5241 100644 --- a/lustre/include/lustre_disk.h +++ b/lustre/include/lustre_disk.h @@ -107,6 +107,7 @@ struct lustre_mount_data { #define LMD_FLG_HSM 0x4000 /* Start coordinator */ #define LMD_FLG_DEV_RDONLY 0x8000 /* discard modification quitely */ #define LMD_FLG_NO_PRECREATE 0x10000 /* do not allow OST object creation */ +#define LMD_FLG_LOCAL_RECOV 0x20000 /* force recovery for local clients */ #define lmd_is_client(x) ((x)->lmd_flags & LMD_FLG_CLIENT) diff --git a/lustre/include/lustre_export.h b/lustre/include/lustre_export.h index f819770..8f45dd4 100644 --- a/lustre/include/lustre_export.h +++ b/lustre/include/lustre_export.h @@ -277,7 +277,9 @@ struct obd_export { exp_need_mne_swab:1, /* The export already got final replay ping * request. 
*/ - exp_replay_done:1; + exp_replay_done:1, + /* local client with recovery disabled */ + exp_no_recovery:1; /* also protected by exp_lock */ enum lustre_sec_part exp_sp_peer; struct sptlrpc_flavor exp_flvr; /* current */ diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c index 17a5e44..5c867d7 100644 --- a/lustre/mdt/mdt_handler.c +++ b/lustre/mdt/mdt_handler.c @@ -5413,12 +5413,12 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m, RETURN(-EFAULT); } else { lsi = s2lsi(lmi->lmi_sb); + LASSERT(lsi->lsi_lmd); /* CMD is supported only in IAM mode */ LASSERT(num); node_id = simple_strtol(num, NULL, 10); obd->u.obt.obt_magic = OBT_MAGIC; - if (lsi->lsi_lmd != NULL && - lsi->lsi_lmd->lmd_flags & LMD_FLG_SKIP_LFSCK) + if (lsi->lsi_lmd->lmd_flags & LMD_FLG_SKIP_LFSCK) m->mdt_skip_lfsck = 1; } @@ -5595,6 +5595,9 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m, if (ldlm_timeout == LDLM_TIMEOUT_DEFAULT) ldlm_timeout = MDS_LDLM_TIMEOUT_DEFAULT; + if ((lsi->lsi_lmd->lmd_flags & LMD_FLG_LOCAL_RECOV)) + m->mdt_lut.lut_local_recovery = 1; + RETURN(0); err_procfs: mdt_tunables_fini(m); diff --git a/lustre/mdt/mdt_lproc.c b/lustre/mdt/mdt_lproc.c index a3adf18..17fe08b 100644 --- a/lustre/mdt/mdt_lproc.c +++ b/lustre/mdt/mdt_lproc.c @@ -531,6 +531,34 @@ static ssize_t commit_on_sharing_store(struct kobject *kobj, } LUSTRE_RW_ATTR(commit_on_sharing); +static ssize_t local_recovery_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + + return scnprintf(buf, PAGE_SIZE, "%u\n", + obd->u.obt.obt_lut->lut_local_recovery); +} + +static ssize_t local_recovery_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, size_t count) +{ + struct obd_device *obd = container_of(kobj, struct obd_device, + obd_kset.kobj); + bool val; + int rc; + + rc = kstrtobool(buffer, &val); + if (rc) + return rc; + + 
obd->u.obt.obt_lut->lut_local_recovery = !!val; + return count; +} +LUSTRE_RW_ATTR(local_recovery); + static int mdt_root_squash_seq_show(struct seq_file *m, void *data) { struct obd_device *obd = m->private; @@ -1085,6 +1113,7 @@ static struct attribute *mdt_attrs[] = { &lustre_attr_enable_dir_migration.attr, &lustre_attr_enable_remote_rename.attr, &lustre_attr_commit_on_sharing.attr, + &lustre_attr_local_recovery.attr, &lustre_attr_async_commit_count.attr, &lustre_attr_sync_count.attr, &lustre_attr_dom_lock.attr, diff --git a/lustre/obdclass/lprocfs_status_server.c b/lustre/obdclass/lprocfs_status_server.c index d4e59e0..03edc1d 100644 --- a/lustre/obdclass/lprocfs_status_server.c +++ b/lustre/obdclass/lprocfs_status_server.c @@ -167,6 +167,7 @@ static int obd_export_flags2str(struct obd_export *exp, struct seq_file *m) flag2str(exp, in_recovery); flag2str(exp, disconnected); flag2str(exp, connecting); + flag2str(exp, no_recovery); return 0; } diff --git a/lustre/obdclass/obd_mount.c b/lustre/obdclass/obd_mount.c index 0d27e95..62e1de8 100644 --- a/lustre/obdclass/obd_mount.c +++ b/lustre/obdclass/obd_mount.c @@ -1454,6 +1454,9 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd) strlcat(lmd->lmd_params, " ", LMD_PARAMS_MAXLEN); s3 = s1 + 6 + length; clear++; + } else if (strncmp(s1, "localrecov", 10) == 0) { + lmd->lmd_flags |= LMD_FLG_LOCAL_RECOV; + clear++; } else if (strncmp(s1, "osd=", 4) == 0) { rc = lmd_parse_string(&lmd->lmd_osd_type, s1 + 4); if (rc) diff --git a/lustre/ofd/ofd_dev.c b/lustre/ofd/ofd_dev.c index a48efb0..c8a5140 100644 --- a/lustre/ofd/ofd_dev.c +++ b/lustre/ofd/ofd_dev.c @@ -161,7 +161,8 @@ out: * \retval negative value on error */ static int ofd_stack_init(const struct lu_env *env, - struct ofd_device *m, struct lustre_cfg *cfg) + struct ofd_device *m, struct lustre_cfg *cfg, + u32 *lmd_flags) { const char *dev = lustre_cfg_string(cfg, 0); struct lu_device *d; @@ -180,11 +181,13 @@ static int ofd_stack_init(const 
struct lu_env *env, } lmd = s2lsi(lmi->lmi_sb)->lsi_lmd; - if (lmd && lmd->lmd_flags & LMD_FLG_SKIP_LFSCK) - m->ofd_skip_lfsck = 1; - - if (lmd && lmd->lmd_flags & LMD_FLG_NO_PRECREATE) - m->ofd_no_precreate = 1; + if (lmd) { + if (lmd->lmd_flags & LMD_FLG_SKIP_LFSCK) + m->ofd_skip_lfsck = 1; + if (lmd->lmd_flags & LMD_FLG_NO_PRECREATE) + m->ofd_no_precreate = 1; + *lmd_flags = lmd->lmd_flags; + } /* find bottom osd */ OBD_ALLOC(osdname, MTI_NAME_MAXLEN); @@ -2841,6 +2844,7 @@ static int ofd_init0(const struct lu_env *env, struct ofd_device *m, struct lu_fid fid; struct nm_config_file *nodemap_config; struct obd_device_target *obt; + u32 lmd_flags = 0; int rc; ENTRY; @@ -2896,7 +2900,7 @@ static int ofd_init0(const struct lu_env *env, struct ofd_device *m, if (info == NULL) RETURN(-EFAULT); - rc = ofd_stack_init(env, m, cfg); + rc = ofd_stack_init(env, m, cfg, &lmd_flags); if (rc) { CERROR("%s: can't init device stack, rc %d\n", obd->obd_name, rc); @@ -2930,6 +2934,11 @@ static int ofd_init0(const struct lu_env *env, struct ofd_device *m, if (rc) GOTO(err_free_ns, rc); + if (lmd_flags & LMD_FLG_SKIP_LFSCK) + m->ofd_skip_lfsck = 1; + if (lmd_flags & LMD_FLG_LOCAL_RECOV) + m->ofd_lut.lut_local_recovery = 1; + rc = ofd_tunables_init(m); if (rc) GOTO(err_fini_lut, rc); diff --git a/lustre/target/tgt_lastrcvd.c b/lustre/target/tgt_lastrcvd.c index b3a7e71..97f76d3 100644 --- a/lustre/target/tgt_lastrcvd.c +++ b/lustre/target/tgt_lastrcvd.c @@ -944,6 +944,25 @@ static int tgt_last_commit_cb_add(struct thandle *th, struct lu_target *tgt, return rc ? 
rc : exp->exp_need_sync; } +static int tgt_is_local_client(const struct lu_env *env, + struct obd_export *exp) +{ + struct lu_target *tgt = class_exp2tgt(exp); + struct tgt_session_info *tsi = tgt_ses_info(env); + struct ptlrpc_request *req = tgt_ses_req(tsi); + + if (tgt->lut_local_recovery) + return 0; + if (!req) + return 0; + if (!LNetIsPeerLocal(req->rq_peer.nid)) + return 0; + if (exp_connect_flags(exp) & OBD_CONNECT_MDS) + return 0; + + return 1; +} + /** * Add new client to the last_rcvd upon new connection. * @@ -965,6 +984,13 @@ int tgt_client_new(const struct lu_env *env, struct obd_export *exp) if (exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) RETURN(0); + if (tgt_is_local_client(env, exp)) { + LCONSOLE_WARN("%s: local client %s w/o recovery\n", + exp->exp_obd->obd_name, ted->ted_lcd->lcd_uuid); + exp->exp_no_recovery = 1; + RETURN(0); + } + /* the bitmap operations can handle cl_idx > sizeof(long) * 8, so * there's no need for extra complication here */ @@ -1092,7 +1118,8 @@ int tgt_client_del(const struct lu_env *env, struct obd_export *exp) /* XXX if lcd_uuid were a real obd_uuid, I could use obd_uuid_equals */ if (!strcmp((char *)ted->ted_lcd->lcd_uuid, (char *)tgt->lut_obd->obd_uuid.uuid) || - exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT) + exp_connect_flags(exp) & OBD_CONNECT_LIGHTWEIGHT || + exp->exp_no_recovery) RETURN(0); /* Slot may be not yet assigned, use case is race between Client diff --git a/lustre/tests/cfg/local.sh b/lustre/tests/cfg/local.sh index df6c5c3..fe73fd5 100644 --- a/lustre/tests/cfg/local.sh +++ b/lustre/tests/cfg/local.sh @@ -34,7 +34,8 @@ MDSSIZE=${MDSSIZE:-250000} # MDSOPT=${MDSOPT:-} MDS_FS_MKFS_OPTS=${MDS_FS_MKFS_OPTS:-} -MDS_MOUNT_OPTS=${MDS_MOUNT_OPTS:-} +# use localrecov to enable recovery on local clients, LU-12722 +MDS_MOUNT_OPTS=${MDS_MOUNT_OPTS:--olocalrecov} # _MOUNT_FS_OPTS is the mount options specified when formatting # the underlying device by argument "--mountfsoptions" 
MDS_MOUNT_FS_OPTS=${MDS_MOUNT_FS_OPTS:-} @@ -50,7 +51,8 @@ OSTDEVBASE=${OSTDEVBASE:-$TMP/${FSNAME}-ost} OSTSIZE=${OSTSIZE:-400000} OSTOPT=${OSTOPT:-} OST_FS_MKFS_OPTS=${OST_FS_MKFS_OPTS:-} -OST_MOUNT_OPTS=${OST_MOUNT_OPTS:-} +# use localrecov to enable recovery on local clients, LU-12722 +OST_MOUNT_OPTS=${OST_MOUNT_OPTS:--olocalrecov} OST_MOUNT_FS_OPTS=${OST_MOUNT_FS_OPTS:-} OST_INDEX_LIST=${OST_INDEX_LIST:-} # Can specify individual ost devs with diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index c9355e6..690e1ea 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -6123,7 +6123,7 @@ test_83() { error "format ost1 error" if ! test -b $dev; then - mnt_opts=$(csa_add "$OST_MOUNT_OPTS" -o loop) + mnt_opts=$(csa_add "$OST_MOUNT_FS_OPTS" -o loop) fi echo "mnt_opts $mnt_opts" do_facet ost1 mount -t "$ost1_FSTYPE" $dev \ diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 7afd0c7..aff307a 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -2262,13 +2262,15 @@ test_110k() { #define OBD_FAIL_FLD_QUERY_REQ 0x1103 do_facet mds2 lctl set_param fail_loc=0x1103 - start mds2 $(mdsdevname 2) -o abort_recovery || + local OPTS="$MDS_MOUNT_OPTS -o abort_recovery" + start mds2 $(mdsdevname 2) $OPTS || error "start MDS with abort_recovery should succeed" do_facet mds2 lctl set_param fail_loc=0 # cleanup stop mds2 || error "cleanup: stop mds2 failed" - start mds2 $(mdsdevname 2) || error "cleanup: start mds2 failed" + start mds2 $(mdsdevname 2) $MDS_MOUNT_OPTS || + error "cleanup: start mds2 failed" zconf_mount $(hostname) $MOUNT || error "cleanup: mount failed" client_up || error "post-failover df failed" } @@ -2283,10 +2285,10 @@ test_111 () #define OBD_FAIL_MDS_CHANGELOG_INIT 0x151 do_facet $SINGLEMDS lctl set_param fail_loc=0x151 stop $SINGLEMDS || error "stop MDS failed" - start $SINGLEMDS $(mdsdevname ${SINGLEMDS//mds/}) && + start $SINGLEMDS $(mdsdevname 
${SINGLEMDS//mds/}) $MDS_MOUNT_OPTS && error "start MDS should fail" do_facet $SINGLEMDS lctl set_param fail_loc=0 - start $SINGLEMDS $(mdsdevname ${SINGLEMDS//mds/}) || + start $SINGLEMDS $(mdsdevname ${SINGLEMDS//mds/}) $MDS_MOUNT_OPTS || error "start MDS failed" } run_test 111 "mdd setup fail should not cause umount oops" @@ -2913,6 +2915,63 @@ test_139() { } run_test 139 "corrupted catid won't cause crash" +test_140a() { + [ $MDS1_VERSION -lt $(version_code 2.12.58) ] && + skip "Need MDS version at least 2.12.58" + + slr=$(do_facet mds1 \ + $LCTL get_param -n mdt.$FSNAME-MDT0000.local_recovery) + stack_trap "do_facet mds1 $LCTL set_param \ + mdt.*.local_recovery=$slr" EXIT + + # disable recovery for local clients + # so local clients should be marked with no_recovery flag + do_facet mds1 $LCTL set_param mdt.*.local_recovery=0 + mount_mds_client + + local cnt + cnt=$(do_facet mds1 $LCTL get_param "mdt.*.exports.*.export" | + grep export_flags.*no_recovery | wc -l) + echo "$cnt clients with recovery disabled" + umount_mds_client + [ $cnt -eq 0 ] && error "no clients with recovery disabled" + + # enable recovery for local clients + # so no local clients should be marked with no_recovery flag + do_facet mds1 $LCTL set_param mdt.*.local_recovery=1 + mount_mds_client + + cnt=$(do_facet mds1 $LCTL get_param "mdt.*.exports.*.export" | + grep export_flags.*no_recovery | wc -l) + echo "$cnt clients with recovery disabled" + umount_mds_client + [ $cnt -eq 0 ] || error "$cnt clients with recovery disabled" +} +run_test 140a "local mount is flagged properly" + +test_140b() { + [ $MDS1_VERSION -lt $(version_code 2.12.58) ] && + skip "Need MDS version at least 2.12.58" + + slr=$(do_facet mds1 \ + $LCTL get_param -n mdt.$FSNAME-MDT0000.local_recovery) + stack_trap "do_facet mds1 $LCTL set_param \ + mdt.*.local_recovery=$slr" EXIT + + # disable recovery for local clients + do_facet mds1 $LCTL set_param mdt.*.local_recovery=0 + + mount_mds_client + replay_barrier mds1 + 
umount_mds_client + local before=$SECONDS + fail mds1 + local after=$SECONDS + (( $after-$before < $TIMEOUT*2 )) || + error "recovery took too long" $((after-before)) $TIMEOUT +} +run_test 140b "local mount is excluded from recovery" + complete $SECONDS check_and_cleanup_lustre exit_status diff --git a/lustre/tests/sanity-lfsck.sh b/lustre/tests/sanity-lfsck.sh index 1987a48..e6048d5 100644 --- a/lustre/tests/sanity-lfsck.sh +++ b/lustre/tests/sanity-lfsck.sh @@ -79,8 +79,8 @@ SHOW_LAYOUT="do_facet $SINGLEMDS \ $LCTL get_param -n mdd.${MDT_DEV}.lfsck_layout" SHOW_LAYOUT_ON_OST="do_facet ost1 \ $LCTL get_param -n obdfilter.${OST_DEV}.lfsck_layout" -MOUNT_OPTS_SCRUB="-o user_xattr" -MOUNT_OPTS_NOSCRUB="-o user_xattr,noscrub" +MOUNT_OPTS_SCRUB="$MDS_MOUNT_OPTS -o user_xattr" +MOUNT_OPTS_NOSCRUB="$MDS_MOUNT_OPTS -o user_xattr,noscrub" MOUNT_OPTS_SKIP_LFSCK="-o user_xattr,skip_lfsck" lfsck_prep() { diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 32dbc87..23b01eb 100755 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -2331,6 +2331,22 @@ zconf_umount() { fi } +# Mount the file system on the MDS +mount_mds_client() { + local mds_HOST=${SINGLEMDS}_HOST + echo $mds_HOST + do_facet $SINGLEMDS "mkdir -p $MOUNT2" + zconf_mount $mds1_HOST $MOUNT2 $MOUNT_OPTS || + error "unable to mount $MOUNT2 on MDS" +} + +# Unmount the file system on the MDS +umount_mds_client() { + local mds_HOST=${SINGLEMDS}_HOST + zconf_umount $mds1_HOST $MOUNT2 + do_facet $SINGLEMDS "rm -rf $MOUNT2" +} + # nodes is comma list sanity_mount_check_nodes () { local nodes=$1 @@ -8559,7 +8575,7 @@ mds_backup_restore() { local rcmd="do_facet $facet" local metaea=${TMP}/backup_restore.ea local metadata=${TMP}/backup_restore.tgz - local opts=${MDS_MOUNT_OPTS} + local opts=${MDS_MOUNT_FS_OPTS} local svc=${facet}_svc if ! 
${rcmd} test -b ${devname}; then @@ -8616,7 +8632,7 @@ mds_remove_ois() { local devname=$(mdsdevname $(facet_number $facet)) local mntpt=$(facet_mntpt brpt) local rcmd="do_facet $facet" - local opts=${MDS_MOUNT_OPTS} + local opts=${MDS_MOUNT_FS_OPTS} if ! ${rcmd} test -b ${devname}; then opts=$(csa_add "$opts" -o loop) -- 1.8.3.1