From: Mikhail Pershin Date: Wed, 12 Oct 2022 09:22:14 +0000 (+0300) Subject: LU-16232 scripts: changelog/updatelog emergency cleanup X-Git-Tag: 2.15.7-RC1~24 X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=470d06cf5ca7a1eceb30e04d30b350465ffd336c;p=fs%2Flustre-release.git LU-16232 scripts: changelog/updatelog emergency cleanup Emergency cleanup scripts for situations when llogs are corrupted and can't be cleaned up in a normal way. In such cases the recommendation is to remove/truncate those llogs. Scripts make all needed steps and have debugging option to collect llogs for further analysis. Scripts possible actions are: - dry-run mode to check all actions and files affected - create archive with all llogs for analysis - remove llogs including all plain llogs Lustre-change: https://review.whamcloud.com/48838 Lustre-commit: b533700add91fe4220f50d057a470e0b6f4893c9 Test-Parameters: trivial Signed-off-by: Mikhail Pershin Change-Id: I3b197179bc54f451e3c5d7db36b6f1c56c076856 Reviewed-by: Andreas Dilger Reviewed-by: Jian Yu Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/58011 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Oleg Drokin --- diff --git a/lustre.spec.in b/lustre.spec.in index 1cc3222..9c5ed6c 100644 --- a/lustre.spec.in +++ b/lustre.spec.in @@ -747,6 +747,8 @@ echo '%{_libdir}/lustre/tests/lutf/*' >>lustre-tests.files %{_libexecdir}/lustre/lc_common %{_libexecdir}/lustre/haconfig %{_bindir}/lustre_req_history +%{_bindir}/remove_changelog +%{_bindir}/remove_updatelog %endif %{_bindir}/llobdstat diff --git a/lustre/scripts/Makefile.am b/lustre/scripts/Makefile.am index cb30006..cdba9b2 100644 --- a/lustre/scripts/Makefile.am +++ b/lustre/scripts/Makefile.am @@ -60,7 +60,7 @@ bin_SCRIPTS = lfs_migrate if SERVER sbin_SCRIPTS += $(genscripts) lc_mon lhbadm lc_servip -bin_SCRIPTS += lustre_req_history +bin_SCRIPTS += lustre_req_history remove_changelog remove_updatelog hadir = $(sysconfdir)/ha.d/resource.d ha_SCRIPTS = Lustre.ha_v2 @@ 
-94,7 +94,8 @@ EXTRA_DIST = license-status lustre_rmmod ldev lc_mon lhbadm \
 	      lustre lsvcgss lc_common haconfig Lustre.ha_v2 dkms.mkconf \
 	      zfsobj2fid ko2iblnd-probe ksocklnd-config statechange-lustre.sh \
 	      vdev_attach-lustre.sh vdev_remove-lustre.sh vdev_clear-lustre.sh \
-	      bash-completion/lustre bash-completion/lctl bash-completion/lfs
+	      bash-completion/lustre bash-completion/lctl bash-completion/lfs \
+	      remove_changelog remove_updatelog
 
 CLEANFILES = $(genscripts)
 
diff --git a/lustre/scripts/remove_changelog b/lustre/scripts/remove_changelog
new file mode 100755
index 0000000..0bda063
--- /dev/null
+++ b/lustre/scripts/remove_changelog
@@ -0,0 +1,139 @@
+#!/bin/bash
+
+# remove_changelog: emergency removal of changelog files from a server.
+#
+# This is an emergency tool to clean up changelogs on a server when changelog
+# records cannot be removed by regular means, e.g. due to llog corruption.
+#
+# The tool goes through changelog_catalog and removes all plain llogs listed
+# there, then truncates changelog_catalog itself and changelog_users.
+# The script takes a single parameter, the mount point of the server FS
+# mounted locally, and also accepts --dry-run to emulate the file removal.
+#
+# Steps to clean up problematic llogs:
+#
+# 1. mount MDT filesystem locally on server as ldiskfs mount
+# 2. run script first in dry-run mode to make sure it parses llogs as needed:
+#    # bash remove_changelog -n <localmount>
+# 3. save all llogs for analysis:
+#    # bash remove_changelog -n -z /tmp/llogs_saved <localmount>
+# 4. check that /tmp/llogs_saved.tar.gz exists and has all llogs inside:
+#    # ls -ali /tmp/llogs_saved.tar.gz
+#    # tar -tf /tmp/llogs_saved.tar.gz
+# 5. finally run script to delete all llogs:
+#    # bash remove_changelog <localmount>
+#
+# For better llogs compression xz can be used as well, pass it to the script
+# via GZIP env variable:
+#    # GZIP=xz bash remove_changelog -n -z /tmp/llogs_saved <localmount>
+# Archive name will end with .xz in that case instead of .gz
+
+ECHO=echo
+PROG=$(basename "$0")
+LLOG_READER=${LLOG_READER:-llog_reader}
+GZIP=${GZIP:-gzip}
+
+# Print the usage message to stderr and exit with status 1.
+usage() {
+	cat -- <<USAGE 1>&2
+usage: remove_changelog [--dry-run|-n] [--help|-h] [--quiet|-q]
+			[--zip|-z <name_prefix>] <localmount>
+	--help|-h		show this usage message
+	--dry-run|-n		only print the names of files to be removed
+	--quiet|-q		run quietly (don't print filenames or status)
+	--zip|-z <name_prefix>
+				save all llogs into compressed tar archive with given
+				name prefix using gzip by default. Other compression
+				tools can be used via GZIP env variable.
+
+The 'localmount' argument should be an ldiskfs mounted MDT device mountpoint.
+
+Examples:
+      remove_changelog /mnt/mdt0
+      remove_changelog --dry-run /mnt/mdt0
+      remove_changelog -z /tmp/llogs /mnt/mdt0
+USAGE
+	exit 1
+}
+
+OPT_DRYRUN=false
+OPT_ARCH=""
+OPT_MOUNT=""
+
+# Examine any long options and arguments. Unknown options are rejected
+# rather than skipped, so a mistyped --dry-run never becomes a real removal.
+while [[ $# -gt 0 ]]; do
+	arg="$1"
+	case "$arg" in
+	-h|--help) usage;;
+	-n|--dry-run) OPT_DRYRUN=true;;
+	-q|--quiet) ECHO=:;;
+	-z|--zip) [[ -n "$2" ]] || usage; OPT_ARCH="$2.tar"; shift;;
+	-*) echo "$PROG: unknown option '$arg'" 1>&2; usage;;
+	*) [[ -z "$OPT_MOUNT" && -e "$arg" ]] || usage; OPT_MOUNT="$arg";;
+	esac
+	shift
+done
+
+# Archive (when --zip was given) and remove changelog llogs: every plain
+# llog listed in changelog_catalog is deleted, then changelog_catalog and
+# changelog_users are truncated. Uses the OPT_* settings above; exits with
+# 1 on fatal errors.
+remove_changelog() {
+	local mntpoint=$OPT_MOUNT
+	local catalog=${mntpoint}/changelog_catalog
+	local users=${mntpoint}/changelog_users
+	local arch=$OPT_ARCH
+	local path
+
+	if [[ -z $(df -t ldiskfs "$mntpoint" 2>/dev/null) ]] ; then
+		echo "$PROG: '$mntpoint' is not ldiskfs mount."
+		exit 1
+	fi
+
+	if $OPT_DRYRUN; then
+		$ECHO "Dry run was requested, no changes will be applied"
+	fi
+
+	$ECHO "Scan changelog_catalog at '$mntpoint':"
+	if [[ ! -f $catalog ]] ; then
+		echo "$PROG: $catalog doesn't exist already."
+	else
+		if ! command -v "$LLOG_READER" >/dev/null 2>&1 ; then
+			echo "$PROG: $LLOG_READER is missing."
+			exit 1
+		fi
+		[[ -z $arch ]] || tar -cf "$arch" "$catalog" 2>/dev/null
+		if (( $(stat -c %s "$catalog") >= 8192 )) ; then
+			while read -r path ; do
+				[[ -z $arch ]] ||
+					tar -rf "$arch" "${mntpoint}/$path" 2>/dev/null
+				$ECHO "rm ${mntpoint}/$path"
+				$OPT_DRYRUN || rm -f "${mntpoint}/$path"
+			done < <("$LLOG_READER" "$catalog" |
+				 awk -F "path=" '/path=/ { print $2 }')
+		else
+			echo "$PROG: $catalog is too small."
+		fi
+		$ECHO "> $catalog"
+		$OPT_DRYRUN || > "$catalog"
+	fi
+
+	if [[ -f $users ]] ; then
+		[[ -z $arch ]] || tar -rf "$arch" "$users" 2>/dev/null
+		$ECHO "> $users"
+		$OPT_DRYRUN || > "$users"
+	else
+		echo "$PROG: $users doesn't exist."
+	fi
+	if [[ "$arch" ]] ; then
+		$GZIP -3 "$arch"
+		$ECHO "llog archive was created by $GZIP"
+	fi
+}
+
+if [[ -z "$OPT_MOUNT" ]] ; then
+	echo "Mount is not specified, exiting"
+	exit 1
+fi
+remove_changelog
diff --git a/lustre/scripts/remove_updatelog b/lustre/scripts/remove_updatelog
new file mode 100755
index 0000000..2fad732
--- /dev/null
+++ b/lustre/scripts/remove_updatelog
@@ -0,0 +1,160 @@
+#!/bin/bash
+
+# remove_updatelog: emergency removal of MDT updatelog files from a server.
+#
+# This is an emergency tool to clean up updatelogs on a server when llog
+# records cannot be removed by regular means, e.g. due to llog corruption.
+#
+# The tool does the following:
+# - goes through the update_log catlist to find per-MDT update llog catalogs
+# - processes each llog catalog to delete all plain llogs listed in it
+# - truncates the llog catalog after that
+# - leaves the update_log catlist itself in place
+#
+# The required parameter is the mount point of the server FS mounted locally;
+# optional parameters are described below in usage()
+#
+# Steps to clean up problematic llogs:
+#
+# 1. mount MDT filesystem locally on server as ldiskfs mount
+# 2. run script first in dry-run mode to make sure it parses llogs as needed:
+#    # bash remove_updatelog -n <localmount>
+# 3. save all llogs for analysis:
+#    # bash remove_updatelog -n -z /tmp/llogs_saved <localmount>
+# 4. 
check that /tmp/llogs_saved.tar.gz exists and has all llogs inside:
+#    # ls -ali /tmp/llogs_saved.tar.gz
+#    # tar -tf /tmp/llogs_saved.tar.gz
+# 5. finally run script to delete all llogs:
+#    # bash remove_updatelog <localmount>
+#
+# For better llogs compression xz can be used as well, pass it to the script
+# via GZIP env variable:
+#    # GZIP=xz bash remove_updatelog -n -z /tmp/llogs_saved <localmount>
+# Archive name will end with .xz in that case instead of .gz
+
+ECHO=echo
+PROG=$(basename "$0")
+LLOG_READER=${LLOG_READER:-llog_reader}
+GZIP=${GZIP:-gzip}
+
+# Print the usage message to stderr and exit with status 1.
+usage() {
+	cat -- <<USAGE 1>&2
+usage: remove_updatelog [--dry-run|-n] [--help|-h] [--quiet|-q]
+			[--zip|-z <name_prefix>] <localmount>
+	--help|-h		show this usage message
+	--dry-run|-n		only print the names of files to be removed
+	--quiet|-q		run quietly (don't print filenames or status)
+	--zip|-z <name_prefix>
+				save all llogs into compressed tar archive with given
+				name prefix using gzip by default. Other compression
+				tools can be used via GZIP env variable.
+
+The 'localmount' argument should be an ldiskfs mounted MDT device mountpoint.
+
+Examples:
+      remove_updatelog /mnt/mdt0
+      remove_updatelog --dry-run /mnt/mdt0
+      remove_updatelog -z /tmp/llogs /mnt/mdt0
+USAGE
+	exit 1
+}
+
+OPT_DRYRUN=false
+OPT_ARCH=""
+OPT_MOUNT=""
+OPT_MDTS=()
+
+# Examine any long options and arguments. Unknown options are rejected
+# rather than skipped, so a mistyped --dry-run never becomes a real removal.
+while [[ $# -gt 0 ]]; do
+	arg="$1"
+	case "$arg" in
+	-h|--help) usage;;
+	-n|--dry-run) OPT_DRYRUN=true;;
+	-q|--quiet) ECHO=:;;
+	-z|--zip) [[ -n "$2" ]] || usage; OPT_ARCH="$2.tar"; shift;;
+	-*) echo "$PROG: unknown option '$arg'" 1>&2; usage;;
+	*) [[ -z "$OPT_MOUNT" && -e "$arg" ]] || usage; OPT_MOUNT="$arg";;
+	esac
+	shift
+done
+
+# Archive (when --zip was given) and remove all MDT update llogs: every
+# plain llog listed in a per-MDT catalog is deleted and the catalog itself
+# is truncated. Uses the OPT_* settings above; exits with 1 on fatal errors.
+remove_updatelog() {
+	local mntpoint=$OPT_MOUNT
+	local catlist=${mntpoint}/update_log
+	local dir=${mntpoint}/update_log_dir
+	local arch=$OPT_ARCH
+	local i catalog plain path fid
+
+	if [[ -z $(df -t ldiskfs "$mntpoint" 2>/dev/null) ]] ; then
+		echo "$PROG: '$mntpoint' is not ldiskfs mount."
+		exit 1
+	fi
+
+	if $OPT_DRYRUN; then
+		$ECHO "Dry run was requested, no changes will be applied"
+	fi
+
+	$ECHO "Scan update_log at '$mntpoint':"
+	if [[ ! -f $catlist ]] ; then
+		echo "$PROG: $catlist doesn't exist already."
+	else
+		# turn every update_log record into a "[0x..:0x..:0x0]" name
+		read -r -d '' -a OPT_MDTS < <(
+			hexdump -v -e '2/8 " %16x" 2/8 "\n"' "$catlist" |
+			awk '{print "[0x"$2":0x"$1":0x0]"}')
+
+		if ! command -v "$LLOG_READER" >/dev/null 2>&1 ; then
+			echo "$PROG: $LLOG_READER is missing."
+			exit 1
+		fi
+		[[ -z $arch ]] || tar -cf "$arch" "$catlist" 2>/dev/null
+		for (( i = 0; i < ${#OPT_MDTS[@]}; i++ )); do
+			catalog=$dir/${OPT_MDTS[$i]}
+			$ECHO "Processing MDT$i llogs ..."
+			if [[ ! -f $catalog ]] ; then
+				echo "$PROG: $catalog doesn't exist already."
+				continue
+			fi
+			[[ -z $arch ]] || tar -rf "$arch" "$catalog" 2>/dev/null
+			if (( $(stat -c %s "$catalog") >= 8192 )) ; then
+				while read -r plain ; do
+					# compatibility check: old llog_reader
+					# reports O/<seq>/d<N>/<oid> (decimal), but
+					# the file is update_log_dir/<FID>
+					if [[ $plain == O/* ]] ; then
+						fid=${plain#O/}
+						fid=$(printf "[0x%x:0x%x:0x0]" \
+							"${fid%%/*}" "${fid##*/}")
+						path="$dir/$fid"
+					else
+						path=$mntpoint/$plain
+					fi
+					[[ -z $arch ]] ||
+						tar -rf "$arch" "$path" 2>/dev/null
+					$ECHO "rm -f $path"
+					$OPT_DRYRUN || rm -f "$path"
+				done < <("$LLOG_READER" "$catalog" |
+					 awk -F "path=" '/path=/ { print $2 }')
+			else
+				echo "$PROG: $catalog is too small."
+			fi
+			$ECHO "> $catalog"
+			$OPT_DRYRUN || > "$catalog"
+		done
+	fi
+	if [[ "$arch" ]] ; then
+		$GZIP -3 "$arch"
+		$ECHO "llog archive was created by $GZIP"
+	fi
+}
+
+if [[ -z "$OPT_MOUNT" ]] ; then
+	echo "Mount is not specified, exiting"
+	exit 1
+fi
+remove_updatelog