From 3c6071fde98bb6b5de7a9ce5cbec06e911bfe834 Mon Sep 17 00:00:00 2001 From: Sebastien Buisson Date: Wed, 15 Nov 2023 11:22:13 +0100 Subject: [PATCH] LU-17175 gss: start lsvcgssd from l_getauth If l_getauth detects it cannot connect to the socket supposed to be opened by lsvcgssd, it tries to launch the daemon, with predefined default values. Lustre-change: https://review.whamcloud.com/53142 Lustre-commit: 414467762f8a034c72903bab8ebfce6e1feb8e79 Test-Parameters: trivial Test-Parameters: kerberos=true testlist=sanity-krb5 Test-Parameters: testgroup=review-dne-selinux-ssk-part-2 Signed-off-by: Sebastien Buisson Change-Id: I3961ce0f548fb6ea23458edcb01a03fb8b3a617f Reviewed-by: Andreas Dilger Reviewed-on: https://review.whamcloud.com/c/ex/lustre-release/+/54369 Tested-by: jenkins Tested-by: Maloo --- lustre/conf/lsvcgss | 3 ++- lustre/scripts/lsvcgss | 5 ++++- lustre/tests/sanity-krb5.sh | 32 ++++++++++++++++++++++++++------ lustre/utils/gss/l_getauth.c | 44 ++++++++++++++++++++++++++++++++++++-------- 4 files changed, 68 insertions(+), 16 deletions(-) diff --git a/lustre/conf/lsvcgss b/lustre/conf/lsvcgss index 9f930e0..658a7c0 100644 --- a/lustre/conf/lsvcgss +++ b/lustre/conf/lsvcgss @@ -1,2 +1,3 @@ # Optional arguments passed to lsvcgssd. -LSVCGSSDARGS='' + +#LSVCGSSDARGS='' diff --git a/lustre/scripts/lsvcgss b/lustre/scripts/lsvcgss index 46fcfc9..0f294a2 100755 --- a/lustre/scripts/lsvcgss +++ b/lustre/scripts/lsvcgss @@ -10,9 +10,12 @@ . /etc/init.d/functions LOCKFILE="/var/lock/subsys/lsvcgssd" + +# If service is not configured, launch with all mechs # -k -- Enable kerberos support # -s -- Enable shared key support -LSVCGSSDARGS="-k -s" +# -z -- Enable gssnull support +LSVCGSSDARGS="-k -s -z" # Check for and source configuration file [ -f /etc/sysconfig/lsvcgss ] && . /etc/sysconfig/lsvcgss diff --git a/lustre/tests/sanity-krb5.sh b/lustre/tests/sanity-krb5.sh index 3746b4e..e008ad0 100755 --- a/lustre/tests/sanity-krb5.sh +++ b/lustre/tests/sanity-krb5.sh @@ -362,6 +362,7 @@ run_test 4 "lgssd dead, operations should wait timeout and fail" test_5() { local file1=$DIR/$tdir/$tfile-1 local file2=$DIR/$tdir/$tfile-2 + local file3=$DIR/$tdir/$tfile-3 local wait_time=$((TIMEOUT + TIMEOUT / 2)) mkdir $DIR/$tdir || error "mkdir $DIR/$tdir failed" @@ -372,25 +373,44 @@ test_5() { [ -f $file1 ] || error "$file1 not found" # flush context - $RUNAS $LFS flushctx $MOUNT || error "can't flush context" + $RUNAS $LFS flushctx $MOUNT || error "can't flush context (1)" # stop lsvcgssd send_sigint $(comma_list $(mdts_nodes)) $LSVCGSSD sleep 5 check_gss_daemon_nodes $(comma_list $(mdts_nodes)) $LSVCGSSD && - error "$LSVCGSSD still running" + error "$LSVCGSSD still running (1)" + + # daemon should restart automatically, at least on newer servers + $RUNAS touch $file2 + if [ $? -ne 0 ]; then + echo "$RUNAS touch $file2 failed" + (( MDS1_VERSION < $(version_code 2.15.61) )) || + error "$LSVCGSSD should restart automatically" + else + echo "$RUNAS touch $file2 succeeded" + fi + + # flush context + if (( MDS1_VERSION >= $(version_code 2.15.61) )); then + $RUNAS $LFS flushctx $MOUNT || error "can't flush context (2)" + fi - $RUNAS touch $file2 && error "should fail without $LSVCGSSD" + # stop lsvcgssd + send_sigint $(comma_list $(mdts_nodes)) $LSVCGSSD + sleep 5 + check_gss_daemon_nodes $(comma_list $(mdts_nodes)) $LSVCGSSD && + error "$LSVCGSSD still running (2)" # restart lsvcgssd, expect touch succeed echo "restart $LSVCGSSD and recovering" start_gss_daemons $(comma_list $(mdts_nodes)) $LSVCGSSD "-vvv" sleep 5 check_gss_daemon_nodes $(comma_list $(mdts_nodes)) $LSVCGSSD - $RUNAS touch $file2 || error "should not fail now" - [ -f $file2 ] || error "$file2 not found" + $RUNAS touch $file3 || error "should not fail now" + [ -f $file3 ] || error "$file3 not found" } -run_test 5 "lsvcgssd dead, operations fail" +run_test 5 "lsvcgssd dead, operations pass" test_6() { local nfile=10 diff --git a/lustre/utils/gss/l_getauth.c b/lustre/utils/gss/l_getauth.c index d1fecbe..f20be2e 100644 --- a/lustre/utils/gss/l_getauth.c +++ b/lustre/utils/gss/l_getauth.c @@ -13,6 +13,23 @@ #include "lsupport.h" #include "err_util.h" +static int start_daemon(char *auth_req) +{ + int rc; + + rc = system("/usr/bin/systemctl restart lsvcgss"); + if (rc < 0 || (errno = WEXITSTATUS(rc))) { + printerr(LL_ERR, "systemctl restart lsvcgss service failed: %s\n", + strerror(errno)); + rc = -errno; + } else { + printerr(LL_INFO, "lsvcgss service automatically restarted\n"); + rc = 0; + } + + return rc; +} + int main(int argc, char **argv) { int local_socket; @@ -20,7 +37,7 @@ int main(int argc, char **argv) ssize_t bytes_sent; char *auth_req = NULL, *cachename = NULL; ssize_t req_len; - int opt, debug = 0, rc = 0; + int opt, debug = 0, tried_daemon = 0, rc = 0; /* Parameters received from kernel (see rsi_do_upcall()): * -c -r -d @@ -44,7 +61,7 @@ int main(int argc, char **argv) break; case 'd': debug = 1; - goto connect; + goto socket; case 'r': auth_req = optarg; break; @@ -72,24 +89,35 @@ int main(int argc, char **argv) req_len = strlen(auth_req); -connect: +socket: /* Send auth request to lsvcgssd via a socket. */ local_socket = socket(AF_UNIX, SOCK_STREAM, 0); if (local_socket == -1) { - rc = -errno; - printerr(LL_ERR, "cannot create socket: %d\n", rc); - return rc; + printerr(LL_ERR, "cannot create socket: %d\n", -errno); + return EXIT_FAILURE; } memset(&addr, 0, sizeof(addr)); addr.sun_family = AF_UNIX; strncpy(addr.sun_path, GSS_SOCKET_PATH, sizeof(addr.sun_path) - 1); +connect: if (connect(local_socket, (struct sockaddr *)&addr, sizeof(addr)) == -1) { rc = -errno; printerr(LL_ERR, "cannot connect to socket: %d\n", rc); - goto out; + if (debug || tried_daemon == 5) + goto out; + if (!tried_daemon) { + rc = start_daemon(auth_req); + if (rc) { + rc = -1; + goto out; + } + sleep(2); + } + tried_daemon++; + goto connect; } if (debug) @@ -107,5 +135,5 @@ connect: out: close(local_socket); - return rc; + return rc < 0 ? EXIT_FAILURE : 0; } -- 1.8.3.1