From 3d635dd3f24421c181aca5673cd81ed8f3e2c622 Mon Sep 17 00:00:00 2001 From: Sebastien Buisson Date: Thu, 8 Feb 2024 13:44:21 +0100 Subject: [PATCH] LU-17484 gss: reply error for SEC_CTX_INIT on wrong node When a server receives a SEC_CTX_INIT request for a target that is not available (either stopping, or not set up yet, or moved to a failover node), the request gets dropped. This makes the client-side RPC time out, increasing the time it takes to establish a proper gss context with the target, because it slows down the HA mechanism that tries alternate failover NIDs. Instead of dropping the request reply for SEC_CTX_INIT, the server needs to send back a proper error reply. The client will then be able to immediately try alternate failover NIDs, speeding mount/reconnect process up, and avoiding potential eviction. Test-Parameters: trivial Test-Parameters: kerberos=true testlist=sanity-krb5 Test-Parameters: testgroup=review-dne-selinux-ssk-part-2 Signed-off-by: Sebastien Buisson Change-Id: Id2cefaa7d54729a63c7be13b65d7ace579bcaa78 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53970 Reviewed-by: Aurelien Degremont Reviewed-by: Andreas Dilger Reviewed-by: Oleg Drokin Tested-by: jenkins Tested-by: Maloo --- lustre/ldlm/ldlm_lib.c | 5 ++- lustre/ptlrpc/gss/lproc_gss.c | 7 +++ lustre/ptlrpc/gss/sec_gss.c | 99 +++++++++++++++++++++++++++-------------- lustre/ptlrpc/import.c | 1 + lustre/utils/gss/svcgssd_proc.c | 4 +- 5 files changed, 79 insertions(+), 37 deletions(-) diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index c0793b5..b2dd5d5 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -1115,8 +1115,9 @@ int target_handle_connect(struct ptlrpc_request *req) if (!target) { deuuidify(str, NULL, &target_start, &target_len); LCONSOLE_ERROR_MSG(0x137, - "%s: not available for connect from %s (no target). If you are running an HA pair check that the target is mounted on the other server.\n", - str, libcfs_nidstr(&req->rq_peer.nid)); + "%.*s: not available for connect from %s (no target). If you are running an HA pair check that the target is mounted on the other server.\n", + target_len, target_start, + libcfs_nidstr(&req->rq_peer.nid)); GOTO(out, rc = -ENODEV); } diff --git a/lustre/ptlrpc/gss/lproc_gss.c b/lustre/ptlrpc/gss/lproc_gss.c index 1fe6b0d..593b5d1 100644 --- a/lustre/ptlrpc/gss/lproc_gss.c +++ b/lustre/ptlrpc/gss/lproc_gss.c @@ -271,6 +271,13 @@ again: rc = upcall_cache_downcall(rsicache, param->sid_err, param->sid_hash, param); + /* The caller, i.e. the userspace process writing to rsi_info, only + * needs to know about invalid values. Other errors are processed + * directly in the kernel. + */ + if (rc != -EINVAL) + rc = 0; + out: if (param != NULL) OBD_FREE_LARGE(param, size); diff --git a/lustre/ptlrpc/gss/sec_gss.c b/lustre/ptlrpc/gss/sec_gss.c index 5c88883..0c302fb 100644 --- a/lustre/ptlrpc/gss/sec_gss.c +++ b/lustre/ptlrpc/gss/sec_gss.c @@ -1938,35 +1938,39 @@ int gss_pack_err_notify(struct ptlrpc_request *req, __u32 major, __u32 minor) static int gss_svc_handle_init(struct ptlrpc_request *req, struct gss_wire_ctx *gw) { - struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); - struct lustre_msg *reqbuf = req->rq_reqbuf; - struct obd_uuid *uuid; - struct obd_device *target; - rawobj_t uuid_obj, rvs_hdl, in_token; - __u32 lustre_svc; - __u32 *secdata, seclen; - int swabbed, rc; - ENTRY; + struct gss_svc_reqctx *grctx = gss_svc_ctx2reqctx(req->rq_svc_ctx); + struct lustre_msg *reqbuf = req->rq_reqbuf; + struct obd_uuid *uuid; + struct obd_device *target; + rawobj_t uuid_obj, rvs_hdl, in_token; + __u32 lustre_svc; + __u32 *secdata, seclen; + int swabbed, rc; + ENTRY; CDEBUG(D_SEC, "processing gss init(%d) request from %s\n", gw->gw_proc, libcfs_nidstr(&req->rq_peer.nid)); req->rq_ctx_init = 1; if (gw->gw_flags & LUSTRE_GSS_PACK_BULK) { - CERROR("unexpected bulk flag\n"); - RETURN(SECSVC_DROP); + rc = SECSVC_DROP; + CDEBUG(D_SEC, "unexpected bulk flag: rc = %d\n", rc); + RETURN(rc); } if (gw->gw_proc == PTLRPC_GSS_PROC_INIT && gw->gw_handle.len != 0) { - CERROR("proc %u: invalid handle length %u\n", - gw->gw_proc, gw->gw_handle.len); - RETURN(SECSVC_DROP); + rc = SECSVC_DROP; + CDEBUG(D_SEC, "proc %u: invalid handle length %u: rc = %d\n", + gw->gw_proc, gw->gw_handle.len, rc); + RETURN(rc); } if (reqbuf->lm_bufcount < 3 || reqbuf->lm_bufcount > 4) { - CERROR("Invalid bufcount %d\n", reqbuf->lm_bufcount); - RETURN(SECSVC_DROP); + rc = SECSVC_DROP; + CDEBUG(D_SEC, "Invalid bufcount %d: rc = %d\n", + reqbuf->lm_bufcount, rc); + RETURN(rc); } swabbed = req_capsule_req_need_swab(&req->rq_pill); @@ -1976,8 +1980,9 @@ int gss_svc_handle_init(struct ptlrpc_request *req, struct gss_wire_ctx *gw) seclen = reqbuf->lm_buflens[reqbuf->lm_bufcount - 1]; if (seclen < 4 + 4) { - CERROR("sec size %d too small\n", seclen); - RETURN(SECSVC_DROP); + rc = SECSVC_DROP; + CDEBUG(D_SEC, "sec size %d too small: rc = %d\n", seclen, rc); + RETURN(rc); } /* lustre svc type */ @@ -1988,30 +1993,48 @@ int gss_svc_handle_init(struct ptlrpc_request *req, struct gss_wire_ctx *gw) * because touched internal structure of obd_uuid */ if (rawobj_extract(&uuid_obj, &secdata, &seclen)) { - CERROR("failed to extract target uuid\n"); - RETURN(SECSVC_DROP); + rc = SECSVC_DROP; + CDEBUG(D_SEC, "failed to extract target uuid: rc = %d\n", rc); + RETURN(rc); } uuid_obj.data[uuid_obj.len - 1] = '\0'; uuid = (struct obd_uuid *) uuid_obj.data; target = class_uuid2obd(uuid); if (!target || target->obd_stopping || !target->obd_set_up) { - CERROR("target '%s' is not available for context init (%s)\n", - uuid->uuid, target == NULL ? "no target" : - (target->obd_stopping ? "stopping" : "not set up")); - RETURN(SECSVC_DROP); + char *target_start; + int target_len; + + if (gss_pack_err_notify(req, GSS_S_NO_CONTEXT, 0) == 0) + rc = SECSVC_COMPLETE; + else + rc = SECSVC_DROP; + + deuuidify(uuid->uuid, NULL, &target_start, &target_len); + LCONSOLE_ERROR("%.*s: not available for GSS context init from %s (%s).\n", + target_len, target_start, + libcfs_nidstr(&req->rq_peer.nid), + target ? + (target->obd_stopping ? + "stopping" : "not set up") : + "no target"); + RETURN(rc); } /* extract reverse handle */ if (rawobj_extract(&rvs_hdl, &secdata, &seclen)) { - CERROR("failed extract reverse handle\n"); - RETURN(SECSVC_DROP); + rc = SECSVC_DROP; + CDEBUG(D_SEC, "%s: failed extract reverse handle: rc = %d\n", + target->obd_name, rc); + RETURN(rc); } /* extract token */ if (rawobj_extract(&in_token, &secdata, &seclen)) { - CERROR("can't extract token\n"); - RETURN(SECSVC_DROP); + rc = SECSVC_DROP; + CDEBUG(D_SEC, "%s: can't extract token: rc = %d\n", + target->obd_name, rc); + RETURN(rc); } rc = gss_svc_upcall_handle_init(req, grctx, gw, target, lustre_svc, @@ -2021,24 +2044,32 @@ int gss_svc_handle_init(struct ptlrpc_request *req, struct gss_wire_ctx *gw) if (grctx->src_ctx->gsc_usr_mds || grctx->src_ctx->gsc_usr_oss || grctx->src_ctx->gsc_usr_root) - CDEBUG(D_SEC, "create svc ctx %p: user from %s authenticated as %s\n", + CDEBUG(D_SEC, + "%s: create svc ctx %p: user from %s authenticated as %s\n", + target->obd_name, grctx->src_ctx, libcfs_nidstr(&req->rq_peer.nid), grctx->src_ctx->gsc_usr_root ? "root" : (grctx->src_ctx->gsc_usr_mds ? "mds" : (grctx->src_ctx->gsc_usr_oss ? "oss" : "null"))); else - CDEBUG(D_SEC, "create svc ctx %p: accept user %u from %s\n", + CDEBUG(D_SEC, "%s: create svc ctx %p: accept user %u from %s\n", + target->obd_name, grctx->src_ctx, grctx->src_ctx->gsc_uid, libcfs_nidstr(&req->rq_peer.nid)); if (gw->gw_flags & LUSTRE_GSS_PACK_USER) { if (reqbuf->lm_bufcount < 4) { - CERROR("missing user descriptor\n"); - RETURN(SECSVC_DROP); + rc = SECSVC_DROP; + CDEBUG(D_SEC, "%s: missing user descriptor: rc = %d\n", + target->obd_name, rc); + RETURN(rc); } if (sptlrpc_unpack_user_desc(reqbuf, 2, swabbed)) { - CERROR("Mal-formed user descriptor\n"); - RETURN(SECSVC_DROP); + rc = SECSVC_DROP; + CDEBUG(D_SEC, + "%s: Mal-formed user descriptor: rc = %d\n", + target->obd_name, rc); + RETURN(rc); } req->rq_pack_udesc = 1; diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index cb27a1e..c8432d1 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -148,6 +148,7 @@ void deuuidify(char *uuid, const char *prefix, char **uuid_start, int *uuid_len) UUID_STR, strlen(UUID_STR))) *uuid_len -= strlen(UUID_STR); } +EXPORT_SYMBOL(deuuidify); /* Must be called with imp_lock held! */ static void ptlrpc_deactivate_import_nolock(struct obd_import *imp) diff --git a/lustre/utils/gss/svcgssd_proc.c b/lustre/utils/gss/svcgssd_proc.c index 3e6f001..517683e 100644 --- a/lustre/utils/gss/svcgssd_proc.c +++ b/lustre/utils/gss/svcgssd_proc.c @@ -250,8 +250,9 @@ static int send_response(int auth_res, __u64 hash, rc = -errno; printerr(LL_ERR, "ERROR: %s failed: %s\n", __func__, strerror(-rc)); + } else { + printerr(LL_DEBUG, "response written ok\n"); } - printerr(LL_DEBUG, "response written ok\n"); close(fd); out_path: @@ -1069,6 +1070,7 @@ int handle_channel_request(int fd) lustre_mech); out_err: + printerr(LL_INFO, "to send response with rc=%d\n", rc ? -EACCES : 0); /* Failures send a null token */ rc = send_response(rc, hash, &snd.in_handle, &snd.in_tok, snd.maj_stat, snd.min_stat, -- 1.8.3.1