From 67acf6047e343a0e35f077c6aed4483a14d2015c Mon Sep 17 00:00:00 2001
From: Sebastien Buisson
Date: Thu, 7 Dec 2023 18:07:09 +0100
Subject: [PATCH] LU-17317 gss: do not continue using expired reverse context

When a server uses an expired gss context to send a callback request
to a client, the associated context on the client side may already
have expired and been purged from the cache. This results in a
GSS_S_NO_CONTEXT reply. In this specific scenario, the server must
mark its reverse context as dead. This leads to destruction of the
expired context, and to creation of a new context suitable for
further callback requests.

Test-Parameters: kerberos=true testlist=sanity-krb5
Signed-off-by: Sebastien Buisson
Change-Id: I4af90cd70a3815851ec555ea85b49714c8da4202
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/53375
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: Andreas Dilger
Reviewed-by: Aurelien Degremont
Reviewed-by: Oleg Drokin
---
 lustre/ptlrpc/gss/gss_keyring.c |  32 ++++++-----
 lustre/ptlrpc/gss/sec_gss.c    | 120 +++++++++++++++++++++-------------
 2 files changed, 82 insertions(+), 70 deletions(-)

diff --git a/lustre/ptlrpc/gss/gss_keyring.c b/lustre/ptlrpc/gss/gss_keyring.c
index 5dfb73f..041b06f 100644
--- a/lustre/ptlrpc/gss/gss_keyring.c
+++ b/lustre/ptlrpc/gss/gss_keyring.c
@@ -480,26 +480,32 @@ struct ptlrpc_cli_ctx * sec_lookup_root_ctx_kr(struct ptlrpc_sec *sec)
 {
 	struct gss_sec_keyring *gsec_kr = sec2gsec_keyring(sec);
 	struct ptlrpc_cli_ctx *ctx = NULL;
+	time64_t now = ktime_get_real_seconds();
 
 	spin_lock(&sec->ps_lock);
-        ctx = gsec_kr->gsk_root_ctx;
+	ctx = gsec_kr->gsk_root_ctx;
 
-        if (ctx == NULL && unlikely(sec_is_reverse(sec))) {
+	if (ctx == NULL && unlikely(sec_is_reverse(sec))) {
 		struct ptlrpc_cli_ctx *tmp;
 
-                /* reverse ctx, search root ctx in list, choose the one
-                 * with shortest expire time, which is most possibly have
-                 * an established peer ctx at client side. */
+		/* For reverse context, browse list and pick the one with
+		 * shortest expire time and that has not expired yet.
+		 * This one is most likely to have an established peer context
+		 * on client side.
+		 */
 		hlist_for_each_entry(tmp, &gsec_kr->gsk_clist, cc_cache) {
-                        if (ctx == NULL || ctx->cc_expire == 0 ||
-                            ctx->cc_expire > tmp->cc_expire) {
-                                ctx = tmp;
-                                /* promote to be root_ctx */
-                                gsec_kr->gsk_root_ctx = ctx;
-                        }
-                }
-        }
+			if (ctx == NULL || ctx->cc_expire == 0 ||
+			    (tmp->cc_expire > now &&
+			     tmp->cc_expire < ctx->cc_expire) ||
+			    (ctx->cc_expire < now &&
+			     tmp->cc_expire > ctx->cc_expire)) {
+				ctx = tmp;
+				/* promote to be root_ctx */
+				gsec_kr->gsk_root_ctx = ctx;
+			}
+		}
+	}
 
 	if (ctx) {
 		LASSERT(atomic_read(&ctx->cc_refcount) > 0);
diff --git a/lustre/ptlrpc/gss/sec_gss.c b/lustre/ptlrpc/gss/sec_gss.c
index 392b247..28cea38d 100644
--- a/lustre/ptlrpc/gss/sec_gss.c
+++ b/lustre/ptlrpc/gss/sec_gss.c
@@ -665,69 +665,75 @@ redo:
 	RETURN(0);
 }
 
-static
-int gss_cli_ctx_handle_err_notify(struct ptlrpc_cli_ctx *ctx,
-                                  struct ptlrpc_request *req,
-                                  struct gss_header *ghdr)
-{
-        struct gss_err_header *errhdr;
-        int rc;
-
-        LASSERT(ghdr->gh_proc == PTLRPC_GSS_PROC_ERR);
-
-        errhdr = (struct gss_err_header *) ghdr;
+static int gss_cli_ctx_handle_err_notify(struct ptlrpc_cli_ctx *ctx,
+					 struct ptlrpc_request *req,
+					 struct gss_header *ghdr)
+{
+	struct gss_err_header *errhdr;
+	int rc;
+
+	LASSERT(ghdr->gh_proc == PTLRPC_GSS_PROC_ERR);
+
+	errhdr = (struct gss_err_header *) ghdr;
+
+	CWARN("%s: req x%llu/t%llu, ctx %p idx %#llx(%u->%s): %sserver respond (%08x/%08x)\n",
+	      ctx->cc_sec->ps_import->imp_obd->obd_name,
+	      req->rq_xid, req->rq_transno, ctx,
+	      gss_handle_to_u64(&ctx2gctx(ctx)->gc_handle),
+	      ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec),
+	      sec_is_reverse(ctx->cc_sec) ? "reverse " : "",
+	      errhdr->gh_major, errhdr->gh_minor);
+
+	/* context fini rpc, let it failed */
+	if (req->rq_ctx_fini) {
+		CWARN("%s: context fini rpc failed: rc = %d\n",
+		      ctx->cc_sec->ps_import->imp_obd->obd_name, -EINVAL);
+		return -EINVAL;
+	}
 
-        CWARN("req x%llu/t%llu, ctx %p idx %#llx(%u->%s): "
-              "%sserver respond (%08x/%08x)\n",
-              req->rq_xid, req->rq_transno, ctx,
-              gss_handle_to_u64(&ctx2gctx(ctx)->gc_handle),
-              ctx->cc_vcred.vc_uid, sec2target_str(ctx->cc_sec),
-              sec_is_reverse(ctx->cc_sec) ? "reverse" : "",
-              errhdr->gh_major, errhdr->gh_minor);
+	/* reverse sec, just return error, don't expire this ctx because it's
+	 * crucial to callback rpcs. note if the callback rpc failed because
+	 * of bit flip during network transfer, the client will be evicted
+	 * directly. so more gracefully we probably want let it retry for
+	 * number of times.
+	 */
+	if (sec_is_reverse(ctx->cc_sec) &&
+	    errhdr->gh_major != GSS_S_NO_CONTEXT)
+		return -EINVAL;
 
-        /* context fini rpc, let it failed */
-        if (req->rq_ctx_fini) {
-                CWARN("context fini rpc failed\n");
-                return -EINVAL;
-        }
+	if (errhdr->gh_major != GSS_S_NO_CONTEXT &&
+	    errhdr->gh_major != GSS_S_BAD_SIG)
+		return -EACCES;
 
-        /* reverse sec, just return error, don't expire this ctx because it's
-         * crucial to callback rpcs. note if the callback rpc failed because
-         * of bit flip during network transfer, the client will be evicted
-         * directly. so more gracefully we probably want let it retry for
-         * number of times. */
-        if (sec_is_reverse(ctx->cc_sec))
-                return -EINVAL;
+	/* server return NO_CONTEXT might be caused by context expire
+	 * or server reboot/failover. we try to refresh a new ctx which
+	 * be transparent to upper layer.
+	 *
+	 * In some cases, our gss handle is possible to be incidentally
+	 * identical to another handle since the handle itself is not
+	 * fully random. In krb5 case, the GSS_S_BAD_SIG will be
+	 * returned, maybe other gss error for other mechanism.
+	 *
+	 * if we add new mechanism, make sure the correct error are
+	 * returned in this case.
+	 */
+	CWARN("%s: %s might have lost the context (%s), retrying\n",
+	      ctx->cc_sec->ps_import->imp_obd->obd_name,
+	      sec_is_reverse(ctx->cc_sec) ? "client" : "server",
+	      errhdr->gh_major == GSS_S_NO_CONTEXT ? "NO_CONTEXT" : "BAD_SIG");
 
-        if (errhdr->gh_major != GSS_S_NO_CONTEXT &&
-            errhdr->gh_major != GSS_S_BAD_SIG)
-                return -EACCES;
+	sptlrpc_cli_ctx_expire(ctx);
 
-        /* server return NO_CONTEXT might be caused by context expire
-         * or server reboot/failover. we try to refresh a new ctx which
-         * be transparent to upper layer.
-         *
-         * In some cases, our gss handle is possible to be incidentally
-         * identical to another handle since the handle itself is not
-         * fully random. In krb5 case, the GSS_S_BAD_SIG will be
-         * returned, maybe other gss error for other mechanism.
-         *
-         * if we add new mechanism, make sure the correct error are
-         * returned in this case. */
-        CWARN("%s: server might lost the context, retrying\n",
-              errhdr->gh_major == GSS_S_NO_CONTEXT ? "NO_CONTEXT" : "BAD_SIG");
-
-        sptlrpc_cli_ctx_expire(ctx);
-
-        /* we need replace the ctx right here, otherwise during
-         * resent we'll hit the logic in sptlrpc_req_refresh_ctx()
-         * which keep the ctx with RESEND flag, thus we'll never
-         * get rid of this ctx. */
-        rc = sptlrpc_req_replace_dead_ctx(req);
-        if (rc == 0)
-                req->rq_resend = 1;
+	/* we need replace the ctx right here, otherwise during
+	 * resent we'll hit the logic in sptlrpc_req_refresh_ctx()
+	 * which keep the ctx with RESEND flag, thus we'll never
+	 * get rid of this ctx.
+	 */
+	rc = sptlrpc_req_replace_dead_ctx(req);
+	if (rc == 0)
+		req->rq_resend = 1;
 
-        return rc;
+	return rc;
 }
 
 int gss_cli_ctx_verify(struct ptlrpc_cli_ctx *ctx,
-- 
1.8.3.1
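
The four-clause guard in the gss_keyring.c hunk is the subtle part of this
change. The sketch below is a minimal standalone rendering of the same
selection rule, with a hypothetical simplified cli_ctx type standing in for
struct ptlrpc_cli_ctx; it is an illustration under those assumptions, not the
Lustre implementation itself.

#include <stdio.h>
#include <stddef.h>

/* Hypothetical stand-in for struct ptlrpc_cli_ctx: only the expiry
 * field matters for the rule being illustrated (0 = never expires). */
struct cli_ctx {
	long long cc_expire;
};

/* Same shape as the predicate in the patched sec_lookup_root_ctx_kr()
 * loop: replace a NULL or zero-expiry pick unconditionally; otherwise
 * prefer a candidate that is still live and expires sooner than the
 * current pick, and let a later-expiring candidate displace a pick
 * that has already expired. */
static int prefer(const struct cli_ctx *ctx, const struct cli_ctx *tmp,
		  long long now)
{
	return ctx == NULL || ctx->cc_expire == 0 ||
	       (tmp->cc_expire > now && tmp->cc_expire < ctx->cc_expire) ||
	       (ctx->cc_expire < now && tmp->cc_expire > ctx->cc_expire);
}

int main(void)
{
	/* two already-expired contexts and two live ones, with now = 100 */
	struct cli_ctx list[] = { { 50 }, { 90 }, { 150 }, { 300 } };
	const struct cli_ctx *ctx = NULL;
	long long now = 100;
	size_t i;

	for (i = 0; i < sizeof(list) / sizeof(list[0]); i++)
		if (prefer(ctx, &list[i], now))
			ctx = &list[i];

	/* prints 150: the live context with the shortest expiry wins,
	 * matching the comment in the hunk above */
	printf("selected cc_expire = %lld\n", ctx->cc_expire);
	return 0;
}

With the pre-patch rule (a bare ctx->cc_expire > tmp->cc_expire comparison),
the same walk would settle on the expired 50 entry, which is exactly the
stale-reverse-context pick this patch avoids.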