From 3f13f89e2f19b46a8f27ad007c10251147984875 Mon Sep 17 00:00:00 2001 From: Mikhail Pershin Date: Mon, 18 Mar 2024 18:37:02 +0300 Subject: [PATCH] LU-17649 ptlrpc: fix -EACCES connection error handling Connection errors -EACCES and -EROFS leave the import in an intermediate state. It is still active, as is the pinger over it, but it has obd_no_recov set. That allows the import to recover after all if server security is updated. But even in the FULL state any RPC over the import gets -ESHUTDOWN because obd_no_recov is set. Meanwhile, obd_no_recov is not supposed to be used in that way: it reflects a particular mount option and should never be recovered. So the patch sets the import to the deactivated state instead, making the import non-operational as well, but with the option to be activated manually or remounted. Server connections like LWP, MDT-OST and MDT-MDT are excluded and are never deactivated. Such errors are considered temporary until the remote target updates its own security as required or administrative intervention restarts the target as needed. In both cases a console message is issued. 
Signed-off-by: Mikhail Pershin Change-Id: Ib83e1b0ac541823ec236591f08145340d6f6bf04 Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/54448 Reviewed-by: Oleg Drokin Reviewed-by: Sebastien Buisson Reviewed-by: Aurelien Degremont Tested-by: jenkins Tested-by: Maloo --- lustre/include/obd_support.h | 1 + lustre/ptlrpc/import.c | 41 ++++++++++++++++++++++++++++++++++------- lustre/target/tgt_handler.c | 3 +++ lustre/tests/recovery-small.sh | 41 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 79 insertions(+), 7 deletions(-) diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 816c2f7..623215c 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -774,6 +774,7 @@ extern bool obd_enable_health_write; /* continuation of MDS related constants */ #define OBD_FAIL_MDS_PAUSE_CREATE_AFTER_LOOKUP 0x2401 +#define OBD_FAIL_MDS_CONNECT_ACCESS 0x2402 /* PLEASE, KEEP NUMBERS UP TO 0x3000 RESERVED FOR OBD_FAIL_MDS_* */ diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index 496458c..02a8adc 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -1350,13 +1350,29 @@ out: import_set_state_nolock(imp, LUSTRE_IMP_DISCON); if (rc == -EACCES || rc == -EROFS) { /* - * Give up trying to reconnect - * EACCES means client has no permission for connection - * EROFS means client must mount read-only + * -EACCES means client has no permission for connection + * -EROFS means client must mount read-only + * Client deactivates import, it can be activated back + * manually when issue is resolved + * Server keeps trying forever until reconfigured or + * unmounted */ - imp->imp_obd->obd_no_recov = 1; - ptlrpc_deactivate_import_nolock(imp); - inact = true; + LCONSOLE_WARN("%s: connection denied by %s: rc = %d\n", + imp->imp_obd->obd_name, + obd2cli_tgt(imp->imp_obd), rc); + if (imp->imp_connect_flags_orig & + (OBD_CONNECT_LIGHTWEIGHT | + OBD_CONNECT_MDS_MDS | OBD_CONNECT_MDS)) { + /* consider LWP, 
MDT-MDT and MDT-OST access + * errors as temporary + */ + rc = -EAGAIN; + imp->imp_force_reconnect = 1; + } else { + imp->imp_deactive = 1; + ptlrpc_deactivate_import_nolock(imp); + inact = true; + } } else if (rc == -EPROTO) { struct obd_connect_data *ocd; @@ -1383,6 +1399,7 @@ out: OBD_OCD_VERSION_PATCH(ocd->ocd_version), OBD_OCD_VERSION_FIX(ocd->ocd_version), LUSTRE_VERSION_STRING); + imp->imp_deactive = 1; ptlrpc_deactivate_import_nolock(imp); import_set_state_nolock(imp, LUSTRE_IMP_CLOSED); inact = true; @@ -1411,8 +1428,18 @@ out: (request->rq_deadline - request->rq_sent); spin_unlock(&imp->imp_lock); - if (inact) + if (inact) { + /* imp_deactive event */ + obd_import_event(imp->imp_obd, imp, + IMP_EVENT_DEACTIVATE); + /* imp_invalid event */ obd_import_event(imp->imp_obd, imp, IMP_EVENT_INACTIVE); + /* evict and invalidate if reconnect */ + if (!aa->pcaa_initial_connect) { + import_set_state(imp, LUSTRE_IMP_EVICTED); + ptlrpc_import_recovery_state_machine(imp); + } + } if (rc == -EPROTO) RETURN(rc); diff --git a/lustre/target/tgt_handler.c b/lustre/target/tgt_handler.c index a970d04..1d5cb42 100644 --- a/lustre/target/tgt_handler.c +++ b/lustre/target/tgt_handler.c @@ -1061,6 +1061,9 @@ int tgt_connect(struct tgt_session_info *tsi) LUSTRE_MDT_NAME) == 0) { struct lu_nodemap *nm = NULL; + if (CFS_FAIL_CHECK(OBD_FAIL_MDS_CONNECT_ACCESS)) + GOTO(out, rc = -EACCES); + rc = req_check_sepol(tsi->tsi_pill); if (rc) GOTO(out, rc); diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index 41859b9..37d70b1 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -3633,6 +3633,47 @@ test_157() } run_test 157 "eviction during mmaped i/o" +cleanup_158() { + do_facet mds2 $LCTL set_param fail_loc=0 + zconf_umount_clients $CLIENTS $MOUNT + mountcli +} + +test_158a() { + (( $MDS1_VERSION >= $(version_code 2.15.62) )) || + skip "Need MDS version at least 2.15.62 for -EACCES fix" + (( MDSCOUNT > 1 )) || skip "needs >= 2 
MDTS" + + remount_client $MOUNT + # ensure there is no MOUNT2 on clients + zconf_umount_clients $CLIENTS $MOUNT2 -f + + stack_trap cleanup_158 EXIT RETURN + + # client import is evicted after failover followed by -EACCES + #define OBD_FAIL_MDS_CONNECT_ACCESS 0x2402 + do_facet mds2 $LCTL set_param fail_loc=0x2402 + fail_nodf mds2 + sleep 5 + $LFS df $MOUNT + do_facet mds2 $LCTL set_param fail_loc=0 + wait_recovery_complete mds2 || error "MDS recovery not done" + sleep 5 + + local mdc2=$($LCTL dl | awk '/mdc.*MDT0001-mdc*/ { print $4}') + local active=$($LCTL get_param -n mdc.$mdc2.active) + (( active == 0 )) || error "import status is 'active'" + $LCTL --device $mdc2 activate + active=$($LCTL get_param -n mdc.$mdc2.active) + (( active == 1 )) || error "import status is not 'active'" + status=$($LFS check mdts | grep $mdc2 | grep -c active) + (( status == 1 )) || { + $LFS check mdts + error "import is not operational" + } +} +run_test 158a "connect without access right" + complete_test $SECONDS check_and_cleanup_lustre exit_status -- 1.8.3.1