From: Sergey Cheremencev
Date: Mon, 15 Apr 2024 20:35:56 +0000 (+0300)
Subject: LU-17770 quota: don't panic in qmt_map_lge_idx
X-Git-Tag: 2.16.51~167
X-Git-Url: https://git.whamcloud.com/gitweb?a=commitdiff_plain;h=1f9689d0f9;p=fs%2Flustre-release.git

LU-17770 quota: don't panic in qmt_map_lge_idx

There is a valid case when it is impossible to map an OST index to
the appropriate index in the lqe global array (lqe_gblb_array). This
might happen when newly added OSTs haven't connected to the QMT yet
and there are no corresponding index files in the quota_master/dt-0x0
directory. If such OSTs are already listed in OST pools, this can
cause the following panic:

qmt_map_lge_idx()) ASSERTION( k < lgd->lqeg_num_used ) failed:
Cannot map ostidx 32 for 000000000505fcbe
qmt_map_lge_idx()) LBUG
...
Call Trace TBD:
 libcfs_call_trace+0x6f/0xa0 [libcfs]
 lbug_with_loc+0x3f/0x70 [libcfs]
 qmt_map_lge_idx+0x7f/0x90 [lquota]
 qmt_seed_glbe_all+0x17f/0x770 [lquota]
 qmt_revalidate_lqes+0x213/0x360 [lquota]
 qmt_dqacq0+0x7d5/0x2320 [lquota]
 qmt_intent_policy+0x8d2/0xf10 [lquota]
 mdt_intent_opc+0x9a9/0xa80 [mdt]
 mdt_intent_policy+0x1fd/0x390 [mdt]
 ldlm_lock_enqueue+0x469/0xa90 [ptlrpc]
 ldlm_handle_enqueue0+0x61a/0x16c0 [ptlrpc]
 tgt_enqueue+0xa4/0x200 [ptlrpc]
 tgt_request_handle+0xc9c/0x1950 [ptlrpc]
 ptlrpc_server_handle_request+0x323/0xbd0 [ptlrpc]
 ptlrpc_main+0xbf1/0x1510 [ptlrpc]
 kthread+0x134/0x150
 ret_from_fork+0x1f/0x40
Kernel panic - not syncing: LBUG

Add sanity-quota test_91. It removes quota slave index files from
quota_master/dt-0x0 and lets them be created again to simulate
adding new OSTs to a system.

Signed-off-by: Sergey Cheremencev
Change-Id: I747366af736d408a8965812b48660cca1367becb
Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/55476
Tested-by: jenkins
Tested-by: Maloo
Reviewed-by: Andreas Dilger
Reviewed-by: Alexander Boyko
Reviewed-by: Oleg Drokin
---

diff --git a/lustre/quota/qmt_entry.c b/lustre/quota/qmt_entry.c
index d67994e..4a45570 100644
--- a/lustre/quota/qmt_entry.c
+++ b/lustre/quota/qmt_entry.c
@@ -882,6 +882,7 @@ bool qmt_adjust_edquot_qunit_notify(const struct lu_env *env,
         } else if (idx >= 0) {
             int lge_idx = qmt_map_lge_idx(lgd, idx);

+            LASSERT(lge_idx >= 0);
             /* If there are no locks yet when
              * lge_qunit/edquot_nu is set, slaves
              * are still not notified with new
@@ -1149,6 +1150,8 @@ int qmt_map_lge_idx(struct lqe_glbl_data *lgd, int ostidx)
 {
     int k;

+    CDEBUG(D_QUOTA, "mapping ostidx %d num_used %d\n", ostidx,
+           lgd->lqeg_num_used);
     /* check common case of sequential OST numbers first */
     if (ostidx < lgd->lqeg_num_used &&
         lgd->lqeg_arr[ostidx].lge_idx == ostidx)
@@ -1158,8 +1161,12 @@ int qmt_map_lge_idx(struct lqe_glbl_data *lgd, int ostidx)
         if (lgd->lqeg_arr[k].lge_idx == ostidx)
             break;

-    LASSERTF(k < lgd->lqeg_num_used, "Cannot map ostidx %d for %p\n",
-         ostidx, lgd);
+    if (k >= lgd->lqeg_num_used) {
+        CERROR("qmt: cannot map ostidx %d, num_used %d: rc = %d\n",
+               ostidx, lgd->lqeg_num_used, -EINVAL);
+        return -EINVAL;
+    }
+
     return k;
 }

@@ -1211,6 +1218,17 @@ void qmt_seed_glbe_all(const struct lu_env *env, struct lqe_glbl_data *lgd,
         tgt_idx = qmt_sarr_get_idx(qpi, j);
         LASSERT(tgt_idx >= 0);
         idx = qmt_map_lge_idx(lgd, tgt_idx);
+        /* A negative index is fine here - it is possible
+         * when quota_master/dt-0x0 doesn't have index
+         * files for all OSTs yet. At the same time a
+         * quota pool may already include all OSTs from
+         * the configuration even though they haven't
+         * connected yet.
+         */
+        if (idx < 0 && !lqe->lqe_is_global)
+            continue;
+        LASSERTF(idx >= 0, "idx %d lqe_is_global %d lqe %px\n",
+             idx, lqe->lqe_is_global, lqe);
         if (edquot) {
             int lge_edquot, new_edquot, edquot_nu;

diff --git a/lustre/quota/qmt_lock.c b/lustre/quota/qmt_lock.c
index 139509d..081b82c 100644
--- a/lustre/quota/qmt_lock.c
+++ b/lustre/quota/qmt_lock.c
@@ -270,6 +270,8 @@ static bool qmt_clear_lgeg_arr_nu(struct lquota_entry *lqe, int stype, int idx)
     if (lgd) {
         int lge_idx = qmt_map_lge_idx(lgd, idx);

+        if (lge_idx < 0)
+            return false;
         lgd->lqeg_arr[lge_idx].lge_qunit_nu = 0;
         lgd->lqeg_arr[lge_idx].lge_edquot_nu = 0;
         /* We shouldn't call revoke for DOM case, it will be
@@ -299,6 +301,7 @@ static bool qmt_set_revoke(struct lu_env *env, struct lquota_entry *lqe_gl,
             int lge_idx;

             lge_idx = qmt_map_lge_idx(lgd, idx);
+            LASSERT(lge_idx >= 0);
             if (lgd->lqeg_arr[lge_idx].lge_qunit == least_qunit) {
                 struct lquota_entry *lqe;
                 int i;
@@ -667,6 +670,7 @@ static void qmt_setup_id_desc(struct ldlm_lock *lock, union ldlm_gl_desc *desc,
     lgd = lqe->lqe_glbl_data;
     if (lgd) {
         lge_idx = qmt_map_lge_idx(lgd, idx);
+        LASSERT(lge_idx >= 0);
         edquot = lgd->lqeg_arr[lge_idx].lge_edquot;
         qunit = lgd->lqeg_arr[lge_idx].lge_qunit;
     } else {
@@ -886,6 +890,7 @@ static int qmt_id_lock_cb(struct ldlm_lock *lock, struct lquota_entry *lqe)
     if (lgd) {
         int lge_idx = qmt_map_lge_idx(lgd, idx);

+        LASSERT(lge_idx >= 0);
         CDEBUG(D_QUOTA,
                "tgt idx:%d lge_idx:%d edquot_nu:%d qunit_nu:%d\n",
                idx, lge_idx, lgd->lqeg_arr[lge_idx].lge_edquot_nu,
diff --git a/lustre/tests/sanity-quota.sh b/lustre/tests/sanity-quota.sh
index c374d4c..1585a20 100755
--- a/lustre/tests/sanity-quota.sh
+++ b/lustre/tests/sanity-quota.sh
@@ -6476,6 +6476,67 @@ test_90b()
 }
 run_test 90b "lfs quota should work with multiple mount points"

+test_91()
+{
+	(( OSTCOUNT >= 2 )) || skip_env "needs >= 2 OSTs"
+	local mds_dev=$(mdsdevname 1)
+	local ost1_dev=$(ostdevname 1)
+	local ost2_dev=$(ostdevname 2)
+	local ost0_idx="quota_master/dt-0x0/0x20000-OST0000_UUID"
+	local ost1_idx="quota_master/dt-0x0/0x20000-OST0001_UUID"
+	local tstid=$(id -u $TSTUSR)
+
+	formatall
+	if ! combined_mgs_mds; then
+		start_mgs
+	fi
+	start mds1 $mds_dev $MDS_MOUNT_OPTS || error "Cannot start mds1"
+	wait_clients_import_state ${CLIENTS:-$HOSTNAME} mds1 FULL
+
+	echo "start ost1 service on `facet_active_host ost1`"
+	start ost1 $ost1_dev $OST_MOUNT_OPTS || error "Cannot start ost1"
+	wait_clients_import_ready ${CLIENTS:-$HOSTNAME} ost1
+	echo "start ost2 service on `facet_active_host ost2`"
+	start ost2 $ost2_dev $OST_MOUNT_OPTS || error "Cannot start ost2"
+	wait_clients_import_ready ${CLIENTS:-$HOSTNAME} ost2
+	echo "start client"
+	zconf_mount $HOSTNAME $MOUNT || error "mount client failed"
+
+	if [[ $PERM_CMD == *"set_param -P"* ]]; then
+		do_facet mgs $PERM_CMD \
+			set_param -P osd-*.*.quota_slave.enabled=u
+	else
+		do_facet mgs $PERM_CMD $FSNAME.quota.ost=u ||
+			error "set ost quota type failed"
+	fi
+
+	pool_add qpool1 1
+	pool_add_targets qpool1 0 1 1 1
+	$LFS setquota -u $TSTUSR -B50M $DIR || error "can't set quota"
+	wait_quota_synced ost1 OST0000 usr $tstid hardlimit $((50*1024))
+	wait_quota_synced ost2 OST0001 usr $tstid hardlimit $((50*1024))
+	echo "stop mds1"
+	stop mds1 -f || error "Can't stop mds1"
+
+	do_facet mds1 "$DEBUGFS -w -R 'rm $ost0_idx' $mds_dev" ||
+		error "removing $ost0_idx error"
+	do_facet mds1 "$DEBUGFS -w -R 'rm $ost1_idx' $mds_dev" ||
+		error "removing $ost1_idx error"
+	do_facet mds1 "$DEBUGFS -c -R 'ls -l quota_master/dt-0x0/' $mds_dev"
+
+	echo "start mds1"
+	start mds1 $mds_dev $MDS_MOUNT_OPTS || error "Cannot start mds1"
+	wait_clients_import_state ${CLIENTS:-$HOSTNAME} mds1 FULL
+
+	mkdir $DIR/$tdir || error "mkdir failed"
+	chmod 0777 $DIR/$tdir || error "chmod error"
+	$RUNAS $DD of=$DIR/$tdir/f1 bs=1M count=50
+
+	stopall
+	formatall
+	setupall
+}
+run_test 91 "new quota index files in quota_master"

 quota_fini()
 {
diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh
index 50a39b4..b53df9b 100755
--- a/lustre/tests/test-framework.sh
+++ b/lustre/tests/test-framework.sh
@@ -9055,14 +9055,17 @@ create_pool() {
 	local fsname=${1%%.*}
 	local poolname=${1##$fsname.}
 	local keep_pools=${2:-false}
+	local mdscount=${3:-$MDSCOUNT}
+	# can't pass an empty argument to destroy_test_pools
+	local dtp_fsname=${fsname:-$FSNAME}

-	stack_trap "destroy_test_pools $fsname" EXIT
+	stack_trap "destroy_test_pools $dtp_fsname $mdscount" EXIT
 	do_facet mgs lctl pool_new $1
 	local RC=$?
 	# get param should return err unless pool is created
 	[[ $RC -ne 0 ]] && return $RC

-	for mds_id in $(seq $MDSCOUNT); do
+	for ((mds_id = 1; mds_id <= $mdscount; mds_id++)); do
 		local mdt_id=$((mds_id-1))
 		local lodname=$fsname-MDT$(printf "%04x" $mdt_id)-mdtlov
 		wait_update_facet mds$mds_id \
@@ -9118,6 +9121,7 @@ destroy_pool_int() {
 destroy_pool() {
 	local fsname=${1%%.*}
 	local poolname=${1##$fsname.}
+	local mdscount=${2:-$MDSCOUNT}

 	[[ x$fsname = x$poolname ]] && fsname=$FSNAME

@@ -9128,7 +9132,7 @@ destroy_pool() {
 	destroy_pool_int $fsname.$poolname
 	RC=$?
 	[[ $RC -ne 0 ]] && return $RC

-	for mds_id in $(seq $MDSCOUNT); do
+	for ((mds_id = 1; mds_id <= $mdscount; mds_id++)); do
 		local mdt_id=$((mds_id-1))
 		local lodname=$fsname-MDT$(printf "%04x" $mdt_id)-mdtlov
 		wait_update_facet mds$mds_id \
@@ -9146,6 +9150,7 @@ destroy_pool() {

 destroy_pools () {
 	local fsname=${1:-$FSNAME}
+	local mdscount=${2:-$MDSCOUNT}
 	local poolname
 	local listvar=${fsname}_CREATED_POOLS

@@ -9153,13 +9158,15 @@ destroy_pools () {

 	echo "Destroy the created pools: ${!listvar}"
 	for poolname in ${!listvar//,/ }; do
-		destroy_pool $fsname.$poolname
+		destroy_pool $fsname.$poolname $mdscount
 	done
 }

 destroy_test_pools () {
 	local fsname=${1:-$FSNAME}
-	destroy_pools $fsname || true
+	local mdscount=${2:-$MDSCOUNT}
+
+	destroy_pools $fsname $mdscount || true
 }

 gather_logs () {
@@ -10300,10 +10307,11 @@ check_file_in_pool()
 }

 pool_add() {
-	echo "Creating new pool"
 	local pool=$1
+	local mdscount=${2:-$MDSCOUNT}

-	create_pool $FSNAME.$pool ||
+	echo "Creating new pool $pool"
+	create_pool $FSNAME.$pool false $mdscount ||
 	{ error_noexit "No pool created, result code $?"; return 1; }
 	[ $($LFS pool_list $FSNAME | grep -c "$FSNAME.${pool}\$") -eq 1 ] ||
 	{ error_noexit "$pool not in lfs pool_list"; return 2; }
@@ -10315,6 +10323,7 @@ pool_add_targets() {
 	local first=$2
 	local last=${3:-$first}
 	local step=${4:-1}
+	local mdscount=${5:-$MDSCOUNT}

 	local list=$(seq $first $step $last)

@@ -10333,7 +10342,7 @@ pool_add_targets() {
 	fi

 	# wait for OSTs to be added to the pool
-	for mds_id in $(seq $MDSCOUNT); do
+	for ((mds_id = 1; mds_id <= $mdscount; mds_id++)); do
 		local mdt_id=$((mds_id-1))
 		local lodname=$FSNAME-MDT$(printf "%04x" $mdt_id)-mdtlov
 		wait_update_facet mds$mds_id \
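
Editor's note: the sketch below is a minimal, self-contained userspace model of the
lookup contract this patch introduces. It is not the actual lquota code: struct lge_slot,
the fixed-size array, and the main() harness are hypothetical simplifications, and only
the names qmt_map_lge_idx, lqe_glbl_data, lqeg_arr, lqeg_num_used and lge_idx are taken
from the patch. It shows the pattern the callers move to: the mapper returns -EINVAL
instead of asserting when an OST has no slot yet, and the caller skips that target
rather than panicking.

/* Minimal userspace model of the fail-soft lookup introduced by this patch.
 * Names mirror the Lustre ones but the types are simplified. */
#include <errno.h>
#include <stdio.h>

struct lge_slot {
	int lge_idx;		/* OST index this slot describes */
};

struct lqe_glbl_data {
	struct lge_slot	lqeg_arr[4];
	int		lqeg_num_used;
};

/* Return the slot for ostidx, or -EINVAL if the OST has no slot yet
 * (e.g. a pool member that has not connected to the QMT). */
static int qmt_map_lge_idx(struct lqe_glbl_data *lgd, int ostidx)
{
	int k;

	/* common case: sequential OST numbering */
	if (ostidx < lgd->lqeg_num_used &&
	    lgd->lqeg_arr[ostidx].lge_idx == ostidx)
		return ostidx;

	for (k = 0; k < lgd->lqeg_num_used; k++)
		if (lgd->lqeg_arr[k].lge_idx == ostidx)
			return k;

	return -EINVAL;		/* was an LASSERTF/LBUG before the patch */
}

int main(void)
{
	struct lqe_glbl_data lgd = {
		.lqeg_arr = { { .lge_idx = 0 }, { .lge_idx = 1 } },
		.lqeg_num_used = 2,
	};
	int tgts[] = { 0, 1, 32 };	/* 32: pool member with no index file yet */

	for (unsigned int i = 0; i < sizeof(tgts) / sizeof(tgts[0]); i++) {
		int idx = qmt_map_lge_idx(&lgd, tgts[i]);

		if (idx < 0) {
			/* caller-side handling: skip instead of panicking */
			printf("ostidx %d not mapped yet, skipping\n", tgts[i]);
			continue;
		}
		printf("ostidx %d -> slot %d\n", tgts[i], idx);
	}
	return 0;
}

Note the asymmetry kept by the patch itself: pool entries simply skip an unmapped target
(if (idx < 0 && !lqe->lqe_is_global) continue;), while the global lqe still asserts,
since the global array is expected to cover every connected target.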