From: Wang Shilong Date: Tue, 20 Jul 2021 02:36:31 +0000 (+0800) Subject: LU-14739 quota: fix quota with root squash enabled X-Git-Tag: 2.14.56~186 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=bbfdc7c1670c92747a8f98d39e1e43dc39e59e30 LU-14739 quota: fix quota with root squash enabled This patch tries to fix several problems: 1. OSD will ignore quota if IO comes from client cache or root, however, since the following change: LU-12687 osc: consume grants for direct I/O DIO now consumes grant too, so the following check for sync IO is now wrong: (lnb[i].lnb_flags & (OBD_BRW_FROM_GRANT | OBD_BRW_SYNC)) == OBD_BRW_FROM_GRANT This was originally added to support 1.8 clients; it is going to be 2.15 now, so let's remove this broken check. 2. Server side will clear OBD_BRW_NOQUOTA if root squash is enabled, this will revert fixes from: "LU-13228 clio: mmap write when overquota" We need to separate @ci_noquota and @oi_cap_sys_resource cases, introduce a new flag OBD_BRW_SYS_RESOURCE, and extend test_75 to cover this case. 3. LU-14739 missed the case that DoM quota should be considered as well. 4. If EDQUOT is returned for root, we check the new root squash flag OBD_FL_ROOT_SQUASH from server side. If this flag is not set, we bypass quota for root, otherwise all root writes become sync writes. 5. 
Fix a leftover problem with LU-9671 for DOM Fixes: a4fbe7341baf12 ("LU-14739 quota: nodemap squashed root cannot bypass quota") Signed-off-by: Wang Shilong Signed-off-by: Wang Shilong Signed-off-by: Sebastien Buisson Change-Id: I3fd23da7d56acb5b485540333208e5d5b0b48023 Reviewed-on: https://review.whamcloud.com/44347 Tested-by: jenkins Reviewed-by: Andreas Dilger Tested-by: Maloo Reviewed-by: Oleg Drokin --- diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 6185f4e..1ac1c1a 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -239,6 +239,8 @@ struct client_obd { struct list_head cl_grant_chain; time64_t cl_grant_shrink_interval; /* seconds */ + int cl_root_squash; /* if root squash enabled*/ + /* A chunk is an optimal size used by osc_extent to determine * the extent size. A chunk is max(PAGE_SIZE, OST block size) */ int cl_chunkbits; diff --git a/lustre/include/uapi/linux/lustre/lustre_idl.h b/lustre/include/uapi/linux/lustre/lustre_idl.h index e89a89b..3cd00a0 100644 --- a/lustre/include/uapi/linux/lustre/lustre_idl.h +++ b/lustre/include/uapi/linux/lustre/lustre_idl.h @@ -1099,6 +1099,7 @@ enum obdo_flags { OBD_FL_NOSPC_BLK = 0x00100000, /* no more block space on OST */ OBD_FL_FLUSH = 0x00200000, /* flush pages on the OST */ OBD_FL_SHORT_IO = 0x00400000, /* short io request */ + OBD_FL_ROOT_SQUASH = 0x00800000, /* root squash */ /* OBD_FL_LOCAL_MASK = 0xF0000000, was local-only flags until 2.10 */ /* @@ -1380,7 +1381,7 @@ struct hsm_state_set { #define OBD_BRW_GRANTED 0x40 /* the ost manages this */ /* OBD_BRW_NOCACHE is currently neither set nor tested */ #define OBD_BRW_NOCACHE 0x80 /* this page is a part of non-cached IO */ -#define OBD_BRW_NOQUOTA 0x100 +#define OBD_BRW_NOQUOTA 0x100 /* do not enforce quota */ #define OBD_BRW_SRVLOCK 0x200 /* Client holds no lock over this page */ #define OBD_BRW_ASYNC 0x400 /* Server may delay commit to disk */ #define OBD_BRW_MEMALLOC 0x800 /* Client runs in the "kswapd" context */ @@ -1392,6 +1393,7 
@@ struct hsm_state_set { * it to sync quickly */ #define OBD_BRW_OVER_PRJQUOTA 0x8000 /* Running out of project quota */ #define OBD_BRW_RDMA_ONLY 0x20000 /* RPC contains RDMA-only pages*/ +#define OBD_BRW_SYS_RESOURCE 0x40000 /* page has CAP_SYS_RESOURCE */ #define OBD_BRW_OVER_ALLQUOTA (OBD_BRW_OVER_USRQUOTA | \ OBD_BRW_OVER_GRPQUOTA | \ diff --git a/lustre/mdt/mdt_io.c b/lustre/mdt/mdt_io.c index a368d74..2984d57 100644 --- a/lustre/mdt/mdt_io.c +++ b/lustre/mdt/mdt_io.c @@ -32,6 +32,7 @@ #include #include +#include #include "mdt_internal.h" @@ -760,10 +761,35 @@ int mdt_obd_commitrw(const struct lu_env *env, int cmd, struct obd_export *exp, struct lu_attr *la = &info->mti_attr.ma_attr; __u64 valid; int rc = 0; + int root_squash = 0; LASSERT(mo); if (cmd == OBD_BRW_WRITE) { + struct lu_nodemap *nodemap; + __u32 mapped_uid, mapped_gid; + + nodemap = nodemap_get_from_exp(exp); + if (IS_ERR(nodemap)) + RETURN(PTR_ERR(nodemap)); + mapped_uid = nodemap_map_id(nodemap, NODEMAP_UID, + NODEMAP_FS_TO_CLIENT, + oa->o_uid); + mapped_gid = nodemap_map_id(nodemap, NODEMAP_GID, + NODEMAP_FS_TO_CLIENT, + oa->o_gid); + if (!IS_ERR_OR_NULL(nodemap)) { + /* do not bypass quota enforcement if squashed uid */ + if (unlikely(mapped_uid == nodemap->nm_squash_uid)) { + int idx; + + for (idx = 0; idx < npages; idx++) + lnb[idx].lnb_flags &= + ~OBD_BRW_SYS_RESOURCE; + root_squash = 1; + } + nodemap_putref(nodemap); + } /* Don't update timestamps if this write is older than a * setattr which modifies the timestamps. b=10150 */ @@ -812,9 +838,18 @@ int mdt_obd_commitrw(const struct lu_env *env, int cmd, struct obd_export *exp, oa->o_flags = OBD_FL_NO_PRJQUOTA; } + if (root_squash) + oa->o_flags |= OBD_FL_ROOT_SQUASH; + oa->o_valid |= OBD_MD_FLFLAGS | OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA | OBD_MD_FLPRJQUOTA; } + /* Convert back to client IDs. LU-9671. + * nodemap_get_from_exp() may fail due to nodemap deactivated, + * server ID will be returned back to client in that case. 
+ */ + oa->o_uid = mapped_uid; + oa->o_gid = mapped_gid; } else if (cmd == OBD_BRW_READ) { /* If oa != NULL then mdt_preprw_read updated the inode * atime and we should update the lvb so that other glimpses diff --git a/lustre/ofd/ofd_io.c b/lustre/ofd/ofd_io.c index c9811ff..ffb8a26 100644 --- a/lustre/ofd/ofd_io.c +++ b/lustre/ofd/ofd_io.c @@ -1456,6 +1456,7 @@ int ofd_commitrw(const struct lu_env *env, int cmd, struct obd_export *exp, struct ldlm_resource *rs = NULL; __u64 valid; int rc = 0; + int root_squash = 0; LASSERT(npages > 0); @@ -1464,6 +1465,8 @@ int ofd_commitrw(const struct lu_env *env, int cmd, struct obd_export *exp, __u32 mapped_uid, mapped_gid; nodemap = nodemap_get_from_exp(exp); + if (IS_ERR(nodemap)) + RETURN(PTR_ERR(nodemap)); mapped_uid = nodemap_map_id(nodemap, NODEMAP_UID, NODEMAP_FS_TO_CLIENT, oa->o_uid); @@ -1477,7 +1480,9 @@ int ofd_commitrw(const struct lu_env *env, int cmd, struct obd_export *exp, int idx; for (idx = 0; idx < npages; idx++) - lnb[idx].lnb_flags &= ~OBD_BRW_NOQUOTA; + lnb[idx].lnb_flags &= + ~OBD_BRW_SYS_RESOURCE; + root_squash = 1; } nodemap_putref(nodemap); } @@ -1521,6 +1526,9 @@ int ofd_commitrw(const struct lu_env *env, int cmd, struct obd_export *exp, oa->o_flags = OBD_FL_NO_PRJQUOTA; } + if (root_squash) + oa->o_flags |= OBD_FL_ROOT_SQUASH; + oa->o_valid |= OBD_MD_FLFLAGS; oa->o_valid |= OBD_MD_FLALLQUOTA; } diff --git a/lustre/osc/osc_cache.c b/lustre/osc/osc_cache.c index 3255696..b8fa6da 100644 --- a/lustre/osc/osc_cache.c +++ b/lustre/osc/osc_cache.c @@ -2304,11 +2304,16 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, /* Set the OBD_BRW_SRVLOCK before the page is queued. */ brw_flags |= ops->ops_srvlock ? 
OBD_BRW_SRVLOCK : 0; - if (oio->oi_cap_sys_resource || io->ci_noquota) { + if (io->ci_noquota) { brw_flags |= OBD_BRW_NOQUOTA; cmd |= OBD_BRW_NOQUOTA; } + if (oio->oi_cap_sys_resource) { + brw_flags |= OBD_BRW_SYS_RESOURCE; + cmd |= OBD_BRW_SYS_RESOURCE; + } + /* check if the file's owner/group is over quota */ if (!io->ci_noquota) { struct cl_object *obj; @@ -2325,8 +2330,20 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io, qid[USRQUOTA] = attr->cat_uid; qid[GRPQUOTA] = attr->cat_gid; qid[PRJQUOTA] = attr->cat_projid; - if (rc == 0 && osc_quota_chkdq(cli, qid) == -EDQUOT) - rc = -EDQUOT; + /* + * if EDQUOT returned for root, we double check + * if root squash enabled or not updated from server side. + * without root squash, we should bypass quota for root. + */ + if (rc == 0 && osc_quota_chkdq(cli, qid) == -EDQUOT) { + if (oio->oi_cap_sys_resource && + !cli->cl_root_squash) { + io->ci_noquota = 1; + rc = 0; + } else { + rc = -EDQUOT; + } + } if (rc) RETURN(rc); } diff --git a/lustre/osc/osc_page.c b/lustre/osc/osc_page.c index 789419a..3217a1e 100644 --- a/lustre/osc/osc_page.c +++ b/lustre/osc/osc_page.c @@ -317,8 +317,8 @@ void osc_page_submit(const struct lu_env *env, struct osc_page *opg, oap->oap_brw_flags = OBD_BRW_SYNC | brw_flags; if (oio->oi_cap_sys_resource) { - oap->oap_brw_flags |= OBD_BRW_NOQUOTA; - oap->oap_cmd |= OBD_BRW_NOQUOTA; + oap->oap_brw_flags |= OBD_BRW_SYS_RESOURCE; + oap->oap_cmd |= OBD_BRW_SYS_RESOURCE; } opg->ops_submit_time = submit_time; diff --git a/lustre/osc/osc_quota.c b/lustre/osc/osc_quota.c index 4320ae4..0f07952 100644 --- a/lustre/osc/osc_quota.c +++ b/lustre/osc/osc_quota.c @@ -108,6 +108,7 @@ int osc_quota_setdq(struct client_obd *cli, __u64 xid, const unsigned int qid[], RETURN(0); mutex_lock(&cli->cl_quota_mutex); + cli->cl_root_squash = !!(flags & OBD_FL_ROOT_SQUASH); /* still mark the quots is running out for the old request, because it * could be processed after the new request at OST, the side 
effect is * the following request will be processed synchronously, but it will diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 4ca9be0..68da594 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -1158,7 +1158,8 @@ static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2) if (p1->flag != p2->flag) { unsigned mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_NOCACHE | OBD_BRW_SYNC | OBD_BRW_ASYNC | - OBD_BRW_NOQUOTA | OBD_BRW_SOFT_SYNC); + OBD_BRW_NOQUOTA | OBD_BRW_SOFT_SYNC | + OBD_BRW_SYS_RESOURCE); /* warn if we try to combine flags that we don't know to be * safe to combine */ @@ -1974,8 +1975,8 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc) "setdq for [%u %u %u] with valid %#llx, flags %x\n", body->oa.o_uid, body->oa.o_gid, body->oa.o_projid, body->oa.o_valid, body->oa.o_flags); - osc_quota_setdq(cli, req->rq_xid, qid, body->oa.o_valid, - body->oa.o_flags); + osc_quota_setdq(cli, req->rq_xid, qid, body->oa.o_valid, + body->oa.o_flags); } osc_update_grant(cli, body); @@ -3512,6 +3513,7 @@ int osc_setup_common(struct obd_device *obd, struct lustre_cfg *lcfg) GOTO(out_ptlrpcd_work, rc); cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL; + cli->cl_root_squash = 0; osc_update_next_shrink(cli); RETURN(rc); diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c index ef428ee..4eb6c649 100644 --- a/lustre/osd-ldiskfs/osd_io.c +++ b/lustre/osd-ldiskfs/osd_io.c @@ -1408,16 +1408,12 @@ static int osd_declare_write_commit(const struct lu_env *env, /* ignore quota for the whole request if any page is from * client cache or written by root. * - * XXX once we drop the 1.8 client support, the checking - * for whether page is from cache can be simplified as: - * !(lnb[i].flags & OBD_BRW_SYNC) - * * XXX we could handle this on per-lnb basis as done by * grant. 
*/ if ((lnb[i].lnb_flags & OBD_BRW_NOQUOTA) || - (lnb[i].lnb_flags & (OBD_BRW_FROM_GRANT | OBD_BRW_SYNC)) == - OBD_BRW_FROM_GRANT) + (lnb[i].lnb_flags & OBD_BRW_SYS_RESOURCE) || + !(lnb[i].lnb_flags & OBD_BRW_SYNC)) declare_flags |= OSD_QID_FORCE; /* diff --git a/lustre/osd-zfs/osd_io.c b/lustre/osd-zfs/osd_io.c index a5dbac4..ea51a91 100644 --- a/lustre/osd-zfs/osd_io.c +++ b/lustre/osd-zfs/osd_io.c @@ -664,15 +664,11 @@ static int osd_declare_write_commit(const struct lu_env *env, /* ignore quota for the whole request if any page is from * client cache or written by root. * - * XXX once we drop the 1.8 client support, the checking - * for whether page is from cache can be simplified as: - * !(lnb[i].flags & OBD_BRW_SYNC) - * * XXX we could handle this on per-lnb basis as done by * grant. */ if ((lnb[i].lnb_flags & OBD_BRW_NOQUOTA) || - (lnb[i].lnb_flags & (OBD_BRW_FROM_GRANT | OBD_BRW_SYNC)) == - OBD_BRW_FROM_GRANT) + (lnb[i].lnb_flags & OBD_BRW_SYS_RESOURCE) || + !(lnb[i].lnb_flags & OBD_BRW_SYNC)) declare_flags |= OSD_QID_FORCE; if (size == 0) { diff --git a/lustre/ptlrpc/wiretest.c b/lustre/ptlrpc/wiretest.c index 5d9351d..93d4fe2 100644 --- a/lustre/ptlrpc/wiretest.c +++ b/lustre/ptlrpc/wiretest.c @@ -2346,6 +2346,8 @@ void lustre_assert_wire_constants(void) OBD_BRW_OVER_PRJQUOTA); LASSERTF(OBD_BRW_RDMA_ONLY == 0x20000, "found 0x%.8x\n", OBD_BRW_RDMA_ONLY); + LASSERTF(OBD_BRW_SYS_RESOURCE == 0x40000, "found 0x%.8x\n", + OBD_BRW_SYS_RESOURCE); /* Checks for struct ost_body */ LASSERTF((int)sizeof(struct ost_body) == 208, "found %lld\n", diff --git a/lustre/tests/sanity-quota.sh b/lustre/tests/sanity-quota.sh index 11cdb5f..596442e 100755 --- a/lustre/tests/sanity-quota.sh +++ b/lustre/tests/sanity-quota.sh @@ -4979,27 +4979,96 @@ function cleanup_quota_test_75() cleanup_quota_test } +test_dom_75() { + local dd_failed=false + local LIMIT=20480 #20M + local qid=$TSTID + + for ((i = 0; i < $((LIMIT/2048-1)); i++)); do + $DD of=$DIR/$tdir_dom/$tfile-$i count=1 \ + 
oflag=sync || dd_failed=true + done + + $dd_failed && quota_error u $qid "write failed, expect succeed (1)" + + for ((i = $((LIMIT/2048-1)); i < $((LIMIT/1024 + 10)); i++)); do + $DD of=$DIR/$tdir_dom/$tfile-$i count=1 \ + oflag=sync || dd_failed=true + done + + $dd_failed || quota_error u $qid "write succeed, expect EDQUOT (1)" + + rm -f $DIR/$tdir_dom/* + + # flush cache, ensure noquota flag is set on client + cancel_lru_locks + sync; sync_all_data || true + + dd_failed=false + + $DD of=$DIR/$tdir/file count=$((LIMIT/2048-1)) oflag=sync || + quota_error u $qid "write failed, expect succeed (2)" + + for ((i = 0; i < $((LIMIT/2048 + 10)); i++)); do + $DD of=$DIR/$tdir_dom/$tfile-$i count=1 \ + oflag=sync || dd_failed=true + done + + $dd_failed || quota_error u $TSTID "write succeed, expect EDQUOT (2)" + + rm -f $DIR/$tdir/* + rm -f $DIR/$tdir_dom/* + + # flush cache, ensure noquota flag is set on client + cancel_lru_locks + sync; sync_all_data || true + + dd_failed=false + + for ((i = 0; i < $((LIMIT/2048-1)); i++)); do + $DD of=$DIR/$tdir_dom/$tfile-$i count=1 \ + oflag=sync || dd_failed=true + done + + $dd_failed && quota_error u $qid "write failed, expect succeed (3)" + + $DD of=$DIR/$tdir/file count=$((LIMIT/2048 + 10)) oflag=sync && + quota_error u $qid "write succeed, expect EDQUOT (3)" + true +} + test_75() { - local limit=10 # MB + local soft_limit=10 # MB + local hard_limit=20 # MB + local limit=$soft_limit local testfile="$DIR/$tdir/$tfile-0" + local grace=20 # seconds + local tdir_dom=${tdir}_dom + + if [ $(facet_fstype $SINGLEMDS) = "zfs" ]; then + grace=60 + fi setup_quota_test || error "setup quota failed with $?" 
stack_trap cleanup_quota_test_75 EXIT # enable ost quota set_ost_qtype $QTYPE || error "enable ost quota failed" + set_mdt_qtype $QTYPE || error "enable mdt quota failed" - # test for user - log "User $TSTUSR quota block hardlimit:$limit MB" - $LFS setquota -u $TSTID -b 0 -B ${limit}M -i 0 -I 0 $DIR || - error "set user quota failed" - - # make sure the system is clean local used=$(getquota -u $TSTID global curspace) - [ $used -ne 0 ] && error "Used space ($used) for user $TSTUSR not 0." + $LFS setquota -t -u --block-grace $grace --inode-grace \ + $MAX_IQ_TIME $DIR || error "set user grace time failed" + $LFS setquota -u $TSTUSR -b $((soft_limit+used/1024))M \ + -B $((hard_limit+used/1024))M -i 0 -I 0 $DIR || + error "set user quota failed" chmod 777 $DIR/$tdir || error "chmod 777 $DIR/$tdir failed" + mkdir $DIR/$tdir_dom + chmod 777 $DIR/$tdir_dom + $LFS setstripe -E 1M -L mdt $DIR/$tdir_dom || + error "setstripe $tdir_dom failed" do_facet mgs $LCTL nodemap_activate 1 wait_nm_sync active @@ -5018,10 +5087,33 @@ test_75() wait_nm_sync default trusted_nodemap wait_nm_sync default squash_uid + # mmap write when over soft limit + limit=$soft_limit + $DD of=$testfile count=${limit} || + quota_error a "root write failure, but expect success (1)" + OFFSET=$((limit * 1024)) + cancel_lru_locks osc + + echo "Write to exceed soft limit" + dd if=/dev/zero of=$testfile bs=1K count=10 seek=$OFFSET || + quota_error a $TSTUSR "root write failure, but expect success (2)" + OFFSET=$((OFFSET + 1024)) # make sure we don't write to same block + cancel_lru_locks osc + + echo "mmap write when over soft limit" + $MULTIOP $testfile.mmap OT40960SMW || + quota_error a $TSTUSR "mmap write failure, but expect success" + cancel_lru_locks osc + rm -f $testfile* + wait_delete_completed || error "wait_delete_completed failed (1)" + sync_all_data || true + + # test for user hard limit + limit=$hard_limit log "Write..." 
$DD of=$testfile bs=1M count=$((limit/2)) || quota_error u $TSTID \ - "root write failure, but expect success" + "root write failure, but expect success (3)" log "Write out of block quota ..." # possibly a cache write, ignore failure @@ -5036,11 +5128,12 @@ test_75() quota_error u $TSTID \ "user write success, but expect EDQUOT" rm -f $testfile - wait_delete_completed || error "wait_delete_completed failed" + wait_delete_completed || error "wait_delete_completed failed (2)" sync_all_data || true - used=$(getquota -u $TSTUSR global curspace) - [ $used -eq 0 ] || quota_error u $TSTID \ - "user quota not released after deletion" + [ $(getquota -u $TSTUSR global curspace) -eq $used ] || + quota_error u $TSTID "user quota not released after deletion" + + test_dom_75 } run_test 75 "nodemap squashed root respects quota enforcement" diff --git a/lustre/utils/wirecheck.c b/lustre/utils/wirecheck.c index 3fa2d20..56d16cf 100644 --- a/lustre/utils/wirecheck.c +++ b/lustre/utils/wirecheck.c @@ -1088,6 +1088,7 @@ check_niobuf_remote(void) CHECK_DEFINE_X(OBD_BRW_SOFT_SYNC); CHECK_DEFINE_X(OBD_BRW_OVER_PRJQUOTA); CHECK_DEFINE_X(OBD_BRW_RDMA_ONLY); + CHECK_DEFINE_X(OBD_BRW_SYS_RESOURCE); } static void diff --git a/lustre/utils/wiretest.c b/lustre/utils/wiretest.c index 5503fd1..feefdcd 100644 --- a/lustre/utils/wiretest.c +++ b/lustre/utils/wiretest.c @@ -2372,6 +2372,8 @@ void lustre_assert_wire_constants(void) OBD_BRW_OVER_PRJQUOTA); LASSERTF(OBD_BRW_RDMA_ONLY == 0x20000, "found 0x%.8x\n", OBD_BRW_RDMA_ONLY); + LASSERTF(OBD_BRW_SYS_RESOURCE == 0x40000, "found 0x%.8x\n", + OBD_BRW_SYS_RESOURCE); /* Checks for struct ost_body */ LASSERTF((int)sizeof(struct ost_body) == 208, "found %lld\n",