Whamcloud - gitweb
LU-14739 quota: fix quota with root squash enabled 47/44347/17
authorWang Shilong <wshilong@ddn.com>
Tue, 20 Jul 2021 02:36:31 +0000 (10:36 +0800)
committerOleg Drokin <green@whamcloud.com>
Sun, 10 Oct 2021 03:30:56 +0000 (03:30 +0000)
This patch tries to fix several problems:

1. OSD will ignore quota if the IO comes from the client
cache or from root. However, since the following change:

LU-12687 osc: consume grants for direct I/O

DIO now consumes grant too, so the following check for
sync IO is now wrong:

(lnb[i].lnb_flags & (OBD_BRW_FROM_GRANT | OBD_BRW_SYNC))
        == OBD_BRW_FROM_GRANT)

This was originally added to support 1.8 clients; the release is
going to be 2.15 now, so let's remove this broken check.

2. The server side will clear OBD_BRW_NOQUOTA if root squash
is enabled, which reverts the fix from:

"LU-13228 clio: mmap write when overquota"

We need to separate @ci_noquota and @oi_cap_sys_resource cases,
introduce a new flag OBD_BRW_SYS_RESOURCE, and extend test_75
to cover this case.

3. LU-14739 missed the case that DoM quota should be considered
as well.

4. If EDQUOT is returned for root, we check the new root squash
flag OBD_FL_ROOT_SQUASH from server side. If this flag is not set,
we bypass quota for root, otherwise all root writes become sync
writes.

5. Fix a leftover problem with LU-9671 for DoM.

Fixes: a4fbe7341baf12 ("LU-14739 quota: nodemap squashed root cannot bypass quota")
Signed-off-by: Wang Shilong <wshilong@ddn.com>
Signed-off-by: Wang Shilong <wangshilong1991@gmail.com>
Signed-off-by: Sebastien Buisson <sbuisson@ddn.com>
Change-Id: I3fd23da7d56acb5b485540333208e5d5b0b48023
Reviewed-on: https://review.whamcloud.com/44347
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
14 files changed:
lustre/include/obd.h
lustre/include/uapi/linux/lustre/lustre_idl.h
lustre/mdt/mdt_io.c
lustre/ofd/ofd_io.c
lustre/osc/osc_cache.c
lustre/osc/osc_page.c
lustre/osc/osc_quota.c
lustre/osc/osc_request.c
lustre/osd-ldiskfs/osd_io.c
lustre/osd-zfs/osd_io.c
lustre/ptlrpc/wiretest.c
lustre/tests/sanity-quota.sh
lustre/utils/wirecheck.c
lustre/utils/wiretest.c

index 6185f4e..1ac1c1a 100644 (file)
@@ -239,6 +239,8 @@ struct client_obd {
        struct list_head        cl_grant_chain;
        time64_t                cl_grant_shrink_interval; /* seconds */
 
+       int                     cl_root_squash; /* if root squash enabled*/
+
        /* A chunk is an optimal size used by osc_extent to determine
         * the extent size. A chunk is max(PAGE_SIZE, OST block size) */
        int                     cl_chunkbits;
index e89a89b..3cd00a0 100644 (file)
@@ -1099,6 +1099,7 @@ enum obdo_flags {
         OBD_FL_NOSPC_BLK    = 0x00100000, /* no more block space on OST */
        OBD_FL_FLUSH        = 0x00200000, /* flush pages on the OST */
        OBD_FL_SHORT_IO     = 0x00400000, /* short io request */
+       OBD_FL_ROOT_SQUASH  = 0x00800000, /* root squash */
        /* OBD_FL_LOCAL_MASK = 0xF0000000, was local-only flags until 2.10 */
 
        /*
@@ -1380,7 +1381,7 @@ struct hsm_state_set {
 #define OBD_BRW_GRANTED         0x40 /* the ost manages this */
 /* OBD_BRW_NOCACHE is currently neither set nor tested */
 #define OBD_BRW_NOCACHE         0x80 /* this page is a part of non-cached IO */
-#define OBD_BRW_NOQUOTA        0x100
+#define OBD_BRW_NOQUOTA        0x100 /* do not enforce quota */
 #define OBD_BRW_SRVLOCK        0x200 /* Client holds no lock over this page */
 #define OBD_BRW_ASYNC          0x400 /* Server may delay commit to disk */
 #define OBD_BRW_MEMALLOC       0x800 /* Client runs in the "kswapd" context */
@@ -1392,6 +1393,7 @@ struct hsm_state_set {
                                      * it to sync quickly */
 #define OBD_BRW_OVER_PRJQUOTA 0x8000 /* Running out of project quota */
 #define OBD_BRW_RDMA_ONLY    0x20000 /* RPC contains RDMA-only pages*/
+#define OBD_BRW_SYS_RESOURCE 0x40000 /* page has CAP_SYS_RESOURCE */
 
 #define OBD_BRW_OVER_ALLQUOTA (OBD_BRW_OVER_USRQUOTA | \
                               OBD_BRW_OVER_GRPQUOTA | \
index a368d74..2984d57 100644 (file)
@@ -32,6 +32,7 @@
 
 #include <dt_object.h>
 #include <linux/falloc.h>
+#include <lustre_nodemap.h>
 
 #include "mdt_internal.h"
 
@@ -760,10 +761,35 @@ int mdt_obd_commitrw(const struct lu_env *env, int cmd, struct obd_export *exp,
        struct lu_attr *la = &info->mti_attr.ma_attr;
        __u64 valid;
        int rc = 0;
+       int root_squash = 0;
 
        LASSERT(mo);
 
        if (cmd == OBD_BRW_WRITE) {
+               struct lu_nodemap *nodemap;
+               __u32 mapped_uid, mapped_gid;
+
+               nodemap = nodemap_get_from_exp(exp);
+               if (IS_ERR(nodemap))
+                       RETURN(PTR_ERR(nodemap));
+               mapped_uid = nodemap_map_id(nodemap, NODEMAP_UID,
+                                           NODEMAP_FS_TO_CLIENT,
+                                           oa->o_uid);
+               mapped_gid = nodemap_map_id(nodemap, NODEMAP_GID,
+                                           NODEMAP_FS_TO_CLIENT,
+                                           oa->o_gid);
+               if (!IS_ERR_OR_NULL(nodemap)) {
+                       /* do not bypass quota enforcement if squashed uid */
+                       if (unlikely(mapped_uid == nodemap->nm_squash_uid)) {
+                               int idx;
+
+                               for (idx = 0; idx < npages; idx++)
+                                       lnb[idx].lnb_flags &=
+                                               ~OBD_BRW_SYS_RESOURCE;
+                               root_squash = 1;
+                       }
+                       nodemap_putref(nodemap);
+               }
                /* Don't update timestamps if this write is older than a
                 * setattr which modifies the timestamps. b=10150 */
 
@@ -812,9 +838,18 @@ int mdt_obd_commitrw(const struct lu_env *env, int cmd, struct obd_export *exp,
                                        oa->o_flags = OBD_FL_NO_PRJQUOTA;
                        }
 
+                       if (root_squash)
+                               oa->o_flags |= OBD_FL_ROOT_SQUASH;
+
                        oa->o_valid |= OBD_MD_FLFLAGS | OBD_MD_FLUSRQUOTA |
                                       OBD_MD_FLGRPQUOTA | OBD_MD_FLPRJQUOTA;
                }
+               /* Convert back to client IDs. LU-9671.
+                * nodemap_get_from_exp() may fail due to nodemap deactivated,
+                * server ID will be returned back to client in that case.
+                */
+               oa->o_uid = mapped_uid;
+               oa->o_gid = mapped_gid;
        } else if (cmd == OBD_BRW_READ) {
                /* If oa != NULL then mdt_preprw_read updated the inode
                 * atime and we should update the lvb so that other glimpses
index c9811ff..ffb8a26 100644 (file)
@@ -1456,6 +1456,7 @@ int ofd_commitrw(const struct lu_env *env, int cmd, struct obd_export *exp,
        struct ldlm_resource *rs = NULL;
        __u64 valid;
        int rc = 0;
+       int root_squash = 0;
 
        LASSERT(npages > 0);
 
@@ -1464,6 +1465,8 @@ int ofd_commitrw(const struct lu_env *env, int cmd, struct obd_export *exp,
                __u32 mapped_uid, mapped_gid;
 
                nodemap = nodemap_get_from_exp(exp);
+               if (IS_ERR(nodemap))
+                       RETURN(PTR_ERR(nodemap));
                mapped_uid = nodemap_map_id(nodemap, NODEMAP_UID,
                                            NODEMAP_FS_TO_CLIENT,
                                            oa->o_uid);
@@ -1477,7 +1480,9 @@ int ofd_commitrw(const struct lu_env *env, int cmd, struct obd_export *exp,
                                int idx;
 
                                for (idx = 0; idx < npages; idx++)
-                                       lnb[idx].lnb_flags &= ~OBD_BRW_NOQUOTA;
+                                       lnb[idx].lnb_flags &=
+                                               ~OBD_BRW_SYS_RESOURCE;
+                               root_squash = 1;
                        }
                        nodemap_putref(nodemap);
                }
@@ -1521,6 +1526,9 @@ int ofd_commitrw(const struct lu_env *env, int cmd, struct obd_export *exp,
                                        oa->o_flags = OBD_FL_NO_PRJQUOTA;
                        }
 
+                       if (root_squash)
+                               oa->o_flags |= OBD_FL_ROOT_SQUASH;
+
                        oa->o_valid |= OBD_MD_FLFLAGS;
                        oa->o_valid |= OBD_MD_FLALLQUOTA;
                }
index 3255696..b8fa6da 100644 (file)
@@ -2304,11 +2304,16 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io,
 
        /* Set the OBD_BRW_SRVLOCK before the page is queued. */
        brw_flags |= ops->ops_srvlock ? OBD_BRW_SRVLOCK : 0;
-       if (oio->oi_cap_sys_resource || io->ci_noquota) {
+       if (io->ci_noquota) {
                brw_flags |= OBD_BRW_NOQUOTA;
                cmd |= OBD_BRW_NOQUOTA;
        }
 
+       if (oio->oi_cap_sys_resource) {
+               brw_flags |= OBD_BRW_SYS_RESOURCE;
+               cmd |= OBD_BRW_SYS_RESOURCE;
+       }
+
        /* check if the file's owner/group is over quota */
        if (!io->ci_noquota) {
                struct cl_object *obj;
@@ -2325,8 +2330,20 @@ int osc_queue_async_io(const struct lu_env *env, struct cl_io *io,
                qid[USRQUOTA] = attr->cat_uid;
                qid[GRPQUOTA] = attr->cat_gid;
                qid[PRJQUOTA] = attr->cat_projid;
-               if (rc == 0 && osc_quota_chkdq(cli, qid) == -EDQUOT)
-                       rc = -EDQUOT;
+               /*
+                * if EDQUOT returned for root, we double check
+                * if root squash enabled or not updated from server side.
+                * without root squash, we should bypass quota for root.
+                */
+               if (rc == 0 && osc_quota_chkdq(cli, qid) == -EDQUOT) {
+                       if (oio->oi_cap_sys_resource &&
+                           !cli->cl_root_squash) {
+                               io->ci_noquota = 1;
+                               rc = 0;
+                       } else {
+                               rc = -EDQUOT;
+                       }
+               }
                if (rc)
                        RETURN(rc);
        }
index 789419a..3217a1e 100644 (file)
@@ -317,8 +317,8 @@ void osc_page_submit(const struct lu_env *env, struct osc_page *opg,
        oap->oap_brw_flags = OBD_BRW_SYNC | brw_flags;
 
        if (oio->oi_cap_sys_resource) {
-               oap->oap_brw_flags |= OBD_BRW_NOQUOTA;
-               oap->oap_cmd |= OBD_BRW_NOQUOTA;
+               oap->oap_brw_flags |= OBD_BRW_SYS_RESOURCE;
+               oap->oap_cmd |= OBD_BRW_SYS_RESOURCE;
        }
 
        opg->ops_submit_time = submit_time;
index 4320ae4..0f07952 100644 (file)
@@ -108,6 +108,7 @@ int osc_quota_setdq(struct client_obd *cli, __u64 xid, const unsigned int qid[],
                RETURN(0);
 
        mutex_lock(&cli->cl_quota_mutex);
+       cli->cl_root_squash = !!(flags & OBD_FL_ROOT_SQUASH);
        /* still mark the quots is running out for the old request, because it
         * could be processed after the new request at OST, the side effect is
         * the following request will be processed synchronously, but it will
index 4ca9be0..68da594 100644 (file)
@@ -1158,7 +1158,8 @@ static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2)
         if (p1->flag != p2->flag) {
                unsigned mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_NOCACHE |
                                  OBD_BRW_SYNC       | OBD_BRW_ASYNC   |
-                                 OBD_BRW_NOQUOTA    | OBD_BRW_SOFT_SYNC);
+                                 OBD_BRW_NOQUOTA    | OBD_BRW_SOFT_SYNC |
+                                 OBD_BRW_SYS_RESOURCE);
 
                 /* warn if we try to combine flags that we don't know to be
                  * safe to combine */
@@ -1974,8 +1975,8 @@ static int osc_brw_fini_request(struct ptlrpc_request *req, int rc)
                       "setdq for [%u %u %u] with valid %#llx, flags %x\n",
                       body->oa.o_uid, body->oa.o_gid, body->oa.o_projid,
                       body->oa.o_valid, body->oa.o_flags);
-                      osc_quota_setdq(cli, req->rq_xid, qid, body->oa.o_valid,
-                                      body->oa.o_flags);
+               osc_quota_setdq(cli, req->rq_xid, qid, body->oa.o_valid,
+                               body->oa.o_flags);
        }
 
        osc_update_grant(cli, body);
@@ -3512,6 +3513,7 @@ int osc_setup_common(struct obd_device *obd, struct lustre_cfg *lcfg)
                GOTO(out_ptlrpcd_work, rc);
 
        cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL;
+       cli->cl_root_squash = 0;
        osc_update_next_shrink(cli);
 
        RETURN(rc);
index ef428ee..4eb6c64 100644 (file)
@@ -1408,16 +1408,12 @@ static int osd_declare_write_commit(const struct lu_env *env,
                /* ignore quota for the whole request if any page is from
                 * client cache or written by root.
                 *
-                * XXX once we drop the 1.8 client support, the checking
-                * for whether page is from cache can be simplified as:
-                * !(lnb[i].flags & OBD_BRW_SYNC)
-                *
                 * XXX we could handle this on per-lnb basis as done by
                 * grant.
                 */
                if ((lnb[i].lnb_flags & OBD_BRW_NOQUOTA) ||
-                   (lnb[i].lnb_flags & (OBD_BRW_FROM_GRANT | OBD_BRW_SYNC)) ==
-                   OBD_BRW_FROM_GRANT)
+                   (lnb[i].lnb_flags & OBD_BRW_SYS_RESOURCE) ||
+                   !(lnb[i].lnb_flags & OBD_BRW_SYNC))
                        declare_flags |= OSD_QID_FORCE;
 
                /*
index a5dbac4..ea51a91 100644 (file)
@@ -664,15 +664,11 @@ static int osd_declare_write_commit(const struct lu_env *env,
                /* ignore quota for the whole request if any page is from
                 * client cache or written by root.
                 *
-                * XXX once we drop the 1.8 client support, the checking
-                * for whether page is from cache can be simplified as:
-                * !(lnb[i].flags & OBD_BRW_SYNC)
-                *
                 * XXX we could handle this on per-lnb basis as done by
                 * grant. */
                if ((lnb[i].lnb_flags & OBD_BRW_NOQUOTA) ||
-                   (lnb[i].lnb_flags & (OBD_BRW_FROM_GRANT | OBD_BRW_SYNC)) ==
-                   OBD_BRW_FROM_GRANT)
+                   (lnb[i].lnb_flags & OBD_BRW_SYS_RESOURCE) ||
+                   !(lnb[i].lnb_flags & OBD_BRW_SYNC))
                        declare_flags |= OSD_QID_FORCE;
 
                if (size == 0) {
index 5d9351d..93d4fe2 100644 (file)
@@ -2346,6 +2346,8 @@ void lustre_assert_wire_constants(void)
                OBD_BRW_OVER_PRJQUOTA);
        LASSERTF(OBD_BRW_RDMA_ONLY == 0x20000, "found 0x%.8x\n",
                OBD_BRW_RDMA_ONLY);
+       LASSERTF(OBD_BRW_SYS_RESOURCE == 0x40000, "found 0x%.8x\n",
+               OBD_BRW_SYS_RESOURCE);
 
        /* Checks for struct ost_body */
        LASSERTF((int)sizeof(struct ost_body) == 208, "found %lld\n",
index 11cdb5f..596442e 100755 (executable)
@@ -4979,27 +4979,96 @@ function cleanup_quota_test_75()
        cleanup_quota_test
 }
 
+test_dom_75() {
+       local dd_failed=false
+       local LIMIT=20480 #20M
+       local qid=$TSTID
+
+       for ((i = 0; i < $((LIMIT/2048-1)); i++)); do
+               $DD of=$DIR/$tdir_dom/$tfile-$i count=1 \
+                       oflag=sync || dd_failed=true
+       done
+
+       $dd_failed && quota_error u $qid "write failed, expect succeed (1)"
+
+       for ((i = $((LIMIT/2048-1)); i < $((LIMIT/1024 + 10)); i++)); do
+               $DD of=$DIR/$tdir_dom/$tfile-$i count=1 \
+                       oflag=sync || dd_failed=true
+       done
+
+       $dd_failed || quota_error u $qid "write succeed, expect EDQUOT (1)"
+
+       rm -f $DIR/$tdir_dom/*
+
+       # flush cache, ensure noquota flag is set on client
+       cancel_lru_locks
+       sync; sync_all_data || true
+
+       dd_failed=false
+
+       $DD of=$DIR/$tdir/file count=$((LIMIT/2048-1)) oflag=sync ||
+               quota_error u $qid "write failed, expect succeed (2)"
+
+       for ((i = 0; i < $((LIMIT/2048 + 10)); i++)); do
+               $DD of=$DIR/$tdir_dom/$tfile-$i count=1 \
+                       oflag=sync || dd_failed=true
+       done
+
+       $dd_failed || quota_error u $TSTID "write succeed, expect EDQUOT (2)"
+
+       rm -f $DIR/$tdir/*
+       rm -f $DIR/$tdir_dom/*
+
+       # flush cache, ensure noquota flag is set on client
+       cancel_lru_locks
+       sync; sync_all_data || true
+
+       dd_failed=false
+
+       for ((i = 0; i < $((LIMIT/2048-1)); i++)); do
+               $DD of=$DIR/$tdir_dom/$tfile-$i count=1 \
+                       oflag=sync || dd_failed=true
+       done
+
+       $dd_failed && quota_error u $qid "write failed, expect succeed (3)"
+
+       $DD of=$DIR/$tdir/file count=$((LIMIT/2048 + 10)) oflag=sync &&
+               quota_error u $qid "write succeed, expect EDQUOT (3)"
+       true
+}
+
 test_75()
 {
-       local limit=10 # MB
+       local soft_limit=10 # MB
+       local hard_limit=20 # MB
+       local limit=$soft_limit
        local testfile="$DIR/$tdir/$tfile-0"
+       local grace=20 # seconds
+       local tdir_dom=${tdir}_dom
+
+       if [ $(facet_fstype $SINGLEMDS) = "zfs" ]; then
+           grace=60
+       fi
 
        setup_quota_test || error "setup quota failed with $?"
        stack_trap cleanup_quota_test_75 EXIT
 
        # enable ost quota
        set_ost_qtype $QTYPE || error "enable ost quota failed"
+       set_mdt_qtype $QTYPE || error "enable mdt quota failed"
 
-       # test for user
-       log "User $TSTUSR quota block hardlimit:$limit MB"
-       $LFS setquota -u $TSTID -b 0 -B ${limit}M -i 0 -I 0 $DIR ||
-               error "set user quota failed"
-
-       # make sure the system is clean
        local used=$(getquota -u $TSTID global curspace)
-       [ $used -ne 0 ] && error "Used space ($used) for user $TSTUSR not 0."
+       $LFS setquota -t -u --block-grace $grace --inode-grace \
+               $MAX_IQ_TIME $DIR || error "set user grace time failed"
+       $LFS setquota -u $TSTUSR -b $((soft_limit+used/1024))M \
+                       -B $((hard_limit+used/1024))M -i 0 -I 0 $DIR ||
+               error "set user quota failed"
 
        chmod 777 $DIR/$tdir || error "chmod 777 $DIR/$tdir failed"
+       mkdir $DIR/$tdir_dom
+       chmod 777 $DIR/$tdir_dom
+       $LFS setstripe -E 1M -L mdt $DIR/$tdir_dom ||
+               error "setstripe $tdir_dom failed"
 
        do_facet mgs $LCTL nodemap_activate 1
        wait_nm_sync active
@@ -5018,10 +5087,33 @@ test_75()
        wait_nm_sync default trusted_nodemap
        wait_nm_sync default squash_uid
 
+       # mmap write when over soft limit
+       limit=$soft_limit
+       $DD of=$testfile count=${limit} ||
+               quota_error a  "root write failure, but expect success (1)"
+       OFFSET=$((limit * 1024))
+       cancel_lru_locks osc
+
+       echo "Write to exceed soft limit"
+       dd if=/dev/zero of=$testfile bs=1K count=10 seek=$OFFSET ||
+             quota_error a $TSTUSR "root write failure, but expect success (2)"
+       OFFSET=$((OFFSET + 1024)) # make sure we don't write to same block
+       cancel_lru_locks osc
+
+       echo "mmap write when over soft limit"
+       $MULTIOP $testfile.mmap OT40960SMW ||
+               quota_error a $TSTUSR "mmap write failure, but expect success"
+       cancel_lru_locks osc
+       rm -f $testfile*
+       wait_delete_completed || error "wait_delete_completed failed (1)"
+       sync_all_data || true
+
+       # test for user hard limit
+       limit=$hard_limit
        log "Write..."
        $DD of=$testfile bs=1M count=$((limit/2)) ||
                quota_error u $TSTID \
-                       "root write failure, but expect success"
+                       "root write failure, but expect success (3)"
 
        log "Write out of block quota ..."
        # possibly a cache write, ignore failure
@@ -5036,11 +5128,12 @@ test_75()
                quota_error u $TSTID \
                        "user write success, but expect EDQUOT"
        rm -f $testfile
-       wait_delete_completed || error "wait_delete_completed failed"
+       wait_delete_completed || error "wait_delete_completed failed (2)"
        sync_all_data || true
-       used=$(getquota -u $TSTUSR global curspace)
-       [ $used -eq 0 ] || quota_error u $TSTID \
-               "user quota not released after deletion"
+       [ $(getquota -u $TSTUSR global curspace) -eq $used ] ||
+               quota_error u $TSTID "user quota not released after deletion"
+
+       test_dom_75
 }
 run_test 75 "nodemap squashed root respects quota enforcement"
 
index 3fa2d20..56d16cf 100644 (file)
@@ -1088,6 +1088,7 @@ check_niobuf_remote(void)
        CHECK_DEFINE_X(OBD_BRW_SOFT_SYNC);
        CHECK_DEFINE_X(OBD_BRW_OVER_PRJQUOTA);
        CHECK_DEFINE_X(OBD_BRW_RDMA_ONLY);
+       CHECK_DEFINE_X(OBD_BRW_SYS_RESOURCE);
 }
 
 static void
index 5503fd1..feefdcd 100644 (file)
@@ -2372,6 +2372,8 @@ void lustre_assert_wire_constants(void)
                OBD_BRW_OVER_PRJQUOTA);
        LASSERTF(OBD_BRW_RDMA_ONLY == 0x20000, "found 0x%.8x\n",
                OBD_BRW_RDMA_ONLY);
+       LASSERTF(OBD_BRW_SYS_RESOURCE == 0x40000, "found 0x%.8x\n",
+               OBD_BRW_SYS_RESOURCE);
 
        /* Checks for struct ost_body */
        LASSERTF((int)sizeof(struct ost_body) == 208, "found %lld\n",