This patch tries to fix several problems:
1. OSD will ignore quota if IO comes from the client
cache or from root. However, since the following change:
LU-12687 osc: consume grants for direct I/O
DIO consumes grant too, so the following check for
sync IO is now wrong:
(lnb[i].lnb_flags & (OBD_BRW_FROM_GRANT | OBD_BRW_SYNC))
== OBD_BRW_FROM_GRANT)
This was originally added to support 1.8 clients; the release
is now approaching 2.15, so let's remove this broken check.
2. The server side clears OBD_BRW_NOQUOTA if root squash
is enabled, which reverts the fix from:
"LU-13228 clio: mmap write when overquota"
We need to separate the @ci_noquota and @oi_cap_sys_resource cases:
introduce a new flag, OBD_BRW_SYS_RESOURCE, and extend test_75
to cover this case.
3. LU-14739 missed the case that DoM quota should be considered
as well.
4. If EDQUOT is returned for root, we check the new root squash
flag OBD_FL_ROOT_SQUASH from server side. If this flag is not set,
we bypass quota for root, otherwise all root writes become sync
writes.
5. Fix a leftover problem from LU-9671 for DoM.
Fixes:
a4fbe7341baf12 ("LU-14739 quota: nodemap squashed root cannot bypass quota")
Signed-off-by: Wang Shilong <wshilong@ddn.com>
Signed-off-by: Wang Shilong <wangshilong1991@gmail.com>
Signed-off-by: Sebastien Buisson <sbuisson@ddn.com>
Change-Id: I3fd23da7d56acb5b485540333208e5d5b0b48023
Reviewed-on: https://review.whamcloud.com/44347
Tested-by: jenkins <devops@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
struct list_head cl_grant_chain;
time64_t cl_grant_shrink_interval; /* seconds */
+ int cl_root_squash; /* if root squash enabled*/
+
/* A chunk is an optimal size used by osc_extent to determine
* the extent size. A chunk is max(PAGE_SIZE, OST block size) */
int cl_chunkbits;
OBD_FL_NOSPC_BLK = 0x00100000, /* no more block space on OST */
OBD_FL_FLUSH = 0x00200000, /* flush pages on the OST */
OBD_FL_SHORT_IO = 0x00400000, /* short io request */
+ OBD_FL_ROOT_SQUASH = 0x00800000, /* root squash */
/* OBD_FL_LOCAL_MASK = 0xF0000000, was local-only flags until 2.10 */
/*
#define OBD_BRW_GRANTED 0x40 /* the ost manages this */
/* OBD_BRW_NOCACHE is currently neither set nor tested */
#define OBD_BRW_NOCACHE 0x80 /* this page is a part of non-cached IO */
-#define OBD_BRW_NOQUOTA 0x100
+#define OBD_BRW_NOQUOTA 0x100 /* do not enforce quota */
#define OBD_BRW_SRVLOCK 0x200 /* Client holds no lock over this page */
#define OBD_BRW_ASYNC 0x400 /* Server may delay commit to disk */
#define OBD_BRW_MEMALLOC 0x800 /* Client runs in the "kswapd" context */
* it to sync quickly */
#define OBD_BRW_OVER_PRJQUOTA 0x8000 /* Running out of project quota */
#define OBD_BRW_RDMA_ONLY 0x20000 /* RPC contains RDMA-only pages*/
+#define OBD_BRW_SYS_RESOURCE 0x40000 /* page has CAP_SYS_RESOURCE */
#define OBD_BRW_OVER_ALLQUOTA (OBD_BRW_OVER_USRQUOTA | \
OBD_BRW_OVER_GRPQUOTA | \
#include <dt_object.h>
#include <linux/falloc.h>
+#include <lustre_nodemap.h>
#include "mdt_internal.h"
struct lu_attr *la = &info->mti_attr.ma_attr;
__u64 valid;
int rc = 0;
+ int root_squash = 0;
LASSERT(mo);
if (cmd == OBD_BRW_WRITE) {
+ struct lu_nodemap *nodemap;
+ __u32 mapped_uid, mapped_gid;
+
+ nodemap = nodemap_get_from_exp(exp);
+ if (IS_ERR(nodemap))
+ RETURN(PTR_ERR(nodemap));
+ mapped_uid = nodemap_map_id(nodemap, NODEMAP_UID,
+ NODEMAP_FS_TO_CLIENT,
+ oa->o_uid);
+ mapped_gid = nodemap_map_id(nodemap, NODEMAP_GID,
+ NODEMAP_FS_TO_CLIENT,
+ oa->o_gid);
+ if (!IS_ERR_OR_NULL(nodemap)) {
+ /* do not bypass quota enforcement if squashed uid */
+ if (unlikely(mapped_uid == nodemap->nm_squash_uid)) {
+ int idx;
+
+ for (idx = 0; idx < npages; idx++)
+ lnb[idx].lnb_flags &=
+ ~OBD_BRW_SYS_RESOURCE;
+ root_squash = 1;
+ }
+ nodemap_putref(nodemap);
+ }
/* Don't update timestamps if this write is older than a
* setattr which modifies the timestamps. b=10150 */
oa->o_flags = OBD_FL_NO_PRJQUOTA;
}
+ if (root_squash)
+ oa->o_flags |= OBD_FL_ROOT_SQUASH;
+
oa->o_valid |= OBD_MD_FLFLAGS | OBD_MD_FLUSRQUOTA |
OBD_MD_FLGRPQUOTA | OBD_MD_FLPRJQUOTA;
}
+ /* Convert back to client IDs. LU-9671.
+ * nodemap_get_from_exp() may fail due to nodemap deactivated,
+ * server ID will be returned back to client in that case.
+ */
+ oa->o_uid = mapped_uid;
+ oa->o_gid = mapped_gid;
} else if (cmd == OBD_BRW_READ) {
/* If oa != NULL then mdt_preprw_read updated the inode
* atime and we should update the lvb so that other glimpses
struct ldlm_resource *rs = NULL;
__u64 valid;
int rc = 0;
+ int root_squash = 0;
LASSERT(npages > 0);
__u32 mapped_uid, mapped_gid;
nodemap = nodemap_get_from_exp(exp);
+ if (IS_ERR(nodemap))
+ RETURN(PTR_ERR(nodemap));
mapped_uid = nodemap_map_id(nodemap, NODEMAP_UID,
NODEMAP_FS_TO_CLIENT,
oa->o_uid);
int idx;
for (idx = 0; idx < npages; idx++)
- lnb[idx].lnb_flags &= ~OBD_BRW_NOQUOTA;
+ lnb[idx].lnb_flags &=
+ ~OBD_BRW_SYS_RESOURCE;
+ root_squash = 1;
}
nodemap_putref(nodemap);
}
oa->o_flags = OBD_FL_NO_PRJQUOTA;
}
+ if (root_squash)
+ oa->o_flags |= OBD_FL_ROOT_SQUASH;
+
oa->o_valid |= OBD_MD_FLFLAGS;
oa->o_valid |= OBD_MD_FLALLQUOTA;
}
/* Set the OBD_BRW_SRVLOCK before the page is queued. */
brw_flags |= ops->ops_srvlock ? OBD_BRW_SRVLOCK : 0;
- if (oio->oi_cap_sys_resource || io->ci_noquota) {
+ if (io->ci_noquota) {
brw_flags |= OBD_BRW_NOQUOTA;
cmd |= OBD_BRW_NOQUOTA;
}
+ if (oio->oi_cap_sys_resource) {
+ brw_flags |= OBD_BRW_SYS_RESOURCE;
+ cmd |= OBD_BRW_SYS_RESOURCE;
+ }
+
/* check if the file's owner/group is over quota */
if (!io->ci_noquota) {
struct cl_object *obj;
qid[USRQUOTA] = attr->cat_uid;
qid[GRPQUOTA] = attr->cat_gid;
qid[PRJQUOTA] = attr->cat_projid;
- if (rc == 0 && osc_quota_chkdq(cli, qid) == -EDQUOT)
- rc = -EDQUOT;
+ /*
+ * If EDQUOT is returned for root, double check whether
+ * root squash is enabled, as reported by the server.
+ * Without root squash, we should bypass quota for root.
+ */
+ if (rc == 0 && osc_quota_chkdq(cli, qid) == -EDQUOT) {
+ if (oio->oi_cap_sys_resource &&
+ !cli->cl_root_squash) {
+ io->ci_noquota = 1;
+ rc = 0;
+ } else {
+ rc = -EDQUOT;
+ }
+ }
if (rc)
RETURN(rc);
}
oap->oap_brw_flags = OBD_BRW_SYNC | brw_flags;
if (oio->oi_cap_sys_resource) {
- oap->oap_brw_flags |= OBD_BRW_NOQUOTA;
- oap->oap_cmd |= OBD_BRW_NOQUOTA;
+ oap->oap_brw_flags |= OBD_BRW_SYS_RESOURCE;
+ oap->oap_cmd |= OBD_BRW_SYS_RESOURCE;
}
opg->ops_submit_time = submit_time;
RETURN(0);
mutex_lock(&cli->cl_quota_mutex);
+ cli->cl_root_squash = !!(flags & OBD_FL_ROOT_SQUASH);
/* still mark the quots is running out for the old request, because it
* could be processed after the new request at OST, the side effect is
* the following request will be processed synchronously, but it will
if (p1->flag != p2->flag) {
unsigned mask = ~(OBD_BRW_FROM_GRANT | OBD_BRW_NOCACHE |
OBD_BRW_SYNC | OBD_BRW_ASYNC |
- OBD_BRW_NOQUOTA | OBD_BRW_SOFT_SYNC);
+ OBD_BRW_NOQUOTA | OBD_BRW_SOFT_SYNC |
+ OBD_BRW_SYS_RESOURCE);
/* warn if we try to combine flags that we don't know to be
* safe to combine */
"setdq for [%u %u %u] with valid %#llx, flags %x\n",
body->oa.o_uid, body->oa.o_gid, body->oa.o_projid,
body->oa.o_valid, body->oa.o_flags);
- osc_quota_setdq(cli, req->rq_xid, qid, body->oa.o_valid,
- body->oa.o_flags);
+ osc_quota_setdq(cli, req->rq_xid, qid, body->oa.o_valid,
+ body->oa.o_flags);
}
osc_update_grant(cli, body);
GOTO(out_ptlrpcd_work, rc);
cli->cl_grant_shrink_interval = GRANT_SHRINK_INTERVAL;
+ cli->cl_root_squash = 0;
osc_update_next_shrink(cli);
RETURN(rc);
/* ignore quota for the whole request if any page is from
* client cache or written by root.
*
- * XXX once we drop the 1.8 client support, the checking
- * for whether page is from cache can be simplified as:
- * !(lnb[i].flags & OBD_BRW_SYNC)
- *
* XXX we could handle this on per-lnb basis as done by
* grant.
*/
if ((lnb[i].lnb_flags & OBD_BRW_NOQUOTA) ||
- (lnb[i].lnb_flags & (OBD_BRW_FROM_GRANT | OBD_BRW_SYNC)) ==
- OBD_BRW_FROM_GRANT)
+ (lnb[i].lnb_flags & OBD_BRW_SYS_RESOURCE) ||
+ !(lnb[i].lnb_flags & OBD_BRW_SYNC))
declare_flags |= OSD_QID_FORCE;
/*
/* ignore quota for the whole request if any page is from
* client cache or written by root.
*
- * XXX once we drop the 1.8 client support, the checking
- * for whether page is from cache can be simplified as:
- * !(lnb[i].flags & OBD_BRW_SYNC)
- *
* XXX we could handle this on per-lnb basis as done by
* grant. */
if ((lnb[i].lnb_flags & OBD_BRW_NOQUOTA) ||
- (lnb[i].lnb_flags & (OBD_BRW_FROM_GRANT | OBD_BRW_SYNC)) ==
- OBD_BRW_FROM_GRANT)
+ (lnb[i].lnb_flags & OBD_BRW_SYS_RESOURCE) ||
+ !(lnb[i].lnb_flags & OBD_BRW_SYNC))
declare_flags |= OSD_QID_FORCE;
if (size == 0) {
OBD_BRW_OVER_PRJQUOTA);
LASSERTF(OBD_BRW_RDMA_ONLY == 0x20000, "found 0x%.8x\n",
OBD_BRW_RDMA_ONLY);
+ LASSERTF(OBD_BRW_SYS_RESOURCE == 0x40000, "found 0x%.8x\n",
+ OBD_BRW_SYS_RESOURCE);
/* Checks for struct ost_body */
LASSERTF((int)sizeof(struct ost_body) == 208, "found %lld\n",
cleanup_quota_test
}
+# Helper for test_75: check block quota enforcement when writing files
+# under the DoM directory.
+#
+# Relies on the caller's $tdir_dom (visible here through bash dynamic
+# scoping) in addition to the usual $DIR/$tdir/$tfile/$TSTID globals.
+# Each failed expectation is reported through quota_error; the function
+# itself always returns 0.
+#
+# NOTE(review): $DD is assumed to carry its own if=/bs= defaults (1M
+# blocks would match the "20M" LIMIT arithmetic) -- confirm against the
+# definition of $DD.
+test_dom_75() {
+	local dd_failed=false
+	local LIMIT=20480 #20M
+	local qid=$TSTID
+	local i
+
+	# Refuse to run without $tdir_dom, otherwise the cleanup globs below
+	# would expand to "$DIR//*".
+	[[ -n "$tdir_dom" ]] || error "tdir_dom is not set"
+
+	# (1) files under $tdir_dom only: the first batch of sync writes must
+	# all succeed ...
+	for ((i = 0; i < $((LIMIT/2048-1)); i++)); do
+		$DD of=$DIR/$tdir_dom/$tfile-$i count=1 \
+			oflag=sync || dd_failed=true
+	done
+
+	$dd_failed && quota_error u $qid "write failed, expect succeed (1)"
+
+	# ... and continuing past the limit must fail at least once.
+	for ((i = $((LIMIT/2048-1)); i < $((LIMIT/1024 + 10)); i++)); do
+		$DD of=$DIR/$tdir_dom/$tfile-$i count=1 \
+			oflag=sync || dd_failed=true
+	done
+
+	$dd_failed || quota_error u $qid "write succeed, expect EDQUOT (1)"
+
+	rm -f $DIR/$tdir_dom/*
+
+	# flush cache, ensure noquota flag is set on client
+	cancel_lru_locks
+	sync; sync_all_data || true
+
+	dd_failed=false
+
+	# (2) write a file under $tdir first, then files under $tdir_dom:
+	# the $tdir write must succeed, the $tdir_dom writes must fail at
+	# least once.
+	$DD of=$DIR/$tdir/file count=$((LIMIT/2048-1)) oflag=sync ||
+		quota_error u $qid "write failed, expect succeed (2)"
+
+	for ((i = 0; i < $((LIMIT/2048 + 10)); i++)); do
+		$DD of=$DIR/$tdir_dom/$tfile-$i count=1 \
+			oflag=sync || dd_failed=true
+	done
+
+	$dd_failed || quota_error u $qid "write succeed, expect EDQUOT (2)"
+
+	rm -f $DIR/$tdir/*
+	rm -f $DIR/$tdir_dom/*
+
+	# flush cache, ensure noquota flag is set on client
+	cancel_lru_locks
+	sync; sync_all_data || true
+
+	dd_failed=false
+
+	# (3) the reverse order: files under $tdir_dom first (must succeed),
+	# then a large write under $tdir (must fail).
+	for ((i = 0; i < $((LIMIT/2048-1)); i++)); do
+		$DD of=$DIR/$tdir_dom/$tfile-$i count=1 \
+			oflag=sync || dd_failed=true
+	done
+
+	$dd_failed && quota_error u $qid "write failed, expect succeed (3)"
+
+	$DD of=$DIR/$tdir/file count=$((LIMIT/2048 + 10)) oflag=sync &&
+		quota_error u $qid "write succeed, expect EDQUOT (3)"
+	# The expected dd failure above leaves a non-zero status; do not let
+	# it become the function's return value.
+	true
+}
+
test_75()
{
- local limit=10 # MB
+ local soft_limit=10 # MB
+ local hard_limit=20 # MB
+ local limit=$soft_limit
local testfile="$DIR/$tdir/$tfile-0"
+ local grace=20 # seconds
+ local tdir_dom=${tdir}_dom
+
+ if [ $(facet_fstype $SINGLEMDS) = "zfs" ]; then
+ grace=60
+ fi
setup_quota_test || error "setup quota failed with $?"
stack_trap cleanup_quota_test_75 EXIT
# enable ost quota
set_ost_qtype $QTYPE || error "enable ost quota failed"
+ set_mdt_qtype $QTYPE || error "enable mdt quota failed"
- # test for user
- log "User $TSTUSR quota block hardlimit:$limit MB"
- $LFS setquota -u $TSTID -b 0 -B ${limit}M -i 0 -I 0 $DIR ||
- error "set user quota failed"
-
- # make sure the system is clean
local used=$(getquota -u $TSTID global curspace)
- [ $used -ne 0 ] && error "Used space ($used) for user $TSTUSR not 0."
+ $LFS setquota -t -u --block-grace $grace --inode-grace \
+ $MAX_IQ_TIME $DIR || error "set user grace time failed"
+ $LFS setquota -u $TSTUSR -b $((soft_limit+used/1024))M \
+ -B $((hard_limit+used/1024))M -i 0 -I 0 $DIR ||
+ error "set user quota failed"
chmod 777 $DIR/$tdir || error "chmod 777 $DIR/$tdir failed"
+ mkdir $DIR/$tdir_dom
+ chmod 777 $DIR/$tdir_dom
+ $LFS setstripe -E 1M -L mdt $DIR/$tdir_dom ||
+ error "setstripe $tdir_dom failed"
do_facet mgs $LCTL nodemap_activate 1
wait_nm_sync active
wait_nm_sync default trusted_nodemap
wait_nm_sync default squash_uid
+ # mmap write when over soft limit
+ limit=$soft_limit
+ $DD of=$testfile count=${limit} ||
+ quota_error a "root write failure, but expect success (1)"
+ OFFSET=$((limit * 1024))
+ cancel_lru_locks osc
+
+ echo "Write to exceed soft limit"
+ dd if=/dev/zero of=$testfile bs=1K count=10 seek=$OFFSET ||
+ quota_error a $TSTUSR "root write failure, but expect success (2)"
+ OFFSET=$((OFFSET + 1024)) # make sure we don't write to same block
+ cancel_lru_locks osc
+
+ echo "mmap write when over soft limit"
+ $MULTIOP $testfile.mmap OT40960SMW ||
+ quota_error a $TSTUSR "mmap write failure, but expect success"
+ cancel_lru_locks osc
+ rm -f $testfile*
+ wait_delete_completed || error "wait_delete_completed failed (1)"
+ sync_all_data || true
+
+ # test for user hard limit
+ limit=$hard_limit
log "Write..."
$DD of=$testfile bs=1M count=$((limit/2)) ||
quota_error u $TSTID \
- "root write failure, but expect success"
+ "root write failure, but expect success (3)"
log "Write out of block quota ..."
# possibly a cache write, ignore failure
quota_error u $TSTID \
"user write success, but expect EDQUOT"
rm -f $testfile
- wait_delete_completed || error "wait_delete_completed failed"
+ wait_delete_completed || error "wait_delete_completed failed (2)"
sync_all_data || true
- used=$(getquota -u $TSTUSR global curspace)
- [ $used -eq 0 ] || quota_error u $TSTID \
- "user quota not released after deletion"
+ [ $(getquota -u $TSTUSR global curspace) -eq $used ] ||
+ quota_error u $TSTID "user quota not released after deletion"
+
+ test_dom_75
}
run_test 75 "nodemap squashed root respects quota enforcement"
CHECK_DEFINE_X(OBD_BRW_SOFT_SYNC);
CHECK_DEFINE_X(OBD_BRW_OVER_PRJQUOTA);
CHECK_DEFINE_X(OBD_BRW_RDMA_ONLY);
+ CHECK_DEFINE_X(OBD_BRW_SYS_RESOURCE);
}
static void
OBD_BRW_OVER_PRJQUOTA);
LASSERTF(OBD_BRW_RDMA_ONLY == 0x20000, "found 0x%.8x\n",
OBD_BRW_RDMA_ONLY);
+ LASSERTF(OBD_BRW_SYS_RESOURCE == 0x40000, "found 0x%.8x\n",
+ OBD_BRW_SYS_RESOURCE);
/* Checks for struct ost_body */
LASSERTF((int)sizeof(struct ost_body) == 208, "found %lld\n",