#ifdef __KERNEL__
+/*
+ * There exists a potential lock inversion deadlock scenario when using
+ * Lustre on top of ZFS. This occurs between one of ZFS's
+ * buf_hash_table.ht_lock's, and Lustre's lu_sites_guard lock. Essentially,
+ * thread A will take the lu_sites_guard lock and sleep on the ht_lock,
+ * while thread B will take the ht_lock and sleep on the lu_sites_guard
+ * lock. Obviously neither thread will wake and drop their respective hold
+ * on their lock.
+ *
+ * To prevent this from happening we must ensure the lu_sites_guard lock is
+ * not taken while down this code path. ZFS reliably does not set the
+ * __GFP_FS bit in its code paths, so this can be used to determine if it
+ * is safe to take the lu_sites_guard lock.
+ *
+ * Ideally we should accurately return the remaining number of cached
+ * objects without taking the lu_sites_guard lock, but this is not
+ * possible in the current implementation.
+ */
static int lu_cache_shrink(SHRINKER_ARGS(sc, nr_to_scan, gfp_mask))
{
lu_site_stats_t stats;
int remain = shrink_param(sc, nr_to_scan);
CFS_LIST_HEAD(splice);
- if (remain != 0) {
- if (!(shrink_param(sc, gfp_mask) & __GFP_FS))
+ if (!(shrink_param(sc, gfp_mask) & __GFP_FS)) {
+ if (remain != 0)
return -1;
- CDEBUG(D_INODE, "Shrink %d objects\n", remain);
+ else
+ /* We must not take the lu_sites_guard lock when
+ * __GFP_FS is *not* set because of the deadlock
+ * possibility detailed above. Additionally,
+ * since we cannot determine the number of
+ * objects in the cache without taking this
+ * lock, we're in a particularly tough spot. As
+ * a result, we'll just lie and say our cache is
+ * empty. This _should_ be ok, as we can't
+ * reclaim objects when __GFP_FS is *not* set
+ * anyways.
+ */
+ return 0;
}
+ CDEBUG(D_INODE, "Shrink %d objects\n", remain);
+
cfs_mutex_lock(&lu_sites_guard);
cfs_list_for_each_entry_safe(s, tmp, &lu_sites, ls_linkage) {
if (shrink_param(sc, nr_to_scan) != 0) {
}
ma->ma_attr.la_mode = mode;
- ma->ma_attr.la_valid = LA_CTIME;
+ ma->ma_attr.la_valid = LA_CTIME | LA_MODE;
ma->ma_attr.la_ctime = cfs_time_current_64();
if (name != NULL) {
osfs->os_blocks, osfs->os_bfree, osfs->os_bavail,
osfs->os_files, osfs->os_ffree, osfs->os_state);
- if (OBD_FAIL_CHECK_VALUE(OBD_FAIL_OST_ENOSPC,
- ofd->ofd_lut.lut_lsd.lsd_ost_index))
- osfs->os_bfree = osfs->os_bavail = 2;
-
if (OBD_FAIL_CHECK_VALUE(OBD_FAIL_OST_ENOINO,
ofd->ofd_lut.lut_lsd.lsd_ost_index))
osfs->os_ffree = 0;
osfs->os_bsize = 1 << COMPAT_BSIZE_SHIFT;
}
+ if (OBD_FAIL_CHECK_VALUE(OBD_FAIL_OST_ENOSPC,
+ ofd->ofd_lut.lut_lsd.lsd_ost_index))
+ osfs->os_bfree = osfs->os_bavail = 2;
+
EXIT;
out:
return rc;
RETURN(0);
}
+/*
+ * dsl_prop_register() callback invoked when the dataset's "xattr"
+ * property changes: cache whether xattrs are stored as system
+ * attributes (SA) in osd_device so the xattr-set path can pick the
+ * right storage format without querying ZFS each time.
+ */
+static void osd_xattr_changed_cb(void *arg, uint64_t newval)
+{
+	struct osd_device *osd = arg;
+
+	osd->od_xattr_in_sa = (newval == ZFS_XATTR_SA);
+}
+
static int osd_mount(const struct lu_env *env,
struct osd_device *o, struct lustre_cfg *cfg)
{
+ struct dsl_dataset *ds;
char *dev = lustre_cfg_string(cfg, 1);
dmu_buf_t *rootdb;
int rc;
RETURN(rc);
}
+	ds = dmu_objset_ds(o->od_objset.os);
+	LASSERT(ds);
+	/* Track the dataset "xattr" property so od_xattr_in_sa stays
+	 * current; registration failure is not fatal to the mount. */
+	rc = dsl_prop_register(ds, "xattr", osd_xattr_changed_cb, o);
+	if (rc)
+		CERROR("%s: cannot register xattr callback, ignore: %d\n",
+		       o->od_svname, rc);
+
rc = __osd_obj2dbuf(env, o->od_objset.os, o->od_objset.root,
&rootdb, root_tag);
if (rc) {
struct lu_device *d)
{
struct osd_device *o = osd_dev(d);
+ struct dsl_dataset *ds;
int rc;
ENTRY;
osd_oi_fini(env, o);
if (o->od_objset.os) {
+ ds = dmu_objset_ds(o->od_objset.os);
+ rc = dsl_prop_unregister(ds, "xattr", osd_xattr_changed_cb, o);
+ if (rc)
+ CERROR("%s: dsl_prop_unregister xattr error %d\n",
+ o->od_svname, rc);
arc_remove_prune_callback(o->arc_prune_cb);
o->arc_prune_cb = NULL;
osd_sync(env, lu2dt_dev(d));
char oti_str[64];
char oti_key[MAXNAMELEN + 1];
+ struct lustre_mdt_attrs oti_mdt_attrs;
struct lu_attr oti_la;
struct osa_attr oti_osa;
uint64_t od_ost_compat_grp0;
unsigned int od_rdonly:1,
+ od_xattr_in_sa:1,
od_quota_iused_est:1;
char od_mntdev[128];
char od_svname[128];
struct lustre_capa *capa);
int osd_xattr_list(const struct lu_env *env, struct dt_object *dt,
struct lu_buf *lb, struct lustre_capa *capa);
+void __osd_xattr_declare_set(const struct lu_env *env, struct osd_object *obj,
+			int vallen, const char *name, struct osd_thandle *oh);
+int __osd_sa_xattr_set(const struct lu_env *env, struct osd_object *obj,
+		       const struct lu_buf *buf, const char *name, int fl,
+		       struct osd_thandle *oh);
+int __osd_xattr_set(const struct lu_env *env, struct osd_object *obj,
+		    const struct lu_buf *buf, const char *name, int fl,
+		    struct osd_thandle *oh);
+
+/*
+ * Store an xattr using the format selected by the dataset's "xattr"
+ * property (cached in od_xattr_in_sa): try the SA-based format first
+ * and fall back to the dnode-based format when the SA area is full
+ * (-EFBIG); otherwise use the dnode-based format directly.
+ *
+ * NOTE(review): @capa is accepted to mirror the osd_xattr_set()
+ * signature but is not used here.
+ */
+static inline int
+osd_xattr_set_internal(const struct lu_env *env, struct osd_object *obj,
+		       const struct lu_buf *buf, const char *name, int fl,
+		       struct osd_thandle *oh, struct lustre_capa *capa)
+{
+	int rc;
+
+	if (osd_obj2dev(obj)->od_xattr_in_sa) {
+		rc = __osd_sa_xattr_set(env, obj, buf, name, fl, oh);
+		if (rc == -EFBIG)
+			rc = __osd_xattr_set(env, obj, buf, name, fl, oh);
+	} else {
+		rc = __osd_xattr_set(env, obj, buf, name, fl, oh);
+	}
+
+	return rc;
+}
#endif
#endif /* _OSD_INTERNAL_H */
return;
cfs_down(&oh->ot_sa_lock);
+ cfs_write_lock(&obj->oo_attr_lock);
if (likely(cfs_list_empty(&obj->oo_sa_linkage)))
cfs_list_add(&obj->oo_sa_linkage, &oh->ot_sa_list);
+ cfs_write_unlock(&obj->oo_attr_lock);
cfs_up(&oh->ot_sa_lock);
}
obj = cfs_list_entry(oh->ot_sa_list.next,
struct osd_object, oo_sa_linkage);
sa_spill_rele(obj->oo_sa_hdl);
+ cfs_write_lock(&obj->oo_attr_lock);
cfs_list_del_init(&obj->oo_sa_linkage);
+ cfs_write_unlock(&obj->oo_attr_lock);
}
cfs_up(&oh->ot_sa_lock);
}
* from within sa_object_size() can block on a mutex, so
* we can't call sa_object_size() holding rwlock */
sa_object_size(obj->oo_sa_hdl, &blksize, &blocks);
+ /* we do not control size of indices, so always calculate
+ * it from number of blocks reported by DMU */
+ if (S_ISDIR(attr->la_mode))
+ attr->la_size = 512 * blocks;
/* Block size may be not set; suggest maximal I/O transfers. */
if (blksize == 0)
blksize = 1ULL << SPA_MAXBLOCKSHIFT;
dmu_tx_hold_sa_create(oh->ot_tx, ZFS_SA_BASE_ATTR_SIZE);
+ __osd_xattr_declare_set(env, obj, sizeof(struct lustre_mdt_attrs),
+ XATTR_NAME_LMA, oh);
+
RETURN(osd_declare_quota(env, osd, attr->la_uid, attr->la_gid, 1, oh,
false, NULL, false));
}
/*
* Primitives for directory (i.e. ZAP) handling
*/
+/*
+ * Set the LMA (Lustre Metadata Attributes) xattr carrying @fid on a
+ * newly created object, inside transaction @oh.  The LMA is converted
+ * to its on-disk byte order with lustre_lma_swab() before being stored
+ * under XATTR_NAME_LMA with LU_XATTR_CREATE.
+ */
+static inline int osd_init_lma(const struct lu_env *env, struct osd_object *obj,
+			       const struct lu_fid *fid, struct osd_thandle *oh)
+{
+	struct osd_thread_info *info = osd_oti_get(env);
+	struct lustre_mdt_attrs *lma = &info->oti_mdt_attrs;
+	struct lu_buf buf;
+	int rc;
+
+	lustre_lma_init(lma, fid);
+	lustre_lma_swab(lma);
+	buf.lb_buf = lma;
+	buf.lb_len = sizeof(*lma);
+
+	rc = osd_xattr_set_internal(env, obj, &buf, XATTR_NAME_LMA,
+				    LU_XATTR_CREATE, oh, BYPASS_CAPA);
+
+	return rc;
+}
/*
* Concurrency: @dt is write locked.
LASSERT(ergo(rc == 0, dt_object_exists(dt)));
LASSERT(osd_invariant(obj));
+ rc = osd_init_lma(env, obj, fid, oh);
+ if (rc) {
+ CERROR("%s: can not set LMA on "DFID": rc = %d\n",
+ osd->od_svname, PFID(fid), rc);
+ /* ignore errors during LMA initialization */
+ rc = 0;
+ }
+
out:
cfs_up(&obj->oo_guard);
RETURN(rc);
return rc;
}
-static int
+int
__osd_xattr_set(const struct lu_env *env, struct osd_object *obj,
const struct lu_buf *buf, const char *name, int fl,
struct osd_thandle *oh)
}
int osd_xattr_set(const struct lu_env *env, struct dt_object *dt,
- const struct lu_buf *buf, const char *name, int fl,
- struct thandle *handle, struct lustre_capa *capa)
+ const struct lu_buf *buf, const char *name, int fl,
+ struct thandle *handle, struct lustre_capa *capa)
{
struct osd_object *obj = osd_dt_obj(dt);
struct osd_thandle *oh;
cfs_down(&obj->oo_guard);
CDEBUG(D_INODE, "Setting xattr %s with size %d\n",
name, (int)buf->lb_len);
- rc = __osd_sa_xattr_set(env, obj, buf, name, fl, oh);
- /* place xattr in dnode if SA is full */
- if (rc == -EFBIG)
- rc = __osd_xattr_set(env, obj, buf, name, fl, oh);
+ rc = osd_xattr_set_internal(env, obj, buf, name, fl, oh, capa);
cfs_up(&obj->oo_guard);
RETURN(rc);
OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_RECEIVE)) {
int off = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK;
int len = desc->bd_iov[i].kiov_len;
+ struct page *np = cfs_alloc_page(CFS_ALLOC_STD);
char *ptr = kmap(desc->bd_iov[i].kiov_page) + off;
- memcpy(ptr, "bad3", min(4, len));
- kunmap(desc->bd_iov[i].kiov_page);
+
+ if (np) {
+ char *ptr2 = kmap(np) + off;
+
+ memcpy(ptr2, ptr, len);
+ memcpy(ptr2, "bad3", min(4, len));
+ kunmap(np);
+ cfs_page_unpin(desc->bd_iov[i].kiov_page);
+ desc->bd_iov[i].kiov_page = np;
+ } else {
+ CERROR("can't alloc page for corruption\n");
+ }
}
cfs_crypto_hash_update_page(hdesc, desc->bd_iov[i].kiov_page,
desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK,
OBD_FAIL_CHECK(OBD_FAIL_OST_CHECKSUM_SEND)) {
int off = desc->bd_iov[i].kiov_offset & ~CFS_PAGE_MASK;
int len = desc->bd_iov[i].kiov_len;
+ struct page *np = cfs_alloc_page(CFS_ALLOC_STD);
char *ptr = kmap(desc->bd_iov[i].kiov_page) + off;
- memcpy(ptr, "bad4", min(4, len));
- kunmap(desc->bd_iov[i].kiov_page);
- /* nobody should use corrupted page again */
- ClearPageUptodate(desc->bd_iov[i].kiov_page);
+
+ if (np) {
+ char *ptr2 = kmap(np) + off;
+
+ memcpy(ptr2, ptr, len);
+ memcpy(ptr2, "bad4", min(4, len));
+ kunmap(np);
+ cfs_page_unpin(desc->bd_iov[i].kiov_page);
+ desc->bd_iov[i].kiov_page = np;
+ } else {
+ CERROR("can't alloc page for corruption\n");
+ }
}
}
ALWAYS_EXCEPT="$ALWAYS_EXCEPT $CONFIG_EXCEPTIONS"
fi
+# LU-2059
+ALWAYS_EXCEPT="$ALWAYS_EXCEPT 5d 19b 21b 27a"
+
+
SRCDIR=`dirname $0`
PATH=$PWD/$SRCDIR:$SRCDIR:$SRCDIR/../utils:$PATH
#
test_17() {
+ if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
+ skip "Only applicable to ldiskfs-based MDTs"
+ return
+ fi
+
setup
check_mount || return 41
cleanup || return $?
}
test_32a() {
+ if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
+ skip "Only applicable to ldiskfs-based MDTs"
+ return
+ fi
+
client_only && skip "client only testing" && return 0
[ "$NETTYPE" = "tcp" ] || { skip "NETTYPE != tcp" && return 0; }
[ -z "$TUNEFS" ] && skip_env "No tunefs" && return 0
run_test 32a "Upgrade from 1.8 (not live)"
test_32b() {
+ if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
+ skip "Only applicable to ldiskfs-based MDTs"
+ return
+ fi
+
client_only && skip "client only testing" && return 0
[ "$NETTYPE" = "tcp" ] || { skip "NETTYPE != tcp" && return 0; }
[ -z "$TUNEFS" ] && skip_env "No tunefs" && return
run_test 37 "verify set tunables works for symlink device"
test_38() { # bug 14222
+ if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
+ skip "Only applicable to ldiskfs-based MDTs"
+ return
+ fi
+
setup
# like runtests
COUNT=10
}
test_52() {
+ if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
+ skip "Only applicable to ldiskfs-based MDTs"
+ return
+ fi
+
start_mds
[ $? -eq 0 ] || { error "Unable to start MDS"; return 1; }
start_ost
run_test 53b "check MDT thread count params"
test_54a() {
+ if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
+ skip "Only applicable to ldiskfs-based MDTs"
+ return
+ fi
+
do_rpc_nodes $(facet_host ost1) run_llverdev $(ostdevname 1) -p
[ $? -eq 0 ] || error "llverdev failed!"
reformat_and_config
run_test 54a "test llverdev and partial verify of device"
test_54b() {
+ if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
+ skip "Only applicable to ldiskfs-based MDTs"
+ return
+ fi
+
setup
run_llverfs $MOUNT -p
[ $? -eq 0 ] || error "llverfs failed!"
}
test_55() {
+ if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
+ skip "Only applicable to ldiskfs-based MDTs"
+ return
+ fi
+
local mdsdev=$(mdsdevname 1)
local mdsvdev=$(mdsvdevname 1)
}
test_58() { # bug 22658
- if [ $(facet_fstype mds) == zfs ]; then
- skip "Does not work with ZFS-based MDTs yet"
+ if [ $(facet_fstype mds) != ldiskfs ]; then
+ skip "Only applicable to ldiskfs-based MDTs"
return
fi
setup_noconfig
run_test 61 "large xattr"
test_62() {
+ if [ $(facet_fstype $SINGLEMDS) != ldiskfs ]; then
+ skip "Only applicable to ldiskfs-based MDTs"
+ return
+ fi
+
# MRP-118
local mdsdev=$(mdsdevname 1)
local ostdev=$(ostdevname 1)
test_57a() {
# note test will not do anything if MDS is not local
+	# Skip on non-ldiskfs MDTs, matching the guard used by the other
+	# tests in this patch (facet_fstype $SINGLEMDS, not
+	# facet_type_fstype MDS, which is inconsistent with the siblings).
+	if [ "$(facet_fstype $SINGLEMDS)" != ldiskfs ]; then
+		skip "Only applicable to ldiskfs-based MDTs"
+		return
+	fi
+
remote_mds_nodsh && skip "remote MDS with nodsh" && return
local MNTDEV="osd*.*MDT*.mntdev"
DEV=$(do_facet $SINGLEMDS lctl get_param -n $MNTDEV)
run_test 57a "verify MDS filesystem created with large inodes =="
test_57b() {
+	# Skip on non-ldiskfs MDTs, matching the guard used by the other
+	# tests in this patch (facet_fstype $SINGLEMDS, not
+	# facet_type_fstype MDS, which is inconsistent with the siblings).
+	if [ "$(facet_fstype $SINGLEMDS)" != ldiskfs ]; then
+		skip "Only applicable to ldiskfs-based MDTs"
+		return
+	fi
+
remote_mds_nodsh && skip "remote MDS with nodsh" && return
local dir=$DIR/d57b
grep -q -w jbd2 $SYMLIST || { modprobe jbd2 2>/dev/null || true; }
[ "$LQUOTA" != "no" ] && load_module quota/lquota $LQUOTAOPTS
if [[ $(node_fstypes $HOSTNAME) == *zfs* ]]; then
+ modprobe zfs
load_module osd-zfs/osd_zfs
fi
load_module mgs/mgs