init_rwsem(&mo->oo_sem);
init_rwsem(&mo->oo_ext_idx_sem);
spin_lock_init(&mo->oo_guard);
+ INIT_LIST_HEAD(&mo->oo_xattr_list);
return l;
} else {
return NULL;
struct osd_device *dev;
struct osd_idmap_cache *oic;
struct osd_inode_id *id;
- struct inode *inode;
+ struct inode *inode = NULL;
struct osd_scrub *scrub;
struct scrub_file *sf;
- int result;
- int saved = 0;
- bool cached = true;
- bool triggered = false;
+ __u32 flags = SS_CLEAR_DRYRUN | SS_CLEAR_FAILOUT |
+ SS_AUTO_FULL;
+ int result = 0;
+ int rc1 = 0;
+ bool cached = true;
+ bool remote = false;
ENTRY;
LINVRNT(osd_invariant(obj));
if (result == -EREMCHG) {
trigger:
- if (unlikely(triggered))
- GOTO(out, result = saved);
-
- triggered = true;
- if (thread_is_running(&scrub->os_thread)) {
- result = -EINPROGRESS;
- } else if (!dev->od_noscrub) {
- result = osd_scrub_start(dev, SS_AUTO_FULL |
- SS_CLEAR_DRYRUN | SS_CLEAR_FAILOUT);
- LCONSOLE_WARN("%.16s: trigger OI scrub by RPC "
- "for "DFID", rc = %d [1]\n",
- osd_name(dev), PFID(fid), result);
- if (result == 0 || result == -EALREADY)
- result = -EINPROGRESS;
- else
- result = -EREMCHG;
- } else {
- result = -EREMCHG;
- }
-
- if (fid_is_on_ost(info, dev, fid, OI_CHECK_FLD))
- GOTO(out, result);
-
/* We still have chance to get the valid inode: for the
* object which is referenced by remote name entry, the
* object on the local MDT will be linked under the dir
* only happened for the RPC from other MDT during the
* OI scrub, or for the client side RPC with FID only,
* such as FID to path, or from old connected client. */
- saved = result;
- result = osd_lookup_in_remote_parent(info, dev,
- fid, id);
- if (result == 0) {
- cached = true;
- goto iget;
+ if (!remote &&
+ !fid_is_on_ost(info, dev, fid, OI_CHECK_FLD)) {
+ rc1 = osd_lookup_in_remote_parent(info, dev,
+ fid, id);
+ if (rc1 == 0) {
+ remote = true;
+ cached = true;
+ flags |= SS_AUTO_PARTIAL;
+ flags &= ~SS_AUTO_FULL;
+ goto iget;
+ }
}
- result = saved;
+ if (thread_is_running(&scrub->os_thread)) {
+ if (scrub->os_partial_scan &&
+ !scrub->os_in_join) {
+ goto join;
+ } else {
+ if (inode != NULL && !IS_ERR(inode)) {
+ LASSERT(remote);
+
+ osd_add_oi_cache(info, dev, id,
+ fid);
+ osd_oii_insert(dev, oic, true);
+ } else {
+ result = -EINPROGRESS;
+ }
+ }
+ } else if (!dev->od_noscrub) {
+
+join:
+ rc1 = osd_scrub_start(dev, flags);
+ LCONSOLE_WARN("%.16s: trigger OI scrub by RPC "
+ "for the "DFID" with flags 0x%x,"
+ " rc = %d\n", osd_name(dev),
+ PFID(fid), flags, rc1);
+ if (rc1 == 0 || rc1 == -EALREADY) {
+ if (inode != NULL && !IS_ERR(inode)) {
+ LASSERT(remote);
+
+ osd_add_oi_cache(info, dev, id,
+ fid);
+ osd_oii_insert(dev, oic, true);
+ } else {
+ result = -EINPROGRESS;
+ }
+ } else {
+ result = -EREMCHG;
+ }
+ } else {
+ result = -EREMCHG;
+ }
}
- GOTO(out, result);
+ if (inode == NULL || IS_ERR(inode))
+ GOTO(out, result);
+ } else if (remote) {
+ goto trigger;
}
obj->oo_inode = inode;
}
iput(inode);
+ inode = NULL;
obj->oo_inode = NULL;
if (result != -EREMCHG)
GOTO(out, result);
return result;
}
+/*
+ * One cached extended attribute of an osd_object.
+ *
+ * oxe_buf starts with the xattr name, which is '\0' terminated; the
+ * remaining bytes hold the value in binary form.
+ */
+struct osd_xattr_entry {
+	struct list_head	oxe_list;	/* link in osd_object::oo_xattr_list */
+	size_t			oxe_len;	/* size of the whole allocation, header included */
+	size_t			oxe_namelen;	/* length of the name, '\0' not counted */
+	bool			oxe_exist;	/* false: negative entry, no value stored */
+	struct rcu_head		oxe_rcu;	/* used to defer the free through call_rcu() */
+	char			oxe_buf[];	/* name, '\0', then the value bytes */
+};
+
+/*
+ * Search the xattr cache of @obj for an entry whose name is @name
+ * (@namelen bytes long).
+ *
+ * Returns the matching entry, or NULL when nothing is cached under
+ * that name.
+ */
+static struct osd_xattr_entry *osd_oxc_lookup(struct osd_object *obj,
+					      const char *name,
+					      size_t namelen)
+{
+	struct osd_xattr_entry *cur;
+
+	list_for_each_entry(cur, &obj->oo_xattr_list, oxe_list) {
+		/* compare lengths first, then the name bytes */
+		if (cur->oxe_namelen != namelen)
+			continue;
+		if (strncmp(name, cur->oxe_buf, namelen) != 0)
+			continue;
+
+		return cur;
+	}
+
+	return NULL;
+}
+
+/*
+ * Fetch the cached value of xattr @name on @obj into @buf.
+ *
+ * Returns the value length on success (also when buf->lb_buf is NULL,
+ * in which case nothing is copied), -ENOENT when no entry is cached,
+ * -ENODATA when a negative entry is cached, and -ERANGE when
+ * buf->lb_len is smaller than the value.
+ *
+ * The lookup and the copy both run inside an RCU read-side section.
+ */
+static int osd_oxc_get(struct osd_object *obj, const char *name,
+		       struct lu_buf *buf)
+{
+	struct osd_xattr_entry *ent;
+	size_t vallen;
+	int rc;
+	ENTRY;
+
+	rcu_read_lock();
+
+	ent = osd_oxc_lookup(obj, name, strlen(name));
+	if (ent == NULL) {
+		rc = -ENOENT;
+		goto unlock;
+	}
+
+	if (!ent->oxe_exist) {
+		rc = -ENODATA;
+		goto unlock;
+	}
+
+	/* whatever follows the header, the name and its '\0' is the value */
+	vallen = ent->oxe_len - sizeof(*ent) - ent->oxe_namelen - 1;
+	LASSERT(vallen > 0);
+	rc = vallen;
+
+	/* no destination buffer: only the length is reported */
+	if (buf->lb_buf == NULL)
+		goto unlock;
+
+	if (buf->lb_len < vallen) {
+		rc = -ERANGE;
+		goto unlock;
+	}
+
+	memcpy(buf->lb_buf, ent->oxe_buf + ent->oxe_namelen + 1, vallen);
+
+unlock:
+	rcu_read_unlock();
+
+	RETURN(rc);
+}
+
+/*
+ * Callback handed to call_rcu(): frees the cache entry that embeds
+ * @head as its oxe_rcu member.
+ */
+static void osd_oxc_free(struct rcu_head *head)
+{
+	struct osd_xattr_entry *oxe =
+		container_of(head, struct osd_xattr_entry, oxe_rcu);
+
+	OBD_FREE(oxe, oxe->oxe_len);
+}
+
+/*
+ * Drop the cached entry for xattr @name from @obj, if there is one.
+ *
+ * Both callers in this file hold obj->oo_guard around the call.
+ * osd_oxc_get() walks the list under rcu_read_lock() only, so the entry
+ * has to be unlinked with list_del_rcu(): unlike list_del() it leaves
+ * the forward pointer of the removed node usable for a reader that is
+ * currently standing on it.  The memory itself is released through
+ * call_rcu().
+ */
+static inline void __osd_oxc_del(struct osd_object *obj, const char *name)
+{
+	struct osd_xattr_entry *oxe;
+
+	oxe = osd_oxc_lookup(obj, name, strlen(name));
+	if (oxe == NULL)
+		return;
+
+	list_del_rcu(&oxe->oxe_list);
+	call_rcu(&oxe->oxe_rcu, osd_oxc_free);
+}
+
+/*
+ * Cache xattr @name of @obj, replacing any entry already cached under
+ * that name.
+ *
+ * \param obj     object owning the cache
+ * \param name    '\0' terminated xattr name
+ * \param buf     value bytes; must not be NULL when \a buflen > 0
+ * \param buflen  value size; when it is not positive a negative entry
+ *                (oxe_exist == false) is stored instead of a value
+ *
+ * Nothing is cached when the allocation fails; no error is reported.
+ */
+static void osd_oxc_add(struct osd_object *obj, const char *name,
+			const char *buf, int buflen)
+{
+	struct osd_xattr_entry *oxe;
+	size_t namelen = strlen(name);
+	size_t len = sizeof(*oxe) + namelen + 1 + buflen;
+
+	OBD_ALLOC(oxe, len);
+	if (oxe == NULL)
+		return;
+
+	INIT_LIST_HEAD(&oxe->oxe_list);
+	oxe->oxe_len = len;
+	oxe->oxe_namelen = namelen;
+	memcpy(oxe->oxe_buf, name, namelen);
+	/* terminate the name explicitly instead of relying on the
+	 * allocator having zero-filled the buffer */
+	oxe->oxe_buf[namelen] = '\0';
+	if (buflen > 0) {
+		LASSERT(buf != NULL);
+		memcpy(oxe->oxe_buf + namelen + 1, buf, buflen);
+		oxe->oxe_exist = true;
+	} else {
+		oxe->oxe_exist = false;
+	}
+
+	/* this should be rarely called, just remove old and add new */
+	spin_lock(&obj->oo_guard);
+	__osd_oxc_del(obj, name);
+	/* osd_oxc_get() reads the list under rcu_read_lock() only, so the
+	 * fully initialised entry must be published with the RCU variant */
+	list_add_tail_rcu(&oxe->oxe_list, &obj->oo_xattr_list);
+	spin_unlock(&obj->oo_guard);
+}
+
+/*
+ * Remove the cached entry for xattr @name from @obj, taking
+ * obj->oo_guard around the removal.
+ */
+static void osd_oxc_del(struct osd_object *obj, const char *name)
+{
+	spinlock_t *guard = &obj->oo_guard;
+
+	spin_lock(guard);
+	__osd_oxc_del(obj, name);
+	spin_unlock(guard);
+}
+
+/*
+ * Empty the xattr cache of @obj: every entry is unlinked and freed on
+ * the spot, without taking oo_guard and without going through
+ * call_rcu().
+ */
+static void osd_oxc_fini(struct osd_object *obj)
+{
+	struct osd_xattr_entry *oxe;
+
+	/* keep popping the head until the list is drained */
+	while (!list_empty(&obj->oo_xattr_list)) {
+		oxe = list_first_entry(&obj->oo_xattr_list,
+				       struct osd_xattr_entry, oxe_list);
+		list_del(&oxe->oxe_list);
+		OBD_FREE(oxe, oxe->oxe_len);
+	}
+}
+
/*
* Concurrency: no concurrent access is possible that late in object
* life-cycle.
LINVRNT(osd_invariant(obj));
+ osd_oxc_fini(obj);
dt_object_fini(&obj->oo_dt);
if (obj->oo_hl_head != NULL)
ldiskfs_htree_lock_head_free(obj->oo_hl_head);
OBD_FREE_PTR(oh);
}
+/* When HAVE_SB_START_WRITE is not defined, sb_start_write() and
+ * sb_end_write() are provided here as macros that expand to an empty
+ * statement, so the callers below need no #ifdef of their own. */
+#ifndef HAVE_SB_START_WRITE
+# define sb_start_write(sb) do {} while (0)
+# define sb_end_write(sb) do {} while (0)
+#endif
+
static struct thandle *osd_trans_create(const struct lu_env *env,
struct dt_device *d)
{
/* on pending IO in this thread should left from prev. request */
LASSERT(atomic_read(&iobuf->dr_numreqs) == 0);
- th = ERR_PTR(-ENOMEM);
+ sb_start_write(osd_sb(osd_dt_dev(d)));
+
OBD_ALLOC_GFP(oh, sizeof *oh, GFP_NOFS);
if (oh != NULL) {
oh->ot_quota_trans = &oti->oti_quota_trans;
sizeof(oti->oti_declare_ops_cred));
memset(oti->oti_declare_ops_used, 0,
sizeof(oti->oti_declare_ops_used));
+ } else {
+ sb_end_write(osd_sb(osd_dt_dev(d)));
+ th = ERR_PTR(-ENOMEM);
}
RETURN(th);
}
if (unlikely(remove_agents != 0))
osd_process_scheduled_agent_removals(env, osd);
+ sb_end_write(osd_sb(osd));
+
RETURN(rc);
}
#ifdef HAVE_DEV_SET_RDONLY
CERROR("*** setting %s read-only ***\n", osd_dt_dev(d)->od_svname);
+ if (sb->s_op->freeze_fs) {
+ rc = sb->s_op->freeze_fs(sb);
+ if (rc)
+ goto out;
+ }
+
if (jdev && (jdev != dev)) {
CDEBUG(D_IOCTL | D_HA, "set journal dev %lx rdonly\n",
(long)jdev);
}
CDEBUG(D_IOCTL | D_HA, "set dev %lx rdonly\n", (long)dev);
dev_set_rdonly(dev);
-#else
- CERROR("%s: %lx CANNOT BE SET READONLY: rc = %d\n",
- osd_dt_dev(d)->od_svname, (long)dev, rc);
+
+ if (sb->s_op->unfreeze_fs)
+ sb->s_op->unfreeze_fs(sb);
+
+out:
#endif
+ if (rc)
+ CERROR("%s: %lx CANNOT BE SET READONLY: rc = %d\n",
+ osd_dt_dev(d)->od_svname, (long)dev, rc);
+
RETURN(rc);
}
if (bits == 0)
return 0;
- if (bits & LA_ATIME)
- inode->i_atime = *osd_inode_time(env, inode, attr->la_atime);
- if (bits & LA_CTIME)
- inode->i_ctime = *osd_inode_time(env, inode, attr->la_ctime);
- if (bits & LA_MTIME)
- inode->i_mtime = *osd_inode_time(env, inode, attr->la_mtime);
- if (bits & LA_SIZE) {
- LDISKFS_I(inode)->i_disksize = attr->la_size;
- i_size_write(inode, attr->la_size);
- }
+ if (bits & LA_ATIME)
+ inode->i_atime = *osd_inode_time(env, inode, attr->la_atime);
+ if (bits & LA_CTIME)
+ inode->i_ctime = *osd_inode_time(env, inode, attr->la_ctime);
+ if (bits & LA_MTIME)
+ inode->i_mtime = *osd_inode_time(env, inode, attr->la_mtime);
+ if (bits & LA_SIZE) {
+ LDISKFS_I(inode)->i_disksize = attr->la_size;
+ i_size_write(inode, attr->la_size);
+ }
-#if 0
- /* OSD should not change "i_blocks" which is used by quota.
- * "i_blocks" should be changed by ldiskfs only. */
- if (bits & LA_BLOCKS)
- inode->i_blocks = attr->la_blocks;
-#endif
+ /* OSD should not change "i_blocks" which is used by quota.
+ * "i_blocks" should be changed by ldiskfs only. */
if (bits & LA_MODE)
inode->i_mode = (inode->i_mode & S_IFMT) |
(attr->la_mode & ~S_IFMT);
if (bits & LA_RDEV)
inode->i_rdev = attr->la_rdev;
- if (bits & LA_FLAGS) {
- /* always keep S_NOCMTIME */
- inode->i_flags = ll_ext_to_inode_flags(attr->la_flags) |
- S_NOCMTIME;
- }
- return 0;
+ if (bits & LA_FLAGS) {
+ /* always keep S_NOCMTIME */
+ inode->i_flags = ll_ext_to_inode_flags(attr->la_flags) |
+ S_NOCMTIME;
+ }
+ return 0;
}
static int osd_quota_transfer(struct inode *inode, const struct lu_attr *attr)
osd_trans_exec_op(env, th, OSD_OT_INSERT);
osd_id_gen(id, obj->oo_inode->i_ino, obj->oo_inode->i_generation);
- rc = osd_oi_insert(info, osd, fid, id, oh->ot_handle, OI_CHECK_FLD);
+ rc = osd_oi_insert(info, osd, fid, id, oh->ot_handle,
+ OI_CHECK_FLD, NULL);
osd_trans_exec_check(env, th, OSD_OT_INSERT);
return rc;
static int osd_xattr_get(const struct lu_env *env, struct dt_object *dt,
struct lu_buf *buf, const char *name)
{
- struct osd_object *obj = osd_dt_obj(dt);
- struct inode *inode = obj->oo_inode;
- struct osd_thread_info *info = osd_oti_get(env);
- struct dentry *dentry = &info->oti_obj_dentry;
+ struct osd_object *obj = osd_dt_obj(dt);
+ struct inode *inode = obj->oo_inode;
+ struct osd_thread_info *info = osd_oti_get(env);
+ struct dentry *dentry = &info->oti_obj_dentry;
+ bool cache_xattr = false;
+ int rc;
- /* version get is not real XATTR but uses xattr API */
- if (strcmp(name, XATTR_NAME_VERSION) == 0) {
- /* for version we are just using xattr API but change inode
- * field instead */
+ /* version get is not real XATTR but uses xattr API */
+ if (strcmp(name, XATTR_NAME_VERSION) == 0) {
+ /* for version we are just using xattr API but change inode
+ * field instead */
if (buf->lb_len == 0)
return sizeof(dt_obj_version_t);
osd_object_version_get(env, dt, buf->lb_buf);
return sizeof(dt_obj_version_t);
- }
+ }
if (!dt_object_exists(dt))
return -ENOENT;
LASSERT(inode->i_op != NULL);
LASSERT(inode->i_op->getxattr != NULL);
- return __osd_xattr_get(inode, dentry, name, buf->lb_buf, buf->lb_len);
-}
+ if (strcmp(name, XATTR_NAME_LOV) == 0 ||
+ strcmp(name, XATTR_NAME_DEFAULT_LMV) == 0)
+ cache_xattr = true;
+
+ if (cache_xattr) {
+ rc = osd_oxc_get(obj, name, buf);
+ if (rc != -ENOENT)
+ return rc;
+ }
+ rc = __osd_xattr_get(inode, dentry, name, buf->lb_buf, buf->lb_len);
+ if (cache_xattr) {
+ if (rc == -ENOENT || rc == -ENODATA)
+ osd_oxc_add(obj, name, NULL, 0);
+ else if (rc > 0 && buf->lb_buf != NULL)
+ osd_oxc_add(obj, name, buf->lb_buf, rc);
+ }
+
+ return rc;
+}
static int osd_declare_xattr_set(const struct lu_env *env,
struct dt_object *dt,
int rc;
ENTRY;
- LASSERT(handle != NULL);
+ LASSERT(handle != NULL);
- /* version set is not real XATTR */
- if (strcmp(name, XATTR_NAME_VERSION) == 0) {
- /* for version we are just using xattr API but change inode
- * field instead */
- LASSERT(buf->lb_len == sizeof(dt_obj_version_t));
- osd_object_version_set(env, dt, buf->lb_buf);
- return sizeof(dt_obj_version_t);
- }
+ /* version set is not real XATTR */
+ if (strcmp(name, XATTR_NAME_VERSION) == 0) {
+ /* for version we are just using xattr API but change inode
+ * field instead */
+ LASSERT(buf->lb_len == sizeof(dt_obj_version_t));
+ osd_object_version_set(env, dt, buf->lb_buf);
+ return sizeof(dt_obj_version_t);
+ }
CDEBUG(D_INODE, DFID" set xattr '%s' with size %zu\n",
PFID(lu_object_fid(&dt->do_lu)), name, buf->lb_len);
fs_flags);
osd_trans_exec_check(env, handle, OSD_OT_XATTR_SET);
+ if (rc == 0 &&
+ (strcmp(name, XATTR_NAME_LOV) == 0 ||
+ strcmp(name, XATTR_NAME_DEFAULT_LMV) == 0))
+ osd_oxc_add(obj, name, buf->lb_buf, buf->lb_len);
+
return rc;
}
dentry->d_sb = inode->i_sb;
rc = inode->i_op->removexattr(dentry, name);
osd_trans_exec_check(env, handle, OSD_OT_XATTR_SET);
+
+ if (rc == 0 &&
+ (strcmp(name, XATTR_NAME_LOV) == 0 ||
+ strcmp(name, XATTR_NAME_DEFAULT_LMV) == 0))
+ osd_oxc_del(obj, name);
+
return rc;
}
RETURN(rc);
}
+/*
+ * Handler installed as .do_invalidate in the dt_object_operations
+ * tables of this file.  It does nothing with @env or @dt and always
+ * returns 0.
+ */
+static int osd_invalidate(const struct lu_env *env, struct dt_object *dt)
+{
+ return 0;
+}
+
/*
* Index operations.
*/
.do_xattr_del = osd_xattr_del,
.do_xattr_list = osd_xattr_list,
.do_object_sync = osd_object_sync,
+ .do_invalidate = osd_invalidate,
};
/**
.do_xattr_del = osd_xattr_del,
.do_xattr_list = osd_xattr_list,
.do_object_sync = osd_object_sync,
+ .do_invalidate = osd_invalidate,
};
static const struct dt_object_operations osd_obj_otable_it_ops = {
}
bh = osd_ldiskfs_find_entry(dir, &dentry->d_name, &de, NULL, hlock);
- if (bh) {
+ if (!IS_ERR(bh)) {
/* If this is not the ".." entry, it might be a remote DNE
* entry and we need to check if the FID is for a remote
* MDT. If the FID is not in the directory entry (e.g.
le32_to_cpu(de->inode));
}
}
- rc = ldiskfs_delete_entry(oh->ot_handle, dir, de, bh);
- brelse(bh);
- } else {
- rc = -ENOENT;
- }
+ rc = ldiskfs_delete_entry(oh->ot_handle, dir, de, bh);
+ brelse(bh);
+ } else {
+ rc = PTR_ERR(bh);
+ }
if (hlock != NULL)
ldiskfs_htree_unlock(hlock);
else
bh = osd_ldiskfs_find_entry(pobj->oo_inode, &child->d_name, &de,
NULL, hlock);
- if (bh != NULL) {
+ if (!IS_ERR(bh)) {
rc1 = ldiskfs_journal_get_write_access(oth->ot_handle,
bh);
if (rc1 == 0) {
LDISKFS_FT_DIR;
ldiskfs_handle_dirty_metadata(oth->ot_handle,
NULL, bh);
- brelse(bh);
}
+ brelse(bh);
}
}
}
bh = osd_ldiskfs_find_entry(dir, &dentry->d_name, &de, NULL, hlock);
- if (bh) {
+ if (!IS_ERR(bh)) {
struct osd_thread_info *oti = osd_oti_get(env);
struct osd_inode_id *id = &oti->oti_id;
struct osd_idmap_cache *oic = &oti->oti_cache;
if (rc != 0)
fid_zero(&oic->oic_fid);
} else {
- rc = -ENOENT;
+ rc = PTR_ERR(bh);
}
GOTO(out, rc);
* For the whole directory, only dot/dotdot entry have no FID-in-dirent
* and needs to get FID from LMA when readdir, it will not affect the
* performance much. */
- if ((bh == NULL) || (le32_to_cpu(de->inode) != inode->i_ino) ||
+ if (IS_ERR(bh) || (le32_to_cpu(de->inode) != inode->i_ino) ||
(dot_dotdot != 0 && !osd_dot_dotdot_has_space(de, dot_dotdot))) {
*attr |= LUDA_IGNORE;
GOTO(out, rc);
out:
- brelse(bh);
+ if (!IS_ERR(bh))
+ brelse(bh);
if (hlock != NULL) {
ldiskfs_htree_unlock(hlock);
} else {
struct osd_thread_info *info = osd_oti_get(env);
struct lu_fid *fid = &info->oti_fid;
struct inode *inode;
- int rc = 0, force_over_128tb = 0;
+ int rc = 0, force_over_256tb = 0;
ENTRY;
if (o->od_mnt != NULL)
RETURN(-EINVAL);
}
#endif
- if (opts != NULL && strstr(opts, "force_over_128tb") != NULL)
- force_over_128tb = 1;
+ if (opts != NULL && strstr(opts, "force_over_128tb") != NULL) {
+ CWARN("force_over_128tb option is depricated."
+ "Filesystems less then 256TB can be created without any"
+ "force options. Use force_over_256tb option for"
+ "filesystems greather then 256TB.\n");
+ }
+
+ if (opts != NULL && strstr(opts, "force_over_256tb") != NULL)
+ force_over_256tb = 1;
- __page = alloc_page(GFP_IOFS);
+ __page = alloc_page(GFP_KERNEL);
if (__page == NULL)
GOTO(out, rc = -ENOMEM);
page = (unsigned long)page_address(__page);
"noextents",
/* strip out option we processed in osd */
"bigendian_extents",
- "force_over_128tb",
+#if LUSTRE_VERSION_CODE >= OBD_OCD_VERSION(3,0,53,0)
+#warning "remove force_over_128 option"
+#else
+ "force_over_128tb (deprecated)",
+#endif
+ "force_over_256tb",
NULL
};
strcat(options, opts);
/* Glom up mount options */
if (*options != '\0')
strcat(options, ",");
- strlcat(options, "no_mbcache", PAGE_CACHE_SIZE);
+ strlcat(options, "no_mbcache", PAGE_SIZE);
type = get_fs_type("ldiskfs");
if (!type) {
GOTO(out, rc);
}
- if (ldiskfs_blocks_count(LDISKFS_SB(osd_sb(o))->s_es) > (8ULL << 32) &&
- force_over_128tb == 0) {
+ if (ldiskfs_blocks_count(LDISKFS_SB(osd_sb(o))->s_es) > (64ULL << 30) &&
+ force_over_256tb == 0) {
CERROR("%s: device %s LDISKFS does not support filesystems "
- "greater than 128TB and can cause data corruption. "
- "Use \"force_over_128tb\" mount option to override.\n",
+ "greater than 256TB and can cause data corruption. "
+ "Use \"force_over_256tb\" mount option to override.\n",
name, dev);
GOTO(out, rc = -EINVAL);
}