#include <ldiskfs/ldiskfs.h>
#include <ldiskfs/xattr.h>
+#include <ldiskfs/ldiskfs_extents.h>
#undef ENTRY
/*
* struct OBD_{ALLOC,FREE}*()
#include <lustre_linkea.h>
int ldiskfs_pdo = 1;
-CFS_MODULE_PARM(ldiskfs_pdo, "i", int, 0644,
- "ldiskfs with parallel directory operations");
+module_param(ldiskfs_pdo, int, 0644);
+MODULE_PARM_DESC(ldiskfs_pdo, "ldiskfs with parallel directory operations");
int ldiskfs_track_declares_assert;
-CFS_MODULE_PARM(ldiskfs_track_declares_assert, "i", int, 0644,
- "LBUG during tracking of declares");
+module_param(ldiskfs_track_declares_assert, int, 0644);
+MODULE_PARM_DESC(ldiskfs_track_declares_assert, "LBUG during tracking of declares");
/* Slab to allocate dynlocks */
struct kmem_cache *dynlock_cachep;
struct osd_scrub *scrub;
struct scrub_file *sf;
int result;
- int saved = 0;
- bool cached = true;
- bool triggered = false;
+ int rc1 = 0;
+ bool cached = true;
+ bool remote = false;
ENTRY;
LINVRNT(osd_invariant(obj));
if (result == -EREMCHG) {
trigger:
- if (unlikely(triggered))
- GOTO(out, result = saved);
-
- triggered = true;
- if (thread_is_running(&scrub->os_thread)) {
- result = -EINPROGRESS;
- } else if (!dev->od_noscrub) {
- result = osd_scrub_start(dev, SS_AUTO_FULL |
- SS_CLEAR_DRYRUN | SS_CLEAR_FAILOUT);
- LCONSOLE_WARN("%.16s: trigger OI scrub by RPC "
- "for "DFID", rc = %d [1]\n",
- osd_name(dev), PFID(fid), result);
- if (result == 0 || result == -EALREADY)
- result = -EINPROGRESS;
- else
- result = -EREMCHG;
- } else {
- result = -EREMCHG;
- }
-
- if (fid_is_on_ost(info, dev, fid, OI_CHECK_FLD))
- GOTO(out, result);
-
/* We still have chance to get the valid inode: for the
* object which is referenced by remote name entry, the
* object on the local MDT will be linked under the dir
* only happened for the RPC from other MDT during the
* OI scrub, or for the client side RPC with FID only,
* such as FID to path, or from old connected client. */
- saved = result;
- result = osd_lookup_in_remote_parent(info, dev,
- fid, id);
- if (result == 0) {
- cached = true;
- goto iget;
+ if (!remote &&
+ !fid_is_on_ost(info, dev, fid, OI_CHECK_FLD)) {
+ rc1 = osd_lookup_in_remote_parent(info, dev,
+ fid, id);
+ if (rc1 == 0) {
+ remote = true;
+ cached = true;
+ goto iget;
+ }
}
- result = saved;
+ if (thread_is_running(&scrub->os_thread)) {
+ if (remote) {
+ osd_add_oi_cache(info, dev, id, fid);
+ osd_oii_insert(dev, oic, true);
+ } else {
+ result = -EINPROGRESS;
+ }
+ } else if (!dev->od_noscrub) {
+ __u32 flags = SS_CLEAR_DRYRUN |
+ SS_CLEAR_FAILOUT;
+
+ flags |= (remote ? SS_AUTO_PARTIAL :
+ SS_AUTO_FULL);
+ rc1 = osd_scrub_start(dev, flags);
+ LCONSOLE_WARN("%.16s: trigger OI scrub by RPC "
+ "for the "DFID" with flags 0x%x,"
+ " rc = %d\n", osd_name(dev),
+ PFID(fid), flags, rc1);
+ if (rc1 == 0 || rc1 == -EALREADY) {
+ result = -EINPROGRESS;
+ if (remote) {
+ osd_add_oi_cache(info, dev, id,
+ fid);
+ osd_oii_insert(dev, oic, true);
+ }
+ } else {
+ result = -EREMCHG;
+ }
+ } else {
+ result = -EREMCHG;
+ }
}
GOTO(out, result);
+ } else if (remote) {
+ result = 0;
+ goto trigger;
}
obj->oo_inode = inode;
OBD_FREE_PTR(oh);
}
+#ifndef HAVE_SB_START_WRITE /* fallback: both macros expand to empty statements */
+# define sb_start_write(sb) do {} while (0)
+# define sb_end_write(sb) do {} while (0)
+#endif /* !HAVE_SB_START_WRITE */
+
static struct thandle *osd_trans_create(const struct lu_env *env,
struct dt_device *d)
{
/* no pending IO in this thread should be left over from the previous request */
LASSERT(atomic_read(&iobuf->dr_numreqs) == 0);
- th = ERR_PTR(-ENOMEM);
+ sb_start_write(osd_sb(osd_dt_dev(d)));
+
OBD_ALLOC_GFP(oh, sizeof *oh, GFP_NOFS);
if (oh != NULL) {
oh->ot_quota_trans = &oti->oti_quota_trans;
sizeof(oti->oti_declare_ops_cred));
memset(oti->oti_declare_ops_used, 0,
sizeof(oti->oti_declare_ops_used));
+ } else {
+ sb_end_write(osd_sb(osd_dt_dev(d)));
+ th = ERR_PTR(-ENOMEM);
}
RETURN(th);
}
static unsigned long last_printed;
static int last_credits;
- CWARN("%.16s: too many transaction credits (%d > %d)\n",
- LDISKFS_SB(osd_sb(dev))->s_es->s_volume_name,
- oh->ot_credits,
- osd_journal(dev)->j_max_transaction_buffers);
-
- osd_trans_dump_creds(env, th);
-
+ /* don't make noise on tiny testing systems;
+ * actual credit misuse will be caught anyway */
if (last_credits != oh->ot_credits &&
time_after(jiffies, last_printed +
- msecs_to_jiffies(60 * MSEC_PER_SEC))) {
+ msecs_to_jiffies(60 * MSEC_PER_SEC)) &&
+ osd_transaction_size(dev) > 512) {
+ osd_trans_dump_creds(env, th);
libcfs_debug_dumpstack(NULL);
last_credits = oh->ot_credits;
last_printed = jiffies;
if (unlikely(remove_agents != 0))
osd_process_scheduled_agent_removals(env, osd);
+ sb_end_write(osd_sb(osd));
+
RETURN(rc);
}
static void osd_object_release(const struct lu_env *env,
struct lu_object *l)
{
+ struct osd_object *o = osd_obj(l);
+ /* Sanity check on release: nobody should be releasing a
+ * non-destroyed object whose inode has nlink == 0. The API allows
+ * this, but ldiskfs does not like it and then reports this inode
+ * as deleted, so hit LBUG() if it ever happens. */
+ if (unlikely(!o->oo_destroyed && o->oo_inode && o->oo_inode->i_nlink == 0))
+ LBUG();
}
/*
d ? d->id_ops->id_name : "plain");
}
-#define GRANT_FOR_LOCAL_OIDS 32 /* 128kB for last_rcvd, quota files, ... */
-
/*
* Concurrency: shouldn't matter.
*/
int osd_statfs(const struct lu_env *env, struct dt_device *d,
struct obd_statfs *sfs)
{
- struct osd_device *osd = osd_dt_dev(d);
- struct super_block *sb = osd_sb(osd);
- struct kstatfs *ksfs;
- int result = 0;
+ struct osd_device *osd = osd_dt_dev(d);
+ struct super_block *sb = osd_sb(osd);
+ struct kstatfs *ksfs;
+ __u64 reserved;
+ int result = 0;
if (unlikely(osd->od_mnt == NULL))
return -EINPROGRESS;
ksfs = &osd_oti_get(env)->oti_ksfs;
}
- spin_lock(&osd->od_osfs_lock);
result = sb->s_op->statfs(sb->s_root, ksfs);
- if (likely(result == 0)) { /* N.B. statfs can't really fail */
- statfs_pack(sfs, ksfs);
- if (unlikely(sb->s_flags & MS_RDONLY))
- sfs->os_state = OS_STATE_READONLY;
- if (LDISKFS_HAS_INCOMPAT_FEATURE(sb,
- LDISKFS_FEATURE_INCOMPAT_EXTENTS))
- sfs->os_maxbytes = sb->s_maxbytes;
- else
- sfs->os_maxbytes = LDISKFS_SB(sb)->s_bitmap_maxbytes;
- }
- spin_unlock(&osd->od_osfs_lock);
+ if (result)
+ goto out;
+
+ statfs_pack(sfs, ksfs);
+ if (unlikely(sb->s_flags & MS_RDONLY))
+ sfs->os_state = OS_STATE_READONLY;
+ if (LDISKFS_HAS_INCOMPAT_FEATURE(sb,
+ LDISKFS_FEATURE_INCOMPAT_EXTENTS))
+ sfs->os_maxbytes = sb->s_maxbytes;
+ else
+ sfs->os_maxbytes = LDISKFS_SB(sb)->s_bitmap_maxbytes;
- if (unlikely(env == NULL))
- OBD_FREE_PTR(ksfs);
+ /*
+ * Reserve some space so as to avoid fragmenting the filesystem too much.
+ * Fragmentation not only impacts performance, but can also increase
+ * metadata overhead significantly, causing grant calculation to be
+ * wrong.
+ *
+ * Reserve 0.78% of total space, at least 8MB for small filesystems.
+ */
+ CLASSERT(OSD_STATFS_RESERVED > LDISKFS_MAX_BLOCK_SIZE);
+ reserved = OSD_STATFS_RESERVED >> sb->s_blocksize_bits;
+ if (likely(sfs->os_blocks >= reserved << OSD_STATFS_RESERVED_SHIFT))
+ reserved = sfs->os_blocks >> OSD_STATFS_RESERVED_SHIFT;
- /* Reserve a small amount of space for local objects like last_rcvd,
- * llog, quota files, ... */
- if (sfs->os_bavail <= GRANT_FOR_LOCAL_OIDS) {
- sfs->os_bavail = 0;
- } else {
- sfs->os_bavail -= GRANT_FOR_LOCAL_OIDS;
- /** Take out metadata overhead for indirect blocks */
- sfs->os_bavail -= sfs->os_bavail >> (sb->s_blocksize_bits - 3);
- }
+ sfs->os_blocks -= reserved;
+ sfs->os_bfree -= min(reserved, sfs->os_bfree);
+ sfs->os_bavail -= min(reserved, sfs->os_bavail);
- return result;
+out:
+ if (unlikely(env == NULL))
+ OBD_FREE_PTR(ksfs);
+ return result;
}
/**
*/
param->ddp_max_name_len = LDISKFS_NAME_LEN;
param->ddp_max_nlink = LDISKFS_LINK_MAX;
- param->ddp_block_shift = sb->s_blocksize_bits;
+ param->ddp_symlink_max = sb->s_blocksize;
param->ddp_mount_type = LDD_MT_LDISKFS;
if (LDISKFS_HAS_INCOMPAT_FEATURE(sb, LDISKFS_FEATURE_INCOMPAT_EXTENTS))
param->ddp_maxbytes = sb->s_maxbytes;
else
param->ddp_maxbytes = LDISKFS_SB(sb)->s_bitmap_maxbytes;
- /* Overhead estimate should be fairly accurate, so we really take a tiny
- * error margin which also avoids fragmenting the filesystem too much */
- param->ddp_grant_reserved = 2; /* end up to be 1.9% after conversion */
/* inodes are statically allocated, so the per-inode space consumption
* is the space consumed by the directory entry */
param->ddp_inodespace = PER_OBJ_USAGE;
- /* per-fragment overhead to be used by the client code */
- param->ddp_grant_frag = 6 * LDISKFS_BLOCK_SIZE(sb);
- param->ddp_mntopts = 0;
+ /* EXT_INIT_MAX_LEN is the theoretical maximum extent size (32k blocks
+ * = 128MB) which is unlikely to be hit in real life. Report a smaller
+ * maximum length so as not to undercount the actual number of extents
+ * needed for writing a file. */
+ param->ddp_max_extent_blks = EXT_INIT_MAX_LEN >> 2;
+ /* worst-case extent insertion metadata overhead */
+ param->ddp_extent_tax = 6 * LDISKFS_BLOCK_SIZE(sb);
+ param->ddp_mntopts = 0;
if (test_opt(sb, XATTR_USER))
param->ddp_mntopts |= MNTOPT_USERXATTR;
if (test_opt(sb, POSIX_ACL))
#ifdef HAVE_DEV_SET_RDONLY
CERROR("*** setting %s read-only ***\n", osd_dt_dev(d)->od_svname);
+ if (sb->s_op->freeze_fs) {
+ rc = sb->s_op->freeze_fs(sb);
+ if (rc)
+ goto out;
+ }
+
if (jdev && (jdev != dev)) {
CDEBUG(D_IOCTL | D_HA, "set journal dev %lx rdonly\n",
(long)jdev);
}
CDEBUG(D_IOCTL | D_HA, "set dev %lx rdonly\n", (long)dev);
dev_set_rdonly(dev);
-#else
- CERROR("%s: %lx CANNOT BE SET READONLY: rc = %d\n",
- osd_dt_dev(d)->od_svname, (long)dev, rc);
+
+ if (sb->s_op->unfreeze_fs)
+ sb->s_op->unfreeze_fs(sb);
+
+out:
#endif
+ if (rc)
+ CERROR("%s: %lx CANNOT BE SET READONLY: rc = %d\n",
+ osd_dt_dev(d)->od_svname, (long)dev, rc);
+
RETURN(rc);
}
osd_trans_exec_op(env, th, OSD_OT_INSERT);
osd_id_gen(id, obj->oo_inode->i_ino, obj->oo_inode->i_generation);
- rc = osd_oi_insert(info, osd, fid, id, oh->ot_handle, OI_CHECK_FLD);
+ rc = osd_oi_insert(info, osd, fid, id, oh->ot_handle,
+ OI_CHECK_FLD, NULL);
osd_trans_exec_check(env, th, OSD_OT_INSERT);
return rc;
struct osd_thandle *oh;
int rc = 0;
- if (!dt_object_exists(dt))
+ if (!dt_object_exists(dt) || obj->oo_destroyed)
return -ENOENT;
LINVRNT(osd_invariant(obj));
* \retval 0 on success
* \retval 1 on buffer full
*/
+#ifdef HAVE_FILLDIR_USE_CTX
+static int osd_ldiskfs_filldir(struct dir_context *buf,
+ const char *name, int namelen,
+#else
static int osd_ldiskfs_filldir(void *buf, const char *name, int namelen,
+#endif
loff_t offset, __u64 ino,
unsigned d_type)
{
- struct osd_it_ea *it = ((struct osd_filldir_cbs *)buf)->it;
+ struct osd_it_ea *it =
+ ((struct osd_filldir_cbs *)buf)->it;
struct osd_object *obj = it->oie_obj;
struct osd_it_ea_dirent *ent = it->oie_dirent;
struct lu_fid *fid = &ent->oied_fid;
struct osd_thread_info *info = osd_oti_get(env);
struct lu_fid *fid = &info->oti_fid;
struct inode *inode;
- int rc = 0, force_over_128tb = 0;
+ int rc = 0, force_over_256tb = 0;
ENTRY;
if (o->od_mnt != NULL)
RETURN(-EINVAL);
}
#endif
- if (opts != NULL && strstr(opts, "force_over_128tb") != NULL)
- force_over_128tb = 1;
+ /* The old force_over_128tb option is still recognised here, but in
+ * this code it only triggers the warning below; the size override
+ * is controlled by force_over_256tb. Adjacent string literals are
+ * concatenated verbatim, so every piece must end with a space. */
+ if (opts != NULL && strstr(opts, "force_over_128tb") != NULL) {
+ CWARN("force_over_128tb option is deprecated. "
+ "Filesystems less than 256TB can be created without any "
+ "force options. Use force_over_256tb option for "
+ "filesystems greater than 256TB.\n");
+ }
+
+ if (opts != NULL && strstr(opts, "force_over_256tb") != NULL)
+ force_over_256tb = 1;
- __page = alloc_page(GFP_IOFS);
+ __page = alloc_page(GFP_KERNEL);
if (__page == NULL)
GOTO(out, rc = -ENOMEM);
page = (unsigned long)page_address(__page);
"noextents",
/* strip out option we processed in osd */
"bigendian_extents",
- "force_over_128tb",
+#if LUSTRE_VERSION_CODE >= OBD_OCD_VERSION(3,0,53,0)
+#warning "remove force_over_128 option"
+#else
+ "force_over_128tb (deprecated)", /* NOTE(review): this entry carries
+ * a (deprecated) suffix that the
+ * option name tested with strstr()
+ * earlier in this file does not
+ * have; confirm the strip logic
+ * still matches the real option */
+#endif
+ "force_over_256tb",
NULL
};
strcat(options, opts);
GOTO(out, rc);
}
- if (ldiskfs_blocks_count(LDISKFS_SB(osd_sb(o))->s_es) > (8ULL << 32) &&
- force_over_128tb == 0) {
+ if (ldiskfs_blocks_count(LDISKFS_SB(osd_sb(o))->s_es) > (64ULL << 30) &&
+ force_over_256tb == 0) {
CERROR("%s: device %s LDISKFS does not support filesystems "
- "greater than 128TB and can cause data corruption. "
- "Use \"force_over_128tb\" mount option to override.\n",
+ "greater than 256TB and can cause data corruption. "
+ "Use \"force_over_256tb\" mount option to override.\n",
name, dev);
GOTO(out, rc = -EINVAL);
}