struct llog_handle *loghandle,
struct thandle *th)
{
-
+ struct llog_thread_info *lgi = llog_info(env);
+ struct llog_logid_rec *rec = &lgi->lgi_logid;
struct llog_log_hdr *llh;
- struct llog_logid_rec rec = { { 0 }, };
int rc, index, bitmap_size;
ENTRY;
DOSTID"\n", POSTID(&loghandle->lgh_id.lgl_oi),
loghandle->lgh_id.lgl_ogen, index,
POSTID(&cathandle->lgh_id.lgl_oi));
- /* build the record for this log in the catalog */
- rec.lid_hdr.lrh_len = sizeof(rec);
- rec.lid_hdr.lrh_index = index;
- rec.lid_hdr.lrh_type = LLOG_LOGID_MAGIC;
- rec.lid_id = loghandle->lgh_id;
- rec.lid_tail.lrt_len = sizeof(rec);
- rec.lid_tail.lrt_index = index;
+ /* build the record for this log in the catalog */
+ rec->lid_hdr.lrh_len = sizeof(*rec);
+ rec->lid_hdr.lrh_index = index;
+ rec->lid_hdr.lrh_type = LLOG_LOGID_MAGIC;
+ rec->lid_id = loghandle->lgh_id;
+ rec->lid_tail.lrt_len = sizeof(*rec);
+ rec->lid_tail.lrt_index = index;
/* update the catalog: header and record */
- rc = llog_write_rec(env, cathandle, &rec.lid_hdr,
+ rc = llog_write_rec(env, cathandle, &rec->lid_hdr,
&loghandle->u.phd.phd_cookie, 1, NULL, index, th);
if (rc < 0)
GOTO(out_destroy, rc);
struct llog_handle *cathandle,
struct llog_rec_hdr *rec, struct thandle *th)
{
+ struct llog_thread_info *lgi = llog_info(env);
+ struct llog_logid_rec *lirec = &lgi->lgi_logid;
struct llog_handle *loghandle, *next;
int rc = 0;
if (rc)
GOTO(out, rc);
+ lirec->lid_hdr.lrh_len = sizeof(*lirec);
+
if (!llog_exist(cathandle->u.chd.chd_current_log)) {
rc = llog_declare_create(env, cathandle->u.chd.chd_current_log,
th);
if (rc)
GOTO(out, rc);
- llog_declare_write_rec(env, cathandle, NULL, -1, th);
+ llog_declare_write_rec(env, cathandle, &lirec->lid_hdr, -1, th);
}
/* declare records in the llogs */
rc = llog_declare_write_rec(env, cathandle->u.chd.chd_current_log,
if (next) {
if (!llog_exist(next)) {
rc = llog_declare_create(env, next, th);
- llog_declare_write_rec(env, cathandle, NULL, -1, th);
+ llog_declare_write_rec(env, cathandle, &lirec->lid_hdr,
+ -1, th);
}
- llog_declare_write_rec(env, next, rec, -1, th);
+ /* XXX: we rely on the declarations already made for the existing
+ * llog; this might not be correct with some backends where
+ * declarations are expected against a specific object, like ZFS
+ * with full debugging enabled */
+ /*llog_declare_write_rec(env, next, rec, -1, th);*/
}
out:
RETURN(rc);
loff_t lgi_off;
struct llog_rec_hdr lgi_lrh;
struct llog_rec_tail lgi_tail;
+ struct llog_logid_rec lgi_logid;
};
extern struct lu_context_key llog_thread_key;
LASSERT(env);
LASSERT(th);
LASSERT(loghandle);
+ LASSERT(rec);
+ LASSERT(rec->lrh_len <= LLOG_CHUNK_SIZE);
o = loghandle->lgh_obj;
LASSERT(o);
lgi->lgi_off = 0;
}
- lgi->lgi_buf.lb_len = 32 * 1024;
+ lgi->lgi_buf.lb_len = rec->lrh_len;
lgi->lgi_buf.lb_buf = NULL;
/* XXX: implement declared window or multi-chunks approach */
- rc = dt_declare_record_write(env, o, &lgi->lgi_buf, lgi->lgi_off, th);
+ rc = dt_declare_record_write(env, o, &lgi->lgi_buf, -1, th);
RETURN(rc);
}
if (rc)
RETURN(rc);
- lgi->lgi_buf.lb_len = LLOG_CHUNK_SIZE;
- lgi->lgi_buf.lb_buf = NULL;
- rc = dt_declare_record_write(env, o, &lgi->lgi_buf, 0, th);
- if (rc)
- RETURN(rc);
+ /* do not declare header initialization here as it's declared
+ * in llog_osd_declare_write_rec() which is always called */
if (res->lgh_name) {
struct dt_object *llog_dir;
LDISKFS_SB(osd_sb(dev))->s_es->s_volume_name,
oh->ot_credits,
osd_journal(dev)->j_max_transaction_buffers);
- CWARN(" create: %u/%u, delete: %u/%u, destroy: %u/%u\n",
+ CWARN(" create: %u/%u, destroy: %u/%u\n",
oti->oti_declare_ops[OSD_OT_CREATE],
oti->oti_declare_ops_cred[OSD_OT_CREATE],
- oti->oti_declare_ops[OSD_OT_DELETE],
- oti->oti_declare_ops_cred[OSD_OT_DELETE],
oti->oti_declare_ops[OSD_OT_DESTROY],
oti->oti_declare_ops_cred[OSD_OT_DESTROY]);
CWARN(" attr_set: %u/%u, xattr_set: %u/%u\n",
CWARN(" insert: %u/%u, delete: %u/%u\n",
oti->oti_declare_ops[OSD_OT_INSERT],
oti->oti_declare_ops_cred[OSD_OT_INSERT],
- oti->oti_declare_ops[OSD_OT_DESTROY],
- oti->oti_declare_ops_cred[OSD_OT_DESTROY]);
+ oti->oti_declare_ops[OSD_OT_DELETE],
+ oti->oti_declare_ops_cred[OSD_OT_DELETE]);
CWARN(" ref_add: %u/%u, ref_del: %u/%u\n",
oti->oti_declare_ops[OSD_OT_REF_ADD],
oti->oti_declare_ops_cred[OSD_OT_REF_ADD],
* If we mount with --data_journal we may need more.
*/
const int osd_dto_credits_noquota[DTO_NR] = {
- /**
- * Insert/Delete.
- * INDEX_EXTRA_TRANS_BLOCKS(8) +
- * SINGLEDATA_TRANS_BLOCKS(8)
- * XXX Note: maybe iam need more, since iam have more level than
- * EXT3 htree.
- */
- [DTO_INDEX_INSERT] = 16,
- [DTO_INDEX_DELETE] = 16,
- /**
+ /**
+ * Insert.
+ * INDEX_EXTRA_TRANS_BLOCKS(8) +
+ * SINGLEDATA_TRANS_BLOCKS(8)
+ * XXX Note: IAM may need more, since it has more levels than the
+ * EXT3 htree.
+ */
+ [DTO_INDEX_INSERT] = 16,
+ /**
+ * Delete.
+ * Only modifies a single entry, possibly merging a few within a block.
+ */
+ [DTO_INDEX_DELETE] = 1,
+ /**
 * Used for OI scrub
- */
- [DTO_INDEX_UPDATE] = 16,
- /**
- * Create a object. The same as create object in EXT3.
- * DATA_TRANS_BLOCKS(14) +
- * INDEX_EXTRA_BLOCKS(8) +
- * 3(inode bits, groups, GDT)
- */
- [DTO_OBJECT_CREATE] = 25,
- /**
- * XXX: real credits to be fixed
- */
- [DTO_OBJECT_DELETE] = 25,
- /**
- * Attr set credits (inode)
- */
- [DTO_ATTR_SET_BASE] = 1,
- /**
- * Xattr set. The same as xattr of EXT3.
- * DATA_TRANS_BLOCKS(14)
- * XXX Note: in original MDS implmentation INDEX_EXTRA_TRANS_BLOCKS
- * are also counted in. Do not know why?
- */
- [DTO_XATTR_SET] = 14,
- [DTO_LOG_REC] = 14,
- /**
- * credits for inode change during write.
- */
- [DTO_WRITE_BASE] = 3,
- /**
- * credits for single block write.
- */
- [DTO_WRITE_BLOCK] = 14,
- /**
- * Attr set credits for chown.
- * This is extra credits for setattr, and it is null without quota
- */
- [DTO_ATTR_SET_CHOWN]= 0
+ */
+ [DTO_INDEX_UPDATE] = 16,
+ /**
+ * 4(inode, inode bits, groups, GDT)
+ * notice: OI updates are counted separately with DTO_INDEX_INSERT
+ */
+ [DTO_OBJECT_CREATE] = 4,
+ /**
+ * 4(inode, inode bits, groups, GDT)
+ * notice: OI updates are counted separately with DTO_INDEX_DELETE
+ */
+ [DTO_OBJECT_DELETE] = 4,
+ /**
+ * Attr set credits (inode)
+ */
+ [DTO_ATTR_SET_BASE] = 1,
+ /**
+ * Xattr set. The same as xattr of EXT3.
+ * DATA_TRANS_BLOCKS(14)
+ * XXX Note: in the original MDS implementation INDEX_EXTRA_TRANS_BLOCKS
+ * are also counted in. Do not know why?
+ */
+ [DTO_XATTR_SET] = 14,
+ /**
+ * credits for inode change during write.
+ */
+ [DTO_WRITE_BASE] = 3,
+ /**
+ * credits for single block write.
+ */
+ [DTO_WRITE_BLOCK] = 14,
+ /**
+ * Attr set credits for chown.
+ * This is extra credits for setattr, and it is null without quota
+ */
+ [DTO_ATTR_SET_CHOWN] = 0
};
static const struct dt_device_operations osd_dt_ops = {
osd_trans_declare_op(env, oh, OSD_OT_CREATE,
osd_dto_credits_noquota[DTO_OBJECT_CREATE]);
- if (!fid_is_on_ost(osd_oti_get(env), osd_dt_dev(handle->th_dev),
- lu_object_fid(&dt->do_lu), OI_CHECK_FLD))
- /* Reuse idle OI block may cause additional one OI block
- * to be changed. */
- osd_trans_declare_op(env, oh, OSD_OT_INSERT,
- osd_dto_credits_noquota[DTO_INDEX_INSERT] + 1);
+ /* Reuse idle OI block may cause additional one OI block
+ * to be changed. */
+ osd_trans_declare_op(env, oh, OSD_OT_INSERT,
+ osd_dto_credits_noquota[DTO_INDEX_INSERT] + 1);
/* If this is directory, then we expect . and .. to be inserted as
* well. The one directory block always needs to be created for the
int fl, struct thandle *handle)
{
struct osd_thandle *oh;
+ int credits;
LASSERT(handle != NULL);
oh = container_of0(handle, struct osd_thandle, ot_super);
LASSERT(oh->ot_handle == NULL);
- osd_trans_declare_op(env, oh, OSD_OT_XATTR_SET,
- strcmp(name, XATTR_NAME_VERSION) == 0 ?
- osd_dto_credits_noquota[DTO_ATTR_SET_BASE] :
- osd_dto_credits_noquota[DTO_XATTR_SET]);
+ /* optimistic optimization: LMA is set first and usually fit inode */
+ if (strcmp(name, XATTR_NAME_LMA) == 0) {
+ if (dt_object_exists(dt))
+ credits = 0;
+ else
+ credits = 1;
+ } else if (strcmp(name, XATTR_NAME_VERSION) == 0) {
+ credits = 1;
+ } else {
+ struct osd_device *osd = osd_dev(dt->do_lu.lo_dev);
+ struct super_block *sb = osd_sb(osd);
+ credits = osd_dto_credits_noquota[DTO_XATTR_SET];
+ if (buf && buf->lb_len > sb->s_blocksize) {
+ credits *= (buf->lb_len + sb->s_blocksize - 1) >>
+ sb->s_blocksize_bits;
+ }
+ }
+
+ osd_trans_declare_op(env, oh, OSD_OT_XATTR_SET, credits);
return 0;
}
DTO_OBJECT_DELETE,
DTO_ATTR_SET_BASE,
DTO_XATTR_SET,
- DTO_LOG_REC, /**< XXX temporary: dt layer knows nothing about llog. */
DTO_WRITE_BASE,
DTO_WRITE_BLOCK,
DTO_ATTR_SET_CHOWN,
return rc;
}
+/**
+ * Tell whether writes to an object will go through the extent tree.
+ *
+ * When \a inode is given, only its own LDISKFS_EXTENTS_FL flag is
+ * consulted; when it is NULL, the answer comes from the EXTENTS mount
+ * option of \a sb.
+ *
+ * \retval 1 extents are in use
+ * \retval 0 otherwise
+ */
+static inline int osd_extents_enabled(struct super_block *sb,
+				      struct inode *inode)
+{
+	if (inode != NULL)
+		return (LDISKFS_I(inode)->i_flags & LDISKFS_EXTENTS_FL) ? 1 : 0;
+
+	return test_opt(sb, EXTENTS) ? 1 : 0;
+}
+
+
+/**
+ * Estimate the journal credits needed to write \a blocks blocks into a
+ * block-mapped (non-extent) object.
+ *
+ * \param sb     superblock, used for the block size
+ * \param inode  target inode, or NULL if it does not exist yet
+ * \param size   number of bytes to be written
+ * \param pos    write offset, or -1 when the offset is unknown
+ * \param blocks number of blocks the write touches
+ *
+ * \retval number of credits to reserve
+ */
+static inline int osd_calc_bkmap_credits(struct super_block *sb,
+					 struct inode *inode,
+					 const loff_t size,
+					 const loff_t pos,
+					 const int blocks)
+{
+	int credits, bits, bs, i;
+
+	bits = sb->s_blocksize_bits;
+	bs = 1 << bits;
+
+	/* legacy blockmap: 3 levels * 3 (bitmap,gd,itself)
+	 * we do not expect blockmaps on the large files,
+	 * so let's shrink it to 2 levels (4GB files) */
+
+	/* this is default reservation: 2 levels */
+	credits = (blocks + 2) * 3;
+
+	/* actual offset is unknown, hard to optimize */
+	if (pos == -1)
+		return credits;
+
+	/* now check for few specific cases to optimize */
+	if (pos + size <= LDISKFS_NDIR_BLOCKS * bs) {
+		/* no indirects */
+		credits = blocks;
+		/* allocate if not allocated */
+		if (inode == NULL) {
+			credits += blocks * 2;
+			return credits;
+		}
+		for (i = (pos >> bits); i < (pos >> bits) + blocks; i++) {
+			LASSERT(i < LDISKFS_NDIR_BLOCKS);
+			if (LDISKFS_I(inode)->i_data[i] == 0)
+				credits += 2;
+		}
+	} else if (pos + size <= (LDISKFS_NDIR_BLOCKS + 1024) * bs) {
+		/* single indirect */
+		credits = blocks * 3;
+		if (inode == NULL ||
+		    LDISKFS_I(inode)->i_data[LDISKFS_IND_BLOCK] == 0)
+			/* the indirect block is missing and has to be
+			 * allocated too: bitmap, gd, itself */
+			credits += 3;
+		else
+			/* the indirect block exists already, but storing
+			 * new block pointers in it still modifies it */
+			credits += 1;
+	}
+
+	return credits;
+}
+
+
+/**
+ * Declare the journal credits for writing buf->lb_len bytes to \a dt.
+ *
+ * \a _pos is the write offset; -1 is handled as an append. A write that
+ * ends within i_size of an inode whose allocated block count covers
+ * its size is treated as an overwrite and charged one credit per
+ * touched block. Otherwise the estimate depends on whether the object
+ * is extent-mapped or block-mapped, plus one credit for an inode that
+ * already exists. Quota is declared with zero space when the inode
+ * exists.
+ *
+ * \retval 0 or the result of osd_declare_inode_qid()
+ */
static ssize_t osd_declare_write(const struct lu_env *env, struct dt_object *dt,
- const struct lu_buf *buf, loff_t pos,
+ const struct lu_buf *buf, loff_t _pos,
struct thandle *handle)
{
- struct osd_thandle *oh;
- int credits;
- struct inode *inode;
- int rc;
+ struct osd_object *obj = osd_dt_obj(dt);
+ struct inode *inode = obj->oo_inode;
+ struct super_block *sb = osd_sb(osd_obj2dev(obj));
+ struct osd_thandle *oh;
+ int rc = 0, est = 0, credits, blocks, allocated = 0;
+ int bits, bs;
+ int depth, size;
+ loff_t pos;
ENTRY;
+ LASSERT(buf != NULL);
LASSERT(handle != NULL);
oh = container_of0(handle, struct osd_thandle, ot_super);
LASSERT(oh->ot_handle == NULL);
- credits = osd_dto_credits_noquota[DTO_WRITE_BLOCK];
+ size = buf->lb_len;
+ bits = sb->s_blocksize_bits;
+ bs = 1 << bits;
- osd_trans_declare_op(env, oh, OSD_OT_WRITE, credits);
+ if (_pos == -1) {
+ /* if this is an append, then we
+ * should expect cross-block record */
+ pos = 0;
+ } else {
+ pos = _pos;
+ }
- inode = osd_dt_obj(dt)->oo_inode;
+ /* blocks to modify */
+ blocks = ((pos + size + bs - 1) >> bits) - (pos >> bits);
+ LASSERT(blocks > 0);
+
+ if (inode != NULL && _pos != -1) {
+ /* object size in blocks */
+ est = (i_size_read(inode) + bs - 1) >> bits;
+ allocated = inode->i_blocks >> (bits - 9);
+ if (pos + size <= i_size_read(inode) && est <= allocated) {
+ /* looks like an overwrite, no need to modify tree */
+ credits = blocks;
+ /* no need to modify i_size */
+ goto out;
+ }
+ }
+
+ if (osd_extents_enabled(sb, inode)) {
+ /*
+ * many concurrent threads may grow tree by the time
+ * our transaction starts. so, consider 2 is a min depth
+ * for every level we may need to allocate a new block
+ * and take some entries from the old one. so, 3 blocks
+ * to allocate (bitmap, gd, itself) + old block - 4 per
+ * level.
+ */
+ depth = inode != NULL ? ext_depth(inode) : 0;
+ depth = max(depth, 1) + 1;
+ credits = depth;
+ /* if not append, then split may need to modify
+ * existing blocks moving entries into the new ones */
+ if (_pos != -1)
+ credits += depth;
+ /* blocks to store data: bitmap,gd,itself */
+ credits += blocks * 3;
+ } else {
+ credits = osd_calc_bkmap_credits(sb, inode, size, _pos, blocks);
+ }
+ /* if inode is created as part of the transaction,
+ * then it's counted already by the creation method */
+ if (inode != NULL)
+ credits++;
+
+out:
- /* we may declare write to non-exist llog */
- if (inode == NULL)
- RETURN(0);
+ osd_trans_declare_op(env, oh, OSD_OT_WRITE, credits);
/* dt_declare_write() is usually called for system objects, such
* as llog or last_rcvd files. We needn't enforce quota on those
* objects, so always set the lqi_space as 0. */
- rc = osd_declare_inode_qid(env, inode->i_uid, inode->i_gid, 0, oh,
- true, true, NULL, false);
+ if (inode != NULL)
+ rc = osd_declare_inode_qid(env, inode->i_uid, inode->i_gid,
+ 0, oh, true, true, NULL, false);
RETURN(rc);
}
struct inode *inode;
int rc;
struct iam_container *bag = &(osd_dt_obj(dt))->oo_dir->od_container;
+ struct lu_buf *lb = &osd_oti_get(env)->oti_buf;
ENTRY;
LASSERT(bag->ic_root_bh != NULL);
inode = osd_dt_obj(dt)->oo_inode;
LASSERT(inode);
- rc = dt_declare_record_write(env, dt, NULL, 0, th);
+ /* iam_lfix_create() writes two blocks at the beginning */
+ lb->lb_len = osd_sb(osd)->s_blocksize * 2;
+ rc = dt_declare_record_write(env, dt, lb, 0, th);
if (rc)
GOTO(out, rc);
#include <obd_class.h>
#include <lustre_disk.h>
#include <lustre_fid.h>
+#include <lustre/lustre_idl.h> /* LLOG_CHUNK_SIZE definition */
#include "osd_internal.h"
dmu_tx_hold_sa_create(oh->ot_tx, ZFS_SA_BASE_ATTR_SIZE);
}
+ /* XXX: ZFS still lacks support for declaring appends.
+ * pos == -1 means append, which is used mostly by llog; an llog
+ * can grow up to LLOG_CHUNK_SIZE*8 records */
+ if (pos == -1)
+ pos = max_t(loff_t, 256 * 8 * LLOG_CHUNK_SIZE,
+ obj->oo_attr.la_size + (2 << 20));
dmu_tx_hold_write(oh->ot_tx, oid, pos, buf->lb_len);
/* dt_declare_write() is usually called for system objects, such