CERROR("***** bio->bi_private is NULL! This should never "
"happen. Normally, I would crash here, but instead I "
"will dump the bio contents to the console. Please "
- "report this to <http://jira.whamcloud.com/> , along "
+ "report this to <https://jira.hpdd.intel.com/> , along "
"with any interesting messages leading up to this point "
"(like SCSI errors, perhaps). Because bi_private is "
"NULL, I can't wake up the thread that initiated this "
if (iobuf->dr_rw == 0) {
wait_event(iobuf->dr_wait,
atomic_read(&iobuf->dr_numreqs) == 0);
+ osd_fini_iobuf(osd, iobuf);
}
if (rc == 0)
osd_map_remote_to_local(pos, len, &npages, lnb);
for (i = 0; i < npages; i++, lnb++) {
-
- /* We still set up for ungranted pages so that granted pages
- * can be written to disk as they were promised, and portals
- * needs to keep the pages all aligned properly. */
- lnb->dentry = (void *) obj;
-
lnb->page = osd_get_page(d, lnb->lnb_file_offset, rw);
if (lnb->page == NULL)
GOTO(cleanup, rc = -ENOMEM);
static int osd_bufs_put(const struct lu_env *env, struct dt_object *dt,
struct niobuf_local *lnb, int npages)
{
- struct osd_thread_info *oti = osd_oti_get(env);
- struct osd_iobuf *iobuf = &oti->oti_iobuf;
- struct osd_device *d = osd_obj2dev(osd_dt_obj(dt));
int i;
- /* to do IO stats, notice we do this here because
- * osd_do_bio() doesn't wait for write to complete */
- osd_fini_iobuf(d, iobuf);
-
for (i = 0; i < npages; i++) {
if (lnb[i].page == NULL)
continue;
* but otherwise we'd need to call it every free() */
ldiskfs_discard_preallocations(inode);
#ifdef HAVE_EXT_FREE_BLOCK_WITH_BUFFER_HEAD /* Introduced in 2.6.32-rc7 */
- ldiskfs_free_blocks(handle, inode, NULL, ldiskfs_ext_pblock(&nex),
- cpu_to_le16(nex.ee_len), 0);
+ ldiskfs_free_blocks(handle, inode, NULL,
+ ldiskfs_ext_pblock(&nex),
+ le16_to_cpu(nex.ee_len), 0);
#else
ldiskfs_free_blocks(handle, inode, ldiskfs_ext_pblock(&nex),
- cpu_to_le16(nex.ee_len), 0);
+ le16_to_cpu(nex.ee_len), 0);
#endif
goto out;
}
static int osd_ldiskfs_map_inode_pages(struct inode *inode, struct page **page,
int pages, unsigned long *blocks,
- int create, struct mutex *optional_mutex)
+ int create)
{
int rc;
blocks, create);
return rc;
}
- if (optional_mutex != NULL)
- mutex_lock(optional_mutex);
rc = osd_ldiskfs_map_bm_inode_pages(inode, page, pages, blocks, create);
- if (optional_mutex != NULL)
- mutex_unlock(optional_mutex);
return rc;
}
if (iobuf->dr_npages) {
rc = osd_ldiskfs_map_inode_pages(inode, iobuf->dr_pages,
iobuf->dr_npages,
- iobuf->dr_blocks,
- 0, NULL);
+ iobuf->dr_blocks, 0);
if (likely(rc == 0)) {
rc = osd_do_bio(osd, inode, iobuf);
/* do IO stats for preparation reads */
/* make sure the over quota flags were not set */
lnb[0].flags &= ~(OBD_BRW_OVER_USRQUOTA | OBD_BRW_OVER_GRPQUOTA);
- rc = osd_declare_inode_qid(env, inode->i_uid, inode->i_gid,
+ rc = osd_declare_inode_qid(env, i_uid_read(inode), i_gid_read(inode),
quota_space, oh, true, true, &flags,
ignore_quota);
} else if (iobuf->dr_npages > 0) {
rc = osd_ldiskfs_map_inode_pages(inode, iobuf->dr_pages,
iobuf->dr_npages,
- iobuf->dr_blocks,
- 1, NULL);
+ iobuf->dr_blocks, 1);
} else {
/* no pages to write, no transno is needed */
thandle->th_local = 1;
rc = osd_do_bio(osd, inode, iobuf);
/* we don't do stats here as in read path because
* write is async: we'll do this in osd_put_bufs() */
- }
+ } else {
+ osd_fini_iobuf(osd, iobuf);
+ }
if (unlikely(rc != 0)) {
/* if write fails, we should drop pages from the cache */
struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt));
struct timeval start, end;
unsigned long timediff;
- int rc = 0, i, m = 0, cache = 0;
+ int rc = 0, i, m = 0, cache = 0, cache_hits = 0, cache_misses = 0;
LASSERT(inode);
lnb[i].rc = lnb[i].len;
m += lnb[i].len;
- lprocfs_counter_add(osd->od_stats, LPROC_OSD_CACHE_ACCESS, 1);
- if (PageUptodate(lnb[i].page)) {
- lprocfs_counter_add(osd->od_stats,
- LPROC_OSD_CACHE_HIT, 1);
- } else {
- lprocfs_counter_add(osd->od_stats,
- LPROC_OSD_CACHE_MISS, 1);
- osd_iobuf_add_page(iobuf, lnb[i].page);
- }
+ if (PageUptodate(lnb[i].page)) {
+ cache_hits++;
+ } else {
+ cache_misses++;
+ osd_iobuf_add_page(iobuf, lnb[i].page);
+ }
+
if (cache == 0)
generic_error_remove_page(inode->i_mapping,lnb[i].page);
}
timediff = cfs_timeval_sub(&end, &start, NULL);
lprocfs_counter_add(osd->od_stats, LPROC_OSD_GET_PAGE, timediff);
+ if (cache_hits != 0)
+ lprocfs_counter_add(osd->od_stats, LPROC_OSD_CACHE_HIT,
+ cache_hits);
+ if (cache_misses != 0)
+ lprocfs_counter_add(osd->od_stats, LPROC_OSD_CACHE_MISS,
+ cache_misses);
+ if (cache_hits + cache_misses != 0)
+ lprocfs_counter_add(osd->od_stats, LPROC_OSD_CACHE_ACCESS,
+ cache_hits + cache_misses);
+
if (iobuf->dr_npages) {
rc = osd_ldiskfs_map_inode_pages(inode, iobuf->dr_pages,
iobuf->dr_npages,
- iobuf->dr_blocks,
- 0, NULL);
+ iobuf->dr_blocks, 0);
rc = osd_do_bio(osd, inode, iobuf);
/* IO stats will be done in osd_bufs_put() */
return rc;
}
+/*
+ * Check whether extent mapping will be used for an object.
+ *
+ * If the inode already exists, test its LDISKFS_EXTENTS_FL flag; for a
+ * not-yet-created object (\a inode == NULL) fall back to the filesystem's
+ * mount-time EXTENTS option on \a sb.
+ *
+ * \retval 1	extent-mapped allocation will be used
+ * \retval 0	legacy indirect (block-map) allocation will be used
+ */
+static inline int osd_extents_enabled(struct super_block *sb,
+				      struct inode *inode)
+{
+	if (inode != NULL) {
+		if (LDISKFS_I(inode)->i_flags & LDISKFS_EXTENTS_FL)
+			return 1;
+	} else if (test_opt(sb, EXTENTS)) {
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Estimate journal credits for writing \a blocks blocks of \a size bytes
+ * at offset \a pos into a block-mapped (non-extent) file.
+ *
+ * Starts from a conservative default — two levels of indirect blocks,
+ * each costing 3 credits (bitmap, group descriptor, the block itself) —
+ * and then shrinks the estimate for the cases recognizable from \a pos:
+ * a write entirely within the direct blocks, or one covered by a single
+ * indirect block.  \a pos == -1 (append: offset unknown until execution)
+ * keeps the default.  \a inode may be NULL if the object is not created
+ * yet, in which case every block is assumed to need allocation.
+ */
+static inline int osd_calc_bkmap_credits(struct super_block *sb,
+					 struct inode *inode,
+					 const loff_t size,
+					 const loff_t pos,
+					 const int blocks)
+{
+	int credits, bits, bs, i;
+
+	bits = sb->s_blocksize_bits;
+	bs = 1 << bits;
+
+	/* legacy blockmap: 3 levels * 3 (bitmap,gd,itself)
+	 * we do not expect blockmaps on the large files,
+	 * so let's shrink it to 2 levels (4GB files) */
+
+	/* this is default reservation: 2 levels */
+	credits = (blocks + 2) * 3;
+
+	/* actual offset is unknown, hard to optimize */
+	if (pos == -1)
+		return credits;
+
+	/* now check for few specific cases to optimize */
+	if (pos + size <= LDISKFS_NDIR_BLOCKS * bs) {
+		/* no indirects */
+		credits = blocks;
+		/* allocate if not allocated */
+		if (inode == NULL) {
+			credits += blocks * 2;
+			return credits;
+		}
+		for (i = (pos >> bits); i < (pos >> bits) + blocks; i++) {
+			LASSERT(i < LDISKFS_NDIR_BLOCKS);
+			/* +2 = bitmap + group descriptor touched when a
+			 * new block is allocated for an empty slot */
+			if (LDISKFS_I(inode)->i_data[i] == 0)
+				credits += 2;
+		}
+	} else if (pos + size <= (LDISKFS_NDIR_BLOCKS + 1024) * bs) {
+		/* single indirect */
+		credits = blocks * 3;
+		/* probably indirect block has been allocated already */
+		/* NOTE(review): this condition adds credits when the
+		 * indirect block already EXISTS (or inode is NULL), which
+		 * reads as the opposite of the comment above — confirm
+		 * whether "== 0" was intended */
+		if (!inode || LDISKFS_I(inode)->i_data[LDISKFS_IND_BLOCK])
+			credits += 3;
+	}
+
+	return credits;
+}
+
+/*
+ * Declare journal credits and quota for a future write of \a buf.
+ *
+ * \a _pos == -1 denotes an append: the final offset is unknown, so a
+ * cross-block record is assumed and pos is treated as 0 for the block
+ * count.  Credits are estimated per allocation scheme (extents vs.
+ * legacy block map); a clean overwrite inside i_size with all blocks
+ * already allocated is charged only the data blocks themselves.
+ */
static ssize_t osd_declare_write(const struct lu_env *env, struct dt_object *dt,
-				 const struct lu_buf *buf, loff_t pos,
+				 const struct lu_buf *buf, loff_t _pos,
				 struct thandle *handle)
{
-	struct osd_thandle *oh;
-	int credits;
-	struct inode *inode;
-	int rc;
+	struct osd_object *obj = osd_dt_obj(dt);
+	struct inode *inode = obj->oo_inode;
+	struct super_block *sb = osd_sb(osd_obj2dev(obj));
+	struct osd_thandle *oh;
+	int rc = 0, est = 0, credits, blocks, allocated = 0;
+	int bits, bs;
+	int depth, size;
+	loff_t pos;
	ENTRY;
+	LASSERT(buf != NULL);
	LASSERT(handle != NULL);
	oh = container_of0(handle, struct osd_thandle, ot_super);
	LASSERT(oh->ot_handle == NULL);
-	credits = osd_dto_credits_noquota[DTO_WRITE_BLOCK];
+	size = buf->lb_len;
+	bits = sb->s_blocksize_bits;
+	bs = 1 << bits;
-	osd_trans_declare_op(env, oh, OSD_OT_WRITE, credits);
+	if (_pos == -1) {
+		/* if this is an append, then we
+		 * should expect cross-block record */
+		pos = 0;
+	} else {
+		pos = _pos;
+	}
-	inode = osd_dt_obj(dt)->oo_inode;
+	/* blocks to modify */
+	blocks = ((pos + size + bs - 1) >> bits) - (pos >> bits);
+	LASSERT(blocks > 0);
+
+	if (inode != NULL && _pos != -1) {
+		/* object size in blocks */
+		est = (i_size_read(inode) + bs - 1) >> bits;
+		allocated = inode->i_blocks >> (bits - 9);
+		if (pos + size <= i_size_read(inode) && est <= allocated) {
+			/* looks like an overwrite, no need to modify tree */
+			credits = blocks;
+			/* no need to modify i_size */
+			goto out;
+		}
+	}
-	/* we may declare write to non-exist llog */
-	if (inode == NULL)
-		RETURN(0);
+	if (osd_extents_enabled(sb, inode)) {
+		/*
+		 * many concurrent threads may grow tree by the time
+		 * our transaction starts. so, consider 2 is a min depth
+		 * for every level we may need to allocate a new block
+		 * and take some entries from the old one. so, 3 blocks
+		 * to allocate (bitmap, gd, itself) + old block - 4 per
+		 * level.
+		 */
+		depth = inode != NULL ? ext_depth(inode) : 0;
+		depth = max(depth, 1) + 1;
+		credits = depth;
+		/* if not append, then split may need to modify
+		 * existing blocks moving entries into the new ones */
+		if (_pos == -1)
+			credits += depth;
+		/* blocks to store data: bitmap,gd,itself */
+		credits += blocks * 3;
+	} else {
+		credits = osd_calc_bkmap_credits(sb, inode, size, _pos, blocks);
+	}
+	/* if inode is created as part of the transaction,
+	 * then it's counted already by the creation method */
+	/* NOTE(review): the extra credit presumably covers updating the
+	 * inode itself (i_size/i_blocks) — confirm */
+	if (inode != NULL)
+		credits++;
+
+out:
+
+	osd_trans_declare_op(env, oh, OSD_OT_WRITE, credits);
	/* dt_declare_write() is usually called for system objects, such
	 * as llog or last_rcvd files. We needn't enforce quota on those
	 * objects, so always set the lqi_space as 0. */
-	rc = osd_declare_inode_qid(env, inode->i_uid, inode->i_gid, 0, oh,
-				   true, true, NULL, false);
+	if (inode != NULL)
+		rc = osd_declare_inode_qid(env, i_uid_read(inode),
+					   i_gid_read(inode), 0, oh, true,
+					   true, NULL, false);
	RETURN(rc);
}
err);
break;
}
- LASSERTF(boffs + size <= bh->b_size,
- "boffs %d size %d bh->b_size %lu",
- boffs, size, (unsigned long)bh->b_size);
+ LASSERTF(boffs + size <= bh->b_size,
+ "boffs %d size %d bh->b_size %lu\n",
+ boffs, size, (unsigned long)bh->b_size);
memcpy(bh->b_data + boffs, buf, size);
err = ldiskfs_journal_dirty_metadata(handle, bh);
if (err)
inode = osd_dt_obj(dt)->oo_inode;
LASSERT(inode);
- rc = osd_declare_inode_qid(env, inode->i_uid, inode->i_gid, 0, oh,
- true, true, NULL, false);
+ rc = osd_declare_inode_qid(env, i_uid_read(inode), i_gid_read(inode),
+ 0, oh, true, true, NULL, false);
RETURN(rc);
}