Whamcloud - gitweb
LU-15143 osd-ldiskfs: osd_declare_write() underestimates credits
[fs/lustre-release.git] / lustre / osd-ldiskfs / osd_io.c
index 40ae417..dc4208a 100644 (file)
@@ -150,9 +150,9 @@ void osd_fini_iobuf(struct osd_device *d, struct osd_iobuf *iobuf)
                iobuf->dr_elapsed_valid = 0;
                LASSERT(iobuf->dr_dev == d);
                LASSERT(iobuf->dr_frags > 0);
-               lprocfs_oh_tally(&d->od_brw_stats.hist[BRW_R_DIO_FRAGS+rw],
+               lprocfs_oh_tally(&d->od_brw_stats.bs_hist[BRW_R_DIO_FRAGS + rw],
                                 iobuf->dr_frags);
-               lprocfs_oh_tally_log2(&d->od_brw_stats.hist[BRW_R_IO_TIME+rw],
+               lprocfs_oh_tally_log2(&d->od_brw_stats.bs_hist[BRW_R_IO_TIME+rw],
                                      ktime_to_ms(iobuf->dr_elapsed));
        }
 }
@@ -173,7 +173,7 @@ static void dio_complete_routine(struct bio *bio, int error)
         */
 
        if (unlikely(iobuf == NULL)) {
-               CERROR("***** bio->bi_private is NULL!  This should never happen.  Normally, I would crash here, but instead I will dump the bio contents to the console.  Please report this to <https://jira.whamcloud.com/> , along with any interesting messages leading up to this point (like SCSI errors, perhaps).  Because bi_private is NULL, I can't wake up the thread that initiated this IO - you will probably have to reboot this node.\n");
+               CERROR("***** bio->bi_private is NULL! Dump the bio contents to the console. Please report this to <https://jira.whamcloud.com/>, and probably have to reboot this node.\n");
                CERROR("bi_next: %p, bi_flags: %lx, " __stringify(bi_opf)
                       ": %x, bi_vcnt: %d, bi_idx: %d, bi->size: %d, bi_end_io: %p, bi_cnt: %d, bi_private: %p\n",
                       bio->bi_next, (unsigned long)bio->bi_flags,
@@ -229,8 +229,8 @@ static void dio_complete_routine(struct bio *bio, int error)
 
 static void record_start_io(struct osd_iobuf *iobuf, int size)
 {
-       struct osd_device    *osd = iobuf->dr_dev;
-       struct obd_histogram *h = osd->od_brw_stats.hist;
+       struct osd_device *osd = iobuf->dr_dev;
+       struct obd_histogram *h = osd->od_brw_stats.bs_hist;
 
        iobuf->dr_frags++;
        atomic_inc(&iobuf->dr_numreqs);
@@ -1408,16 +1408,12 @@ static int osd_declare_write_commit(const struct lu_env *env,
                /* ignore quota for the whole request if any page is from
                 * client cache or written by root.
                 *
-                * XXX once we drop the 1.8 client support, the checking
-                * for whether page is from cache can be simplified as:
-                * !(lnb[i].flags & OBD_BRW_SYNC)
-                *
                 * XXX we could handle this on per-lnb basis as done by
                 * grant.
                 */
                if ((lnb[i].lnb_flags & OBD_BRW_NOQUOTA) ||
-                   (lnb[i].lnb_flags & (OBD_BRW_FROM_GRANT | OBD_BRW_SYNC)) ==
-                   OBD_BRW_FROM_GRANT)
+                   (lnb[i].lnb_flags & OBD_BRW_SYS_RESOURCE) ||
+                   !(lnb[i].lnb_flags & OBD_BRW_SYNC))
                        declare_flags |= OSD_QID_FORCE;
 
                /*
@@ -1479,7 +1475,12 @@ static int osd_declare_write_commit(const struct lu_env *env,
         * split more than once, but this is really rare.
         */
        if (LDISKFS_I(inode)->i_flags & LDISKFS_EXTENTS_FL) {
+               /*
+                * Many concurrent threads may grow the tree by the time
+                * our transaction starts, so treat 2 as the minimum depth.
+                */
                depth = ext_depth(inode);
+               depth = min(max(depth, 1) + 1, LDISKFS_MAX_EXTENT_DEPTH);
                if (extents <= 1) {
                        credits += depth * 2 * extents;
                        new_meta = depth;
@@ -1496,7 +1497,6 @@ static int osd_declare_write_commit(const struct lu_env *env,
                new_meta = DIV_ROUND_UP(new_blocks,
                                LDISKFS_ADDR_PER_BLOCK(inode->i_sb)) + 4;
                credits += new_meta;
-               depth = 3;
        }
        dirty_groups += (extents + new_meta);
 
@@ -1517,7 +1517,7 @@ static int osd_declare_write_commit(const struct lu_env *env,
                credits += dirty_groups;
 
        /* we can't dirty more gd blocks than exist */
-       if (extents > LDISKFS_SB(osd_sb(osd))->s_gdb_count)
+       if (dirty_groups > LDISKFS_SB(osd_sb(osd))->s_gdb_count)
                credits += LDISKFS_SB(osd_sb(osd))->s_gdb_count;
        else
                credits += dirty_groups;
@@ -1956,7 +1956,7 @@ static ssize_t osd_declare_write(const struct lu_env *env, struct dt_object *dt,
                 * level.
                 */
                depth = inode != NULL ? ext_depth(inode) : 0;
-               depth = max(depth, 1) + 1;
+               depth = min(max(depth, 1) + 3, LDISKFS_MAX_EXTENT_DEPTH);
                credits = depth;
                /* if not append, then split may need to modify
                 * existing blocks moving entries into the new ones