Whamcloud - gitweb
LU-14776 ldiskfs: Add Ubuntu 20.04 HWE support
[fs/lustre-release.git] / lustre / osd-ldiskfs / osd_io.c
index d2a9c88..33a4526 100644 (file)
@@ -27,7 +27,6 @@
  */
 /*
  * This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
  *
  * lustre/osd/osd_io.c
  *
@@ -45,6 +44,7 @@
 /* prerequisite for linux/xattr.h */
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/swap.h>
 #include <linux/pagevec.h>
 
 /*
@@ -57,6 +57,7 @@
 
 /* ext_depth() */
 #include <ldiskfs/ldiskfs_extents.h>
+#include <ldiskfs/ldiskfs.h>
 
 static inline bool osd_use_page_cache(struct osd_device *d)
 {
@@ -907,6 +908,8 @@ bypass_checks:
                        GOTO(cleanup, rc = -ENOMEM);
 
                lnb->lnb_locked = 1;
+               if (cache)
+                       mark_page_accessed(lnb->lnb_page);
        }
 
 #if 0
@@ -1210,12 +1213,12 @@ cont_map:
                        GOTO(cleanup, rc);
                /*
                 * decay extent blocks if we could allocate
-                * good large(1M) extent.
+                * good large extent.
                 */
-               if (previous_total == 0 &&
-                   total >= OSD_DEFAULT_EXTENT_BYTES >> inode->i_blkbits)
+               if (total - previous_total >=
+                   osd_extent_bytes(osd) >> inode->i_blkbits)
                        osd_decay_extent_bytes(osd,
-                                              total << inode->i_blkbits);
+                               (total - previous_total) << inode->i_blkbits);
                /* look for next extent */
                fp = NULL;
                blocks += blocks_per_page * clen;
@@ -1300,6 +1303,7 @@ static int osd_write_prep(const struct lu_env *env, struct dt_object *dt,
 struct osd_fextent {
        sector_t        start;
        sector_t        end;
+       __u32           flags;  /* FIEMAP_EXTENT_* bits from fe_flags */
        unsigned int    mapped:1;
 };
 
@@ -1311,7 +1315,6 @@ static int osd_is_mapped(struct dt_object *dt, __u64 offset,
        sector_t start;
        struct fiemap_extent_info fei = { 0 };
        struct fiemap_extent fe = { 0 };
-       mm_segment_t saved_fs;
        int rc;
 
        if (block >= cached_extent->start && block < cached_extent->end)
@@ -1327,14 +1330,22 @@ static int osd_is_mapped(struct dt_object *dt, __u64 offset,
        fei.fi_extents_max = 1;
        fei.fi_extents_start = &fe;
 
-       saved_fs = get_fs();
-       set_fs(KERNEL_DS);
        rc = inode->i_op->fiemap(inode, &fei, offset, FIEMAP_MAX_OFFSET-offset);
-       set_fs(saved_fs);
        if (rc != 0)
                return 0;
 
        start = fe.fe_logical >> inode->i_blkbits;
+       cached_extent->flags = fe.fe_flags;
+       if (fei.fi_extents_mapped == 0) {
+               /* a special case - no extent found at this offset and forward.
+                * we can consider this as a hole to EOF. it's safe to cache
+                * as other threads can not allocate/punch blocks this thread
+                * is working on (LDLM). */
+               cached_extent->start = block;
+               cached_extent->end = i_size_read(inode) >> inode->i_blkbits;
+               cached_extent->mapped = 0;
+               return 0;
+       }
 
        if (start > block) {
                cached_extent->start = block;
@@ -1350,6 +1361,7 @@ static int osd_is_mapped(struct dt_object *dt, __u64 offset,
        return cached_extent->mapped;
 }
 
+#define MAX_EXTENTS_PER_WRITE 100
 static int osd_declare_write_commit(const struct lu_env *env,
                                    struct dt_object *dt,
                                    struct niobuf_local *lnb, int npages,
@@ -1358,10 +1370,10 @@ static int osd_declare_write_commit(const struct lu_env *env,
        const struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt));
        struct inode            *inode = osd_dt_obj(dt)->oo_inode;
        struct osd_thandle      *oh;
-       int                     extents = 0;
-       int                     depth;
+       int                     extents = 0, new_meta = 0;
+       int                     depth, new_blocks = 0;
        int                     i;
-       int                     newblocks = 0;
+       int                     dirty_groups = 0;
        int                     rc = 0;
        int                     credits = 0;
        long long               quota_space = 0;
@@ -1400,7 +1412,12 @@ static int osd_declare_write_commit(const struct lu_env *env,
                    OBD_BRW_FROM_GRANT)
                        declare_flags |= OSD_QID_FORCE;
 
-               if (osd_is_mapped(dt, lnb[i].lnb_file_offset, &mapped)) {
+               /*
+                * Converting an unwritten extent might require splitting
+                * extents, so it cannot be skipped.
+                */
+               if (osd_is_mapped(dt, lnb[i].lnb_file_offset, &mapped) &&
+                   !(mapped.flags & FIEMAP_EXTENT_UNWRITTEN)) {
                        lnb[i].lnb_flags |= OBD_BRW_MAPPED;
                        continue;
                }
@@ -1411,11 +1428,11 @@ static int osd_declare_write_commit(const struct lu_env *env,
                }
 
                /* count only unmapped changes */
-               newblocks++;
+               new_blocks++;
                if (lnb[i].lnb_file_offset != extent.end || extent.end == 0) {
                        if (extent.end != 0)
                                extents += (extent.end - extent.start +
-                                       extent_bytes - 1) / extent_bytes;
+                                           extent_bytes - 1) / extent_bytes;
                        extent.start = lnb[i].lnb_file_offset;
                        extent.end = lnb[i].lnb_file_offset + lnb[i].lnb_len;
                } else {
@@ -1430,35 +1447,55 @@ static int osd_declare_write_commit(const struct lu_env *env,
         * overwrite case, no need to modify tree and
         * allocate blocks.
         */
-       if (!newblocks)
+       if (!extent.end)
                goto out_declare;
 
        extents += (extent.end - extent.start +
                    extent_bytes - 1) / extent_bytes;
-       /*
-        * each extent can go into new leaf causing a split
-        * 5 is max tree depth: inode + 4 index blocks
-        * with blockmaps, depth is 3 at most
+       /**
+        * As filesystem space usage grows, mballoc no longer tries as
+        * hard to scan block groups for the best-aligned free extent,
+        * so the bytes-per-extent estimate can decay to a very small
+        * value and make us reserve too many credits.
+        * We can be more optimistic in the credit reservation: even
+        * when the filesystem is nearly full, it is extremely
+        * unlikely that the worst case would ever be hit.
+        */
+       if (extents > MAX_EXTENTS_PER_WRITE)
+               extents = MAX_EXTENTS_PER_WRITE;
+
+       /**
+        * If we add a single extent, then in the worst case each tree
+        * level (index/leaf) needs to be changed if the tree splits.
+        * If more extents are inserted, they could cause the whole tree
+        * to split more than once, but this is really rare.
         */
        if (LDISKFS_I(inode)->i_flags & LDISKFS_EXTENTS_FL) {
-               /*
-                * many concurrent threads may grow tree by the time
-                * our transaction starts. so, consider 2 is a min depth
-                */
                depth = ext_depth(inode);
-               depth = max(depth, 1) + 1;
-               newblocks += depth;
-               credits += depth * 2 * extents;
+               if (extents <= 1) {
+                       credits += depth * 2 * extents;
+                       new_meta = depth;
+               } else {
+                       credits += depth * 3 * extents;
+                       new_meta = depth * 2 * extents;
+               }
        } else {
+               /*
+                * With N contiguous data blocks, we need at most
+                * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) + 1 indirect blocks,
+                * 2 dindirect blocks, and 1 tindirect block
+                */
+               new_meta = DIV_ROUND_UP(new_blocks,
+                               LDISKFS_ADDR_PER_BLOCK(inode->i_sb)) + 4;
+               credits += new_meta;
                depth = 3;
-               newblocks += depth;
-               credits += depth * extents;
        }
+       dirty_groups += (extents + new_meta);
 
        oh->oh_declared_ext = extents;
 
        /* quota space for metadata blocks */
-       quota_space += depth * extents * LDISKFS_BLOCK_SIZE(osd_sb(osd));
+       quota_space += new_meta * LDISKFS_BLOCK_SIZE(osd_sb(osd));
 
        /* quota space should be reported in 1K blocks */
        quota_space = toqb(quota_space);
@@ -1466,16 +1503,21 @@ static int osd_declare_write_commit(const struct lu_env *env,
        /* each new block can go in different group (bitmap + gd) */
 
        /* we can't dirty more bitmap blocks than exist */
-       if (extents > LDISKFS_SB(osd_sb(osd))->s_groups_count)
+       if (dirty_groups > LDISKFS_SB(osd_sb(osd))->s_groups_count)
                credits += LDISKFS_SB(osd_sb(osd))->s_groups_count;
        else
-               credits += extents;
+               credits += dirty_groups;
 
        /* we can't dirty more gd blocks than exist */
        if (extents > LDISKFS_SB(osd_sb(osd))->s_gdb_count)
                credits += LDISKFS_SB(osd_sb(osd))->s_gdb_count;
        else
-               credits += extents;
+               credits += dirty_groups;
+
+       CDEBUG(D_INODE,
+              "%s: inode #%lu extent_bytes %u extents %d credits %d\n",
+              osd_ino2name(inode), inode->i_ino, extent_bytes, extents,
+              credits);
 
 out_declare:
        osd_trans_declare_op(env, oh, OSD_OT_WRITE, credits);
@@ -2164,10 +2206,10 @@ static int osd_declare_fallocate(const struct lu_env *env,
        ENTRY;
 
        /*
-        * Only mode == 0 (which is standard prealloc) is supported now.
+        * mode == 0 (which is standard prealloc) and PUNCH are supported.
         * Rest of mode options is not supported yet.
         */
-       if (mode & ~FALLOC_FL_KEEP_SIZE)
+       if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
                RETURN(-EOPNOTSUPP);
 
        /* disable fallocate completely */
@@ -2177,6 +2219,16 @@ static int osd_declare_fallocate(const struct lu_env *env,
        LASSERT(th);
        LASSERT(inode);
 
+       if (mode & FALLOC_FL_PUNCH_HOLE) {
+               rc = osd_declare_inode_qid(env, i_uid_read(inode),
+                                          i_gid_read(inode),
+                                          i_projid_read(inode), 0, oh,
+                                          osd_dt_obj(dt), NULL, OSD_QID_BLK);
+               if (rc == 0)
+                       rc = osd_trunc_lock(osd_dt_obj(dt), oh, false);
+               RETURN(rc);
+       }
+
        /* quota space for metadata blocks
         * approximate metadata estimate should be good enough.
         */
@@ -2197,8 +2249,10 @@ static int osd_declare_fallocate(const struct lu_env *env,
        RETURN(rc);
 }
 
-static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
-                        __u64 start, __u64 end, int mode, struct thandle *th)
+static int osd_fallocate_preallocate(const struct lu_env *env,
+                                    struct dt_object *dt,
+                                    __u64 start, __u64 end, int mode,
+                                    struct thandle *th)
 {
        struct osd_thandle *oh = container_of(th, struct osd_thandle, ot_super);
        handle_t *handle = ldiskfs_journal_current_handle();
@@ -2234,9 +2288,10 @@ static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
        flags = osd_dev(dt->do_lu.lo_dev)->od_fallocate_zero_blocks ?
                LDISKFS_GET_BLOCKS_CREATE_ZERO :
                LDISKFS_GET_BLOCKS_CREATE_UNWRIT_EXT;
+#ifndef HAVE_LDISKFS_GET_BLOCKS_KEEP_SIZE
        if (mode & FALLOC_FL_KEEP_SIZE)
                flags |= LDISKFS_GET_BLOCKS_KEEP_SIZE;
-
+#endif
        inode_lock(inode);
 
        /*
@@ -2304,10 +2359,12 @@ static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
                                epos = end;
                        if (ldiskfs_update_inode_size(inode, epos) & 0x1)
                                inode->i_mtime = inode->i_ctime;
+#ifndef HAVE_LDISKFS_GET_BLOCKS_KEEP_SIZE
                } else {
                        if (epos > inode->i_size)
                                ldiskfs_set_inode_flag(inode,
                                                       LDISKFS_INODE_EOFBLOCKS);
+#endif
                }
 
                ldiskfs_mark_inode_dirty(handle, inode);
@@ -2323,6 +2380,61 @@ out:
        RETURN(rc);
 }
 
+/*
+ * Schedule a hole punch on \a dt for the byte range [\a start, \a end).
+ *
+ * Nothing is punched here: the range and mode are recorded on the object's
+ * exclusive entry in the transaction's ot_trunc_locks list, and the actual
+ * punch is done in osd_trans_stop().
+ *
+ * \retval 0        the punch was recorded
+ * \retval -ENOENT  the transaction holds no lock entry for this object, so
+ *                  the punch could not be scheduled
+ */
+static int osd_fallocate_punch(const struct lu_env *env, struct dt_object *dt,
+                              __u64 start, __u64 end, int mode,
+                              struct thandle *th)
+{
+       struct osd_object *obj = osd_dt_obj(dt);
+       struct inode *inode = obj->oo_inode;
+       struct osd_access_lock *al;
+       struct osd_thandle *oh;
+       bool found = false;
+
+       ENTRY;
+
+       LASSERT(dt_object_exists(dt));
+       LASSERT(osd_invariant(obj));
+       LASSERT(inode != NULL);
+
+       dquot_initialize(inode);
+
+       LASSERT(th);
+       oh = container_of(th, struct osd_thandle, ot_super);
+       LASSERT(oh->ot_handle->h_transaction != NULL);
+
+       list_for_each_entry(al, &oh->ot_trunc_locks, tl_list) {
+               if (obj != al->tl_obj)
+                       continue;
+               LASSERT(al->tl_shared == 0);
+               found = true;
+               /* do actual punch in osd_trans_stop() */
+               al->tl_start = start;
+               al->tl_end = end;
+               al->tl_mode = mode;
+               al->tl_punch = true;
+               break;
+       }
+
+       /*
+        * Without a matching entry nothing would ever execute the punch;
+        * report that instead of claiming success.
+        */
+       RETURN(found ? 0 : -ENOENT);
+}
+
+/*
+ * fallocate entry point: requests carrying FALLOC_FL_PUNCH_HOLE go to the
+ * punch path, every other mode goes to standard preallocation.
+ */
+static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
+                        __u64 start, __u64 end, int mode, struct thandle *th)
+{
+       ENTRY;
+
+       if (!(mode & FALLOC_FL_PUNCH_HOLE))
+               RETURN(osd_fallocate_preallocate(env, dt, start, end, mode,
+                                                th));
+
+       RETURN(osd_fallocate_punch(env, dt, start, end, mode, th));
+}
+
 static int osd_declare_punch(const struct lu_env *env, struct dt_object *dt,
                             __u64 start, __u64 end, struct thandle *th)
 {
@@ -2475,7 +2587,6 @@ static int osd_fiemap_get(const struct lu_env *env, struct dt_object *dt,
        struct inode *inode = osd_dt_obj(dt)->oo_inode;
        u64 len;
        int rc;
-       mm_segment_t cur_fs;
 
        LASSERT(inode);
        if (inode->i_op->fiemap == NULL)
@@ -2495,18 +2606,10 @@ static int osd_fiemap_get(const struct lu_env *env, struct dt_object *dt,
        if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
                filemap_write_and_wait(inode->i_mapping);
 
-       /* Save previous value address limit */
-       cur_fs = get_fs();
-       /* Set the address limit of the kernel */
-       set_fs(KERNEL_DS);
-
        rc = inode->i_op->fiemap(inode, &fieinfo, fm->fm_start, len);
        fm->fm_flags = fieinfo.fi_flags;
        fm->fm_mapped_extents = fieinfo.fi_extents_mapped;
 
-       /* Restore the previous address limt */
-       set_fs(cur_fs);
-
        return rc;
 }
 
@@ -2654,6 +2757,27 @@ void osd_trunc_unlock_all(const struct lu_env *env, struct list_head *list)
        }
 }
 
+/*
+ * Write out the page containing \a offset when \a offset is not page
+ * aligned, so that a partial-page truncate or punch cannot corrupt data
+ * during a direct disk write.  b=17397
+ *
+ * When the device does not use the page cache, the write is waited for
+ * and the page is then dropped from the mapping.
+ */
+static void osd_partial_page_flush(struct osd_device *d, struct inode *inode,
+                                  loff_t offset)
+{
+       struct address_space *mapping = inode->i_mapping;
+       pgoff_t index = offset >> PAGE_SHIFT;
+
+       /* a page-aligned offset leaves no partial page behind */
+       if ((offset & ~PAGE_MASK) == 0)
+               return;
+
+       if (!osd_use_page_cache(d)) {
+               /* "wait" variant: the I/O must be complete before we go on */
+               filemap_write_and_wait_range(mapping, offset, offset + 1);
+               invalidate_mapping_pages(mapping, index, index);
+               return;
+       }
+
+       filemap_fdatawrite_range(mapping, offset, offset + 1);
+}
+
 void osd_execute_truncate(struct osd_object *obj)
 {
        struct osd_device *d = osd_obj2dev(obj);
@@ -2689,24 +2813,22 @@ void osd_execute_truncate(struct osd_object *obj)
                spin_unlock(&inode->i_lock);
                osd_dirty_inode(inode, I_DIRTY_DATASYNC);
        }
+       osd_partial_page_flush(d, inode, size);
+}
 
-       /*
-        * For a partial-page truncate, flush the page to disk immediately to
-        * avoid data corruption during direct disk write.  b=17397
-        */
-       if ((size & ~PAGE_MASK) == 0)
-               return;
-       if (osd_use_page_cache(d)) {
-               filemap_fdatawrite_range(inode->i_mapping, size, size + 1);
-       } else {
-               /* Notice we use "wait" version to ensure I/O is complete */
-               filemap_write_and_wait_range(inode->i_mapping, size, size + 1);
-               invalidate_mapping_pages(inode->i_mapping, size >> PAGE_SHIFT,
-                                        size >> PAGE_SHIFT);
-       }
+/*
+ * Punch the byte range [\a start, \a end) out of \a obj through the
+ * ->fallocate() method of its quasi file, then flush the pages that hold
+ * the first and last byte of the range if they are only partially covered.
+ *
+ * The function returns nothing, so a failed punch is reported in the log
+ * rather than being dropped silently.
+ */
+void osd_execute_punch(const struct lu_env *env, struct osd_object *obj,
+                      loff_t start, loff_t end, int mode)
+{
+       struct osd_device *d = osd_obj2dev(obj);
+       struct inode *inode = obj->oo_inode;
+       struct file *file = osd_quasi_file(env, inode);
+       long rc;
+
+       rc = file->f_op->fallocate(file, mode, start, end - start);
+       if (rc != 0)
+               CERROR("%s: inode #%lu punch [%lld, %lld) failed: rc = %ld\n",
+                      osd_ino2name(inode), inode->i_ino, start, end, rc);
+
+       /* only the two edges of the range can fall inside a page */
+       osd_partial_page_flush(d, inode, start);
+       osd_partial_page_flush(d, inode, end - 1);
 }
 
-void osd_process_truncates(struct list_head *list)
+void osd_process_truncates(const struct lu_env *env, struct list_head *list)
 {
        struct osd_access_lock *al;
 
@@ -2715,8 +2837,10 @@ void osd_process_truncates(struct list_head *list)
        list_for_each_entry(al, list, tl_list) {
                if (al->tl_shared)
                        continue;
-               if (!al->tl_truncate)
-                       continue;
-               osd_execute_truncate(al->tl_obj);
+               if (al->tl_truncate)
+                       osd_execute_truncate(al->tl_obj);
+               else if (al->tl_punch)
+                       osd_execute_punch(env, al->tl_obj, al->tl_start,
+                                         al->tl_end, al->tl_mode);
        }
 }