*/
/*
* This file is part of Lustre, http://www.lustre.org/
- * Lustre is a trademark of Sun Microsystems, Inc.
*
* lustre/osd/osd_io.c
*
GOTO(cleanup, rc);
/*
* decay extent blocks if we could allocate
- * good large(1M) extent.
+ * good large extent.
*/
- if (previous_total == 0 &&
- total >= OSD_DEFAULT_EXTENT_BYTES >> inode->i_blkbits)
+ if (total - previous_total >=
+ osd_extent_bytes(osd) >> inode->i_blkbits)
osd_decay_extent_bytes(osd,
- total << inode->i_blkbits);
+ (total - previous_total) << inode->i_blkbits);
/* look for next extent */
fp = NULL;
blocks += blocks_per_page * clen;
sector_t start;
struct fiemap_extent_info fei = { 0 };
struct fiemap_extent fe = { 0 };
- mm_segment_t saved_fs;
int rc;
if (block >= cached_extent->start && block < cached_extent->end)
fei.fi_extents_max = 1;
fei.fi_extents_start = &fe;
- saved_fs = get_fs();
- set_fs(KERNEL_DS);
rc = inode->i_op->fiemap(inode, &fei, offset, FIEMAP_MAX_OFFSET-offset);
- set_fs(saved_fs);
if (rc != 0)
return 0;
start = fe.fe_logical >> inode->i_blkbits;
+ if (fei.fi_extents_mapped == 0) {
+	/* A special case - no extent was found at this offset or beyond,
+	 * so treat the rest of the file as a hole up to EOF.  Caching
+	 * this is safe because other threads cannot allocate or punch
+	 * blocks this thread is working on (protected by LDLM). */
+ cached_extent->start = block;
+ cached_extent->end = i_size_read(inode) >> inode->i_blkbits;
+ cached_extent->mapped = 0;
+ return 0;
+ }
if (start > block) {
cached_extent->start = block;
return cached_extent->mapped;
}
+#define MAX_EXTENTS_PER_WRITE 100
static int osd_declare_write_commit(const struct lu_env *env,
struct dt_object *dt,
struct niobuf_local *lnb, int npages,
if (lnb[i].lnb_file_offset != extent.end || extent.end == 0) {
if (extent.end != 0)
extents += (extent.end - extent.start +
- extent_bytes - 1) / extent_bytes;
+ extent_bytes - 1) / extent_bytes;
extent.start = lnb[i].lnb_file_offset;
extent.end = lnb[i].lnb_file_offset + lnb[i].lnb_len;
} else {
extents += (extent.end - extent.start +
extent_bytes - 1) / extent_bytes;
+	/**
+	 * As filesystem usage grows, mballoc no longer tries as hard to
+	 * scan block groups for the best-aligned free extent, so the
+	 * per-extent byte estimate can decay to a very small value and
+	 * cause us to reserve far too many credits.  Be more optimistic
+	 * in the credit reservation: even on a nearly full filesystem it
+	 * is extremely unlikely that the worst case would ever be hit.
+	 */
+ if (extents > MAX_EXTENTS_PER_WRITE)
+ extents = MAX_EXTENTS_PER_WRITE;
+
/*
* each extent can go into new leaf causing a split
* 5 is max tree depth: inode + 4 index blocks
else
credits += extents;
+ CDEBUG(D_INODE,
+ "%s: inode #%lu extent_bytes %u extents %d credits %d\n",
+ osd_ino2name(inode), inode->i_ino, extent_bytes, extents,
+ credits);
+
out_declare:
osd_trans_declare_op(env, oh, OSD_OT_WRITE, credits);
ENTRY;
/*
- * Only mode == 0 (which is standard prealloc) is supported now.
+ * mode == 0 (which is standard prealloc) and PUNCH is supported
* Rest of mode options is not supported yet.
*/
- if (mode & ~FALLOC_FL_KEEP_SIZE)
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ RETURN(-EOPNOTSUPP);
+
+ /* disable fallocate completely */
+ if (osd_dev(dt->do_lu.lo_dev)->od_fallocate_zero_blocks < 0)
RETURN(-EOPNOTSUPP);
LASSERT(th);
LASSERT(inode);
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ rc = osd_declare_inode_qid(env, i_uid_read(inode),
+ i_gid_read(inode),
+ i_projid_read(inode), 0, oh,
+ osd_dt_obj(dt), NULL, OSD_QID_BLK);
+ if (rc == 0)
+ rc = osd_trunc_lock(osd_dt_obj(dt), oh, false);
+ RETURN(rc);
+ }
+
/* quota space for metadata blocks
* approximate metadata estimate should be good enough.
*/
RETURN(rc);
}
-static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
- __u64 start, __u64 end, int mode, struct thandle *th)
+static int osd_fallocate_preallocate(const struct lu_env *env,
+ struct dt_object *dt,
+ __u64 start, __u64 end, int mode,
+ struct thandle *th)
{
struct osd_thandle *oh = container_of(th, struct osd_thandle, ot_super);
handle_t *handle = ldiskfs_journal_current_handle();
boff = start >> inode->i_blkbits;
blen = (ALIGN(end, 1 << inode->i_blkbits) >> inode->i_blkbits) - boff;
- /* Create and Write zeros to new extents */
- flags = LDISKFS_GET_BLOCKS_CREATE_ZERO;
+ /* Create and mark new extents as either zero or unwritten */
+ flags = osd_dev(dt->do_lu.lo_dev)->od_fallocate_zero_blocks ?
+ LDISKFS_GET_BLOCKS_CREATE_ZERO :
+ LDISKFS_GET_BLOCKS_CREATE_UNWRIT_EXT;
if (mode & FALLOC_FL_KEEP_SIZE)
flags |= LDISKFS_GET_BLOCKS_KEEP_SIZE;
RETURN(rc);
}
+/*
+ * Record a deferred hole punch.  The punch is not performed here:
+ * the range is stashed on this transaction's exclusive truncate lock
+ * for the object (taken via osd_trunc_lock() in the declare phase)
+ * and executed later from osd_trans_stop() via osd_execute_punch(),
+ * outside the open journal handle.
+ */
+static int osd_fallocate_punch(const struct lu_env *env, struct dt_object *dt,
+			       __u64 start, __u64 end, int mode,
+			       struct thandle *th)
+{
+	struct osd_object *obj = osd_dt_obj(dt);
+	struct inode *inode = obj->oo_inode;
+	struct osd_access_lock *al;
+	struct osd_thandle *oh;
+	int rc = 0, found = 0;
+
+	ENTRY;
+
+	LASSERT(dt_object_exists(dt));
+	LASSERT(osd_invariant(obj));
+	LASSERT(inode != NULL);
+
+	dquot_initialize(inode);
+
+	LASSERT(th);
+	oh = container_of(th, struct osd_thandle, ot_super);
+	LASSERT(oh->ot_handle->h_transaction != NULL);
+
+	/* locate the exclusive truncate lock held for this object */
+	list_for_each_entry(al, &oh->ot_trunc_locks, tl_list) {
+		if (obj != al->tl_obj)
+			continue;
+		LASSERT(al->tl_shared == 0);
+		found = 1;
+		/* do actual punch in osd_trans_stop() */
+		al->tl_start = start;
+		al->tl_end = end;
+		al->tl_mode = mode;
+		al->tl_punch = true;
+		break;
+	}
+
+	/* NOTE(review): 'found' is set but never checked; if no matching
+	 * lock exists the punch is silently skipped - confirm intended. */
+	RETURN(rc);
+}
+
+/*
+ * Fallocate entry point: dispatch on the mode flags to either the
+ * deferred hole-punch path or the standard preallocation path.
+ * Unsupported mode bits are presumably rejected earlier in the
+ * declare phase (see the FALLOC_FL mask check there).
+ */
+static int osd_fallocate(const struct lu_env *env, struct dt_object *dt,
+			 __u64 start, __u64 end, int mode, struct thandle *th)
+{
+	int rc;
+
+	ENTRY;
+
+	if (mode & FALLOC_FL_PUNCH_HOLE) {
+		/* punch */
+		rc = osd_fallocate_punch(env, dt, start, end, mode, th);
+	} else {
+		/* standard preallocate */
+		rc = osd_fallocate_preallocate(env, dt, start, end, mode, th);
+	}
+	RETURN(rc);
+}
+
static int osd_declare_punch(const struct lu_env *env, struct dt_object *dt,
__u64 start, __u64 end, struct thandle *th)
{
struct inode *inode = osd_dt_obj(dt)->oo_inode;
u64 len;
int rc;
- mm_segment_t cur_fs;
LASSERT(inode);
if (inode->i_op->fiemap == NULL)
if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC)
filemap_write_and_wait(inode->i_mapping);
- /* Save previous value address limit */
- cur_fs = get_fs();
- /* Set the address limit of the kernel */
- set_fs(KERNEL_DS);
-
rc = inode->i_op->fiemap(inode, &fieinfo, fm->fm_start, len);
fm->fm_flags = fieinfo.fi_flags;
fm->fm_mapped_extents = fieinfo.fi_extents_mapped;
- /* Restore the previous address limt */
- set_fs(cur_fs);
-
return rc;
}
}
}
+/*
+ * For a partial-page truncate (or punch boundary), flush the page
+ * containing @offset to disk immediately to avoid data corruption
+ * during direct disk write. b=17397
+ */
+static void osd_partial_page_flush(struct osd_device *d, struct inode *inode,
+				   loff_t offset)
+{
+	/* nothing to flush when offset is page-aligned */
+	if (!(offset & ~PAGE_MASK))
+		return;
+
+	if (osd_use_page_cache(d)) {
+		filemap_fdatawrite_range(inode->i_mapping, offset, offset + 1);
+	} else {
+		/* Notice we use "wait" version to ensure I/O is complete */
+		filemap_write_and_wait_range(inode->i_mapping, offset,
+					     offset + 1);
+		invalidate_mapping_pages(inode->i_mapping, offset >> PAGE_SHIFT,
+					 offset >> PAGE_SHIFT);
+	}
+}
+
void osd_execute_truncate(struct osd_object *obj)
{
struct osd_device *d = osd_obj2dev(obj);
spin_unlock(&inode->i_lock);
osd_dirty_inode(inode, I_DIRTY_DATASYNC);
}
+ osd_partial_page_flush(d, inode, size);
+}
- /*
- * For a partial-page truncate, flush the page to disk immediately to
- * avoid data corruption during direct disk write. b=17397
- */
- if ((size & ~PAGE_MASK) == 0)
- return;
- if (osd_use_page_cache(d)) {
- filemap_fdatawrite_range(inode->i_mapping, size, size + 1);
- } else {
- /* Notice we use "wait" version to ensure I/O is complete */
- filemap_write_and_wait_range(inode->i_mapping, size, size + 1);
- invalidate_mapping_pages(inode->i_mapping, size >> PAGE_SHIFT,
- size >> PAGE_SHIFT);
- }
+/*
+ * Execute a hole punch recorded by osd_fallocate_punch().  Called
+ * from osd_process_truncates() after the transaction has stopped.
+ * Partial pages at both ends of the punched range are flushed to
+ * avoid corruption during direct disk writes.
+ */
+void osd_execute_punch(const struct lu_env *env, struct osd_object *obj,
+		       loff_t start, loff_t end, int mode)
+{
+	struct osd_device *d = osd_obj2dev(obj);
+	struct inode *inode = obj->oo_inode;
+	struct file *file = osd_quasi_file(env, inode);
+
+	/* NOTE(review): the fallocate return value is discarded, so
+	 * punch failures are never reported - confirm this is intended. */
+	file->f_op->fallocate(file, mode, start, end - start);
+	osd_partial_page_flush(d, inode, start);
+	osd_partial_page_flush(d, inode, end - 1);
+}
-void osd_process_truncates(struct list_head *list)
+/*
+ * Walk the transaction's truncate-lock list and apply each deferred
+ * truncate or punch recorded on an exclusive (non-shared) lock.
+ */
+void osd_process_truncates(const struct lu_env *env, struct list_head *list)
{
	struct osd_access_lock *al;
	list_for_each_entry(al, list, tl_list) {
		if (al->tl_shared)
			continue;
-		if (!al->tl_truncate)
-			continue;
-		osd_execute_truncate(al->tl_obj);
+		if (al->tl_truncate)
+			osd_execute_truncate(al->tl_obj);
+		else if (al->tl_punch)
+			osd_execute_punch(env, al->tl_obj, al->tl_start,
+					  al->tl_end, al->tl_mode);
	}
}