Whamcloud - gitweb
LU-10048 osd: async truncate 88/27488/43
authorAlex Zhuravlev <alexey.zhuravlev@intel.com>
Wed, 7 Jun 2017 13:32:39 +0000 (17:32 +0400)
committerOleg Drokin <oleg.drokin@intel.com>
Thu, 14 Jun 2018 03:54:41 +0000 (03:54 +0000)
osd-ldiskfs should execute truncate outside of main transaction
handle. This avoids restarting truncate transaction handles in
main transaction, and allows "transaction first, locking second"
model on OST.

Change-Id: Iffe45c42834c26ca72b65e068ad25ac61d0607c8
Signed-off-by: Alex Zhuravlev <alexey.zhuravlev@intel.com>
Reviewed-on: https://review.whamcloud.com/27488
Tested-by: Jenkins
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: Fan Yong <fan.yong@intel.com>
30 files changed:
ldiskfs/kernel_patches/patches/rhel7/ext4-export-orphan-add.patch [new file with mode: 0644]
ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.4.series
ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.5.series
ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.6.series
ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.7.series
ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.8.series
ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.9.series
ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.series
ldiskfs/kernel_patches/series/ldiskfs-2.6-sles11.series
ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11.series
ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11sp3.series
ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11sp4.series
ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.2.series
ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.3.series
ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.4.series
ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.5.series
ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.series
ldiskfs/kernel_patches/series/ldiskfs-3.12-sles12.series
ldiskfs/kernel_patches/series/ldiskfs-3.12-sles12sp1.series
ldiskfs/kernel_patches/series/ldiskfs-4.4-sles12sp2.series
ldiskfs/kernel_patches/series/ldiskfs-4.4-sles12sp3.series
ldiskfs/kernel_patches/series/ldiskfs-4.4.0-45-ubuntu14+16.series
ldiskfs/kernel_patches/series/ldiskfs-4.4.0-49-ubuntu14+16.series
ldiskfs/kernel_patches/series/ldiskfs-4.4.0-62-ubuntu14+16.series
ldiskfs/kernel_patches/series/ldiskfs-4.4.0-73-ubuntu14+16.series
lustre/include/obd_support.h
lustre/osd-ldiskfs/osd_handler.c
lustre/osd-ldiskfs/osd_internal.h
lustre/osd-ldiskfs/osd_io.c
lustre/tests/sanity.sh

diff --git a/ldiskfs/kernel_patches/patches/rhel7/ext4-export-orphan-add.patch b/ldiskfs/kernel_patches/patches/rhel7/ext4-export-orphan-add.patch
new file mode 100644 (file)
index 0000000..8ccd1cd
--- /dev/null
@@ -0,0 +1,13 @@
+Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
+===================================================================
+--- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/namei.c
++++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c
+@@ -3250,6 +48,7 @@ int ext4_orphan_add(handle_t *handle
+       ext4_std_error(sb, err);
+       return err;
+ }
++EXPORT_SYMBOL(ext4_orphan_add);
+
+ /*
+  * ext4_orphan_del() removes an unlinked or truncated inode from the list
+
index 1947efd..412374d 100644 (file)
@@ -44,3 +44,4 @@ rhel6.3/ext4-drop-inode-from-orphan-list-if-ext4_delete_inode-fails.patch
 rhel6.3/ext4-notalloc_under_idatasem.patch
 rhel6.3/ext4-dont-check-in-ro.patch
 rhel6.3/ext4-dont-check-before-replay.patch
+rhel7/ext4-export-orphan-add.patch
index e662543..aca98ab 100644 (file)
@@ -48,3 +48,4 @@ rhel6.5/ext4-fix-journal-quota.patch
 rhel6.3/ext4-dont-check-in-ro.patch
 rhel6.3/ext4-dont-check-before-replay.patch
 rhel6.5/ext4-brackets-in-ext4-remove-blocks.patch
+rhel7/ext4-export-orphan-add.patch
index 84962a9..14a2e6c 100644 (file)
@@ -48,3 +48,4 @@ rhel6.5/ext4-give-warning-with-dir-htree-growing.patch
 rhel6.6/ext4_s_max_ext_tree_depth.patch
 rhel6.5/ext4-fix-journal-quota.patch
 rhel6.5/ext4-brackets-in-ext4-remove-blocks.patch
+rhel7/ext4-export-orphan-add.patch
index 51b6734..e6ef809 100644 (file)
@@ -47,3 +47,4 @@ rhel6.5/ext4-give-warning-with-dir-htree-growing.patch
 rhel6.6/ext4_s_max_ext_tree_depth.patch
 rhel6.5/ext4-fix-journal-quota.patch
 rhel6.5/ext4-brackets-in-ext4-remove-blocks.patch
+rhel7/ext4-export-orphan-add.patch
index 70b2e61..3783f9a 100644 (file)
@@ -45,3 +45,4 @@ rhel6.5/ext4-give-warning-with-dir-htree-growing.patch
 rhel6.6/ext4_s_max_ext_tree_depth.patch
 rhel6.5/ext4-fix-journal-quota.patch
 rhel6.5/ext4-brackets-in-ext4-remove-blocks.patch
+rhel7/ext4-export-orphan-add.patch
index 0c650e4..dddd020 100644 (file)
@@ -44,3 +44,4 @@ rhel6.3/ext4-notalloc_under_idatasem.patch
 rhel6.5/ext4-give-warning-with-dir-htree-growing.patch
 rhel6.6/ext4_s_max_ext_tree_depth.patch
 rhel6.5/ext4-fix-journal-quota.patch
+rhel7/ext4-export-orphan-add.patch
index 178e310..5dd8566 100644 (file)
@@ -43,3 +43,4 @@ rhel6.3/ext4-recalc-percpu-counters-after-journal.patch
 rhel6.3/ext4-notalloc_under_idatasem.patch
 rhel6.3/ext4-dont-check-in-ro.patch
 rhel6.3/ext4-dont-check-before-replay.patch
+rhel7/ext4-export-orphan-add.patch
index ffffacc..dcf6977 100644 (file)
@@ -43,3 +43,4 @@ rhel6.3/ext4-journal-path-opt.patch
 rhel6.3/ext4-recalc-percpu-counters-after-journal.patch
 rhel6.3/ext4-notalloc_under_idatasem.patch
 rhel6.5/ext4-fix-journal-quota.patch
+rhel7/ext4-export-orphan-add.patch
index dc96ece..a7ba22a 100644 (file)
@@ -40,3 +40,4 @@ rhel6.3/ext4-journal-path-opt.patch
 sles11sp3/ext4_s_max_ext_tree_depth.patch
 sles11sp1/ext4-notalloc_under_idatasem.patch
 rhel6.5/ext4-fix-journal-quota.patch
+rhel7/ext4-export-orphan-add.patch
index be27380..94be6ac 100644 (file)
@@ -44,3 +44,4 @@ sles11sp1/ext4-notalloc_under_idatasem.patch
 rhel6.5/ext4-fix-journal-quota.patch
 sles11sp3/ext4-dont-check-before-replay.patch
 rhel6.3/ext4-dont-check-in-ro.patch
+rhel7/ext4-export-orphan-add.patch
index 99096f2..bf645c6 100644 (file)
@@ -44,3 +44,4 @@ sles11sp1/ext4-notalloc_under_idatasem.patch
 rhel6.5/ext4-fix-journal-quota.patch
 sles11sp3/ext4-dont-check-before-replay.patch
 rhel6.3/ext4-dont-check-in-ro.patch
+rhel7/ext4-export-orphan-add.patch
index cee90e2..7e01b8a 100644 (file)
@@ -34,3 +34,4 @@ rhel7/ext4-cleanup-goto-next-group.patch
 rhel7/ext4-reduce-lock-contention-in-__ext4_new_inode.patch
 rhel7.2/ext4-preread-gd.patch
 rhel7/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch
+rhel7/ext4-export-orphan-add.patch
index 4e1e863..cc5a856 100644 (file)
@@ -34,3 +34,4 @@ rhel7/ext4-cleanup-goto-next-group.patch
 rhel7/ext4-reduce-lock-contention-in-__ext4_new_inode.patch
 rhel7.2/ext4-preread-gd.patch
 rhel7/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch
+rhel7/ext4-export-orphan-add.patch
index e845452..98ebf46 100644 (file)
@@ -34,3 +34,4 @@ rhel7/ext4-cleanup-goto-next-group.patch
 rhel7/ext4-reduce-lock-contention-in-__ext4_new_inode.patch
 rhel7.2/ext4-preread-gd.patch
 rhel7/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch
+rhel7/ext4-export-orphan-add.patch
index 8f2c5b7..6159b07 100644 (file)
@@ -33,3 +33,4 @@ rhel7/ext4-cleanup-goto-next-group.patch
 rhel7/ext4-reduce-lock-contention-in-__ext4_new_inode.patch
 rhel7.2/ext4-preread-gd.patch
 rhel7/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch
+rhel7/ext4-export-orphan-add.patch
index 9cc8057..645bb3d 100644 (file)
@@ -29,3 +29,4 @@ rhel7/ext4-projid-xfs-ioctls.patch
 rhel7/ext4-fix-xattr-shifting-when-expanding-inodes.patch
 rhel7/ext4-cleanup-goto-next-group.patch
 rhel7/ext4-reduce-lock-contention-in-__ext4_new_inode.patch
+rhel7/ext4-export-orphan-add.patch
index 286fbf1..b3c7b95 100644 (file)
@@ -20,3 +20,4 @@ sles12/ext4-corrupted-inode-block-bitmaps-handling-patches.patch
 rhel7/ext4-give-warning-with-dir-htree-growing.patch
 rhel7/ext4-mmp-brelse.patch
 rhel7/ext4-jcb-optimization.patch
+rhel7/ext4-export-orphan-add.patch
index 03ce6e3..29f1110 100644 (file)
@@ -21,3 +21,4 @@ rhel7/ext4-give-warning-with-dir-htree-growing.patch
 rhel7/ext4-mmp-brelse.patch
 rhel7/ext4-jcb-optimization.patch
 sles12sp1/ext4-attach-jinode-in-writepages.patch
+rhel7/ext4-export-orphan-add.patch
index 1bbca1a..17c80e6 100644 (file)
@@ -25,3 +25,4 @@ sles12sp2/ext4-dont-check-before-replay.patch
 rhel7.2/ext4-dont-check-in-ro.patch
 sles12sp2/ext4-fix-xattr-shifting-when-expanding-inodes.patch
 rhel7/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch
+rhel7/ext4-export-orphan-add.patch
index 3d5da29..bbe697c 100644 (file)
@@ -25,3 +25,4 @@ sles12sp2/ext4-dont-check-before-replay.patch
 rhel7.2/ext4-dont-check-in-ro.patch
 sles12sp2/ext4-fix-xattr-shifting-when-expanding-inodes.patch
 rhel7/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch
+rhel7/ext4-export-orphan-add.patch
index 8b416d3..8ccdd27 100644 (file)
@@ -22,3 +22,4 @@ sles12sp2/ext4-mmp-brelse.patch
 rhel7/ext4-jcb-optimization.patch
 sles12sp2/ext4-attach-jinode-in-writepages.patch
 rhel7/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch
+rhel7/ext4-export-orphan-add.patch
index 6d9f4e0..62e3d82 100644 (file)
@@ -22,3 +22,4 @@ sles12sp2/ext4-mmp-brelse.patch
 rhel7/ext4-jcb-optimization.patch
 sles12sp2/ext4-attach-jinode-in-writepages.patch
 rhel7/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch
+rhel7/ext4-export-orphan-add.patch
index 7151e9a..27f8655 100644 (file)
@@ -22,3 +22,4 @@ sles12sp2/ext4-mmp-brelse.patch
 rhel7/ext4-jcb-optimization.patch
 sles12sp2/ext4-attach-jinode-in-writepages.patch
 rhel7/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch
+rhel7/ext4-export-orphan-add.patch
index 88ebda5..b552a0d 100644 (file)
@@ -22,3 +22,4 @@ sles12sp2/ext4-mmp-brelse.patch
 rhel7/ext4-jcb-optimization.patch
 sles12sp2/ext4-attach-jinode-in-writepages.patch
 rhel7/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch
+rhel7/ext4-export-orphan-add.patch
index 0f34475..db8378b 100644 (file)
@@ -665,6 +665,8 @@ extern char obd_jobid_var[];
 #define OBD_FAIL_BARRIER_DELAY                 0x2202
 #define OBD_FAIL_BARRIER_FAILURE               0x2203
 
+#define OBD_FAIL_OSD_FAIL_AT_TRUNCATE          0x2301
+
 /* Assign references to moved code to reduce code changes */
 #define OBD_FAIL_PRECHECK(id)                   CFS_FAIL_PRECHECK(id)
 #define OBD_FAIL_CHECK(id)                      CFS_FAIL_CHECK(id)
index 70ae175..8aaf426 100644 (file)
@@ -1710,6 +1710,7 @@ static struct thandle *osd_trans_create(const struct lu_env *env,
        oh->ot_credits = 0;
        INIT_LIST_HEAD(&oh->ot_commit_dcb_list);
        INIT_LIST_HEAD(&oh->ot_stop_dcb_list);
+       INIT_LIST_HEAD(&oh->ot_trunc_locks);
        osd_th_alloced(oh);
 
        memset(oti->oti_declare_ops, 0,
@@ -1896,13 +1897,14 @@ static void osd_trans_stop_cb(struct osd_thandle *oth, int result)
 static int osd_trans_stop(const struct lu_env *env, struct dt_device *dt,
                          struct thandle *th)
 {
-       int                     rc = 0, remove_agents = 0;
-       struct osd_thandle     *oh;
        struct osd_thread_info *oti = osd_oti_get(env);
-       struct osd_iobuf       *iobuf = &oti->oti_iobuf;
-       struct osd_device      *osd = osd_dt_dev(th->th_dev);
-       struct qsd_instance    *qsd = osd->od_quota_slave;
-       struct lquota_trans    *qtrans;
+       struct osd_thandle *oh;
+       struct osd_iobuf *iobuf = &oti->oti_iobuf;
+       struct osd_device *osd = osd_dt_dev(th->th_dev);
+       struct qsd_instance *qsd = osd->od_quota_slave;
+       struct lquota_trans *qtrans;
+       struct list_head truncates = LIST_HEAD_INIT(truncates);
+       int rc = 0, remove_agents = 0;
        ENTRY;
 
        oh = container_of0(th, struct osd_thandle, ot_super);
@@ -1912,6 +1914,9 @@ static int osd_trans_stop(const struct lu_env *env, struct dt_device *dt,
        qtrans = oh->ot_quota_trans;
        oh->ot_quota_trans = NULL;
 
+       /* move locks to local list, stop tx, execute truncates */
+       list_splice(&oh->ot_trunc_locks, &truncates);
+
        if (oh->ot_handle != NULL) {
                int rc2;
                 handle_t *hdl = oh->ot_handle;
@@ -1943,11 +1948,15 @@ static int osd_trans_stop(const struct lu_env *env, struct dt_device *dt,
                               osd_name(osd), rc2);
                if (!rc)
                        rc = rc2;
+
+               osd_process_truncates(&truncates);
        } else {
                osd_trans_stop_cb(oh, th->th_result);
                OBD_FREE_PTR(oh);
        }
 
+       osd_trunc_unlock_all(&truncates);
+
        /* inform the quota slave device that the transaction is stopping */
        qsd_op_end(env, qsd, qtrans);
 
index 4b2ca6a..3ea8ccf 100644 (file)
@@ -358,6 +358,13 @@ enum osd_op_type {
        OSD_OT_MAX              = 11
 };
 
+struct osd_access_lock {
+       struct list_head         tl_list;
+       struct osd_object       *tl_obj;
+       bool                     tl_shared;
+       bool                     tl_truncate;
+};
+
 struct osd_thandle {
         struct thandle          ot_super;
         handle_t               *ot_handle;
@@ -379,6 +386,7 @@ struct osd_thandle {
         /** time when this thanle was started */
        ktime_t oth_started;
 #endif
+       struct list_head        ot_trunc_locks;
 };
 
 /**
@@ -1397,4 +1405,10 @@ osd_index_backup(const struct lu_env *env, struct osd_device *osd, bool backup)
 
 #endif
 
+int osd_trunc_lock(struct osd_object *obj, struct osd_thandle *oh,
+                  bool shared);
+void osd_trunc_unlock_all(struct list_head *list);
+void osd_process_truncates(struct list_head *list);
+void osd_execute_truncate(struct osd_object *obj);
+
 #endif /* _OSD_INTERNAL_H */
index 102ac81..95dfd8e 100644 (file)
@@ -1308,6 +1308,9 @@ static int osd_declare_write_commit(const struct lu_env *env,
        if (flags & QUOTA_FL_OVER_PRJQUOTA)
                lnb[0].lnb_flags |= OBD_BRW_OVER_PRJQUOTA;
 
+       if (rc == 0)
+               rc = osd_trunc_lock(osd_dt_obj(dt), oh, true);
+
        RETURN(rc);
 }
 
@@ -1733,6 +1736,10 @@ out:
                                           i_gid_read(inode),
                                           i_projid_read(inode), 0,
                                           oh, obj, NULL, OSD_QID_BLK);
+
+       if (rc == 0)
+               rc = osd_trunc_lock(obj, oh, true);
+
        RETURN(rc);
 }
 
@@ -1913,18 +1920,23 @@ static int osd_declare_punch(const struct lu_env *env, struct dt_object *dt,
        rc = osd_declare_inode_qid(env, i_uid_read(inode), i_gid_read(inode),
                                   i_projid_read(inode), 0, oh, osd_dt_obj(dt),
                                   NULL, OSD_QID_BLK);
+
+       if (rc == 0)
+               rc = osd_trunc_lock(osd_dt_obj(dt), oh, false);
+
        RETURN(rc);
 }
 
 static int osd_punch(const struct lu_env *env, struct dt_object *dt,
                     __u64 start, __u64 end, struct thandle *th)
 {
+       struct osd_object *obj = osd_dt_obj(dt);
+       struct osd_device *osd = osd_obj2dev(obj);
+       struct inode *inode = obj->oo_inode;
+       struct osd_access_lock *al;
        struct osd_thandle *oh;
-       struct osd_object  *obj = osd_dt_obj(dt);
-       struct inode       *inode = obj->oo_inode;
-       handle_t           *h;
-       tid_t               tid;
-       int                rc = 0, rc2 = 0;
+       int rc = 0, found = 0;
+       bool grow = false;
        ENTRY;
 
        LASSERT(end == OBD_OBJECT_EOF);
@@ -1937,49 +1949,51 @@ static int osd_punch(const struct lu_env *env, struct dt_object *dt,
        oh = container_of(th, struct osd_thandle, ot_super);
        LASSERT(oh->ot_handle->h_transaction != NULL);
 
-       osd_trans_exec_op(env, th, OSD_OT_PUNCH);
+       /* We used to skip a truncate to the current size to
+        * optimize truncates on OST. With DoM we can
+        * get attr_set to set a specific size (MDS_REINT)
+        * and then get a truncate RPC which essentially
+        * would be skipped. This is bad, so disable
+        * this optimization on MDS until the client stops
+        * sending MDS_REINT (LU-11033) -bzzz */
+       if (osd->od_is_ost && i_size_read(inode) == start)
+               RETURN(0);
 
-       tid = oh->ot_handle->h_transaction->t_tid;
+       osd_trans_exec_op(env, th, OSD_OT_PUNCH);
 
        spin_lock(&inode->i_lock);
+       if (i_size_read(inode) < start)
+               grow = true;
        i_size_write(inode, start);
        spin_unlock(&inode->i_lock);
        ll_truncate_pagecache(inode, start);
-#ifdef HAVE_INODEOPS_TRUNCATE
-       if (inode->i_op->truncate) {
-               inode->i_op->truncate(inode);
-       } else
-#endif
-               ldiskfs_truncate(inode);
-
-       /*
-        * For a partial-page truncate, flush the page to disk immediately to
-        * avoid data corruption during direct disk write.  b=17397
-        */
-       if ((start & ~PAGE_MASK) != 0)
-                rc = filemap_fdatawrite_range(inode->i_mapping, start, start+1);
 
-        h = journal_current_handle();
-        LASSERT(h != NULL);
-        LASSERT(h == oh->ot_handle);
+       /* optimize grow case */
+       if (grow) {
+               osd_execute_truncate(obj);
+               GOTO(out, rc);
+       }
 
-       /* do not check credits with osd_trans_exec_check() as the truncate
-        * can restart the transaction internally and we restart the
-        * transaction in this case */
+       /* add to orphan list to ensure truncate completion
+        * if this transaction succeeds. ldiskfs_truncate()
+        * will take the inode out of the list */
+       rc = ldiskfs_orphan_add(oh->ot_handle, inode);
+       if (rc != 0)
+               GOTO(out, rc);
 
-        if (tid != h->h_transaction->t_tid) {
-                int credits = oh->ot_credits;
-                /*
-                 * transaction has changed during truncate
-                 * we need to restart the handle with our credits
-                 */
-                if (h->h_buffer_credits < credits) {
-                        if (ldiskfs_journal_extend(h, credits))
-                                rc2 = ldiskfs_journal_restart(h, credits);
-                }
-        }
+       list_for_each_entry(al, &oh->ot_trunc_locks, tl_list) {
+               if (obj != al->tl_obj)
+                       continue;
+               LASSERT(al->tl_shared == 0);
+               found = 1;
+               /* do actual truncate in osd_trans_stop() */
+               al->tl_truncate = 1;
+               break;
+       }
+       LASSERT(found);
 
-        RETURN(rc == 0 ? rc2 : rc);
+out:
+       RETURN(rc);
 }
 
 static int fiemap_check_ranges(struct inode *inode,
@@ -2092,3 +2106,111 @@ const struct dt_body_operations osd_body_ops = {
        .dbo_fiemap_get                 = osd_fiemap_get,
        .dbo_ladvise                    = osd_ladvise,
 };
+
+/**
+ * Get a truncate lock
+ *
+ * In order to take multi-transaction truncate out of main transaction we let
+ * the caller grab a lock on the object passed. The lock can be shared (for
+ * writes) or exclusive (for truncate). It's not allowed to mix truncate
+ * and write in the same transaction handle (do not confuse with big ldiskfs
+ * transaction containing lots of handles).
+ * The lock must be taken at declaration.
+ *
+ * \param obj          object to lock
+ * \param oh           transaction
+ * \param shared       true for shared, false for exclusive
+ *
+ * \retval 0           lock is granted
+ * \retval -ENOMEM     no memory to allocate lock
+ */
+int osd_trunc_lock(struct osd_object *obj, struct osd_thandle *oh, bool shared)
+{
+       struct osd_access_lock *al, *tmp;
+
+       LASSERT(obj);
+       LASSERT(oh);
+
+       list_for_each_entry(tmp, &oh->ot_trunc_locks, tl_list) {
+               if (tmp->tl_obj != obj)
+                       continue;
+               LASSERT(tmp->tl_shared == shared);
+               /* found same lock */
+               return 0;
+       }
+
+       OBD_ALLOC_PTR(al);
+       if (unlikely(al == NULL))
+               return -ENOMEM;
+       al->tl_obj = obj;
+       al->tl_truncate = false;
+       if (shared)
+               down_read(&obj->oo_ext_idx_sem);
+       else
+               down_write(&obj->oo_ext_idx_sem);
+       al->tl_shared = shared;
+
+       list_add(&al->tl_list, &oh->ot_trunc_locks);
+
+       return 0;
+}
+
+void osd_trunc_unlock_all(struct list_head *list)
+{
+       struct osd_access_lock *al, *tmp;
+       list_for_each_entry_safe(al, tmp, list, tl_list) {
+               if (al->tl_shared)
+                       up_read(&al->tl_obj->oo_ext_idx_sem);
+               else
+                       up_write(&al->tl_obj->oo_ext_idx_sem);
+               list_del(&al->tl_list);
+               OBD_FREE_PTR(al);
+       }
+}
+
+void osd_execute_truncate(struct osd_object *obj)
+{
+       struct inode *inode = obj->oo_inode;
+       __u64 size;
+
+       /* simulate a crash before (or in the middle of) delayed truncate */
+       if (OBD_FAIL_CHECK(OBD_FAIL_OSD_FAIL_AT_TRUNCATE)) {
+               struct ldiskfs_inode_info *ei = LDISKFS_I(inode);
+               struct ldiskfs_sb_info *sbi = LDISKFS_SB(inode->i_sb);
+
+               mutex_lock(&sbi->s_orphan_lock);
+               list_del_init(&ei->i_orphan);
+               mutex_unlock(&sbi->s_orphan_lock);
+               return;
+       }
+
+#ifdef HAVE_INODEOPS_TRUNCATE
+       if (inode->i_op->truncate)
+               inode->i_op->truncate(inode);
+       else
+#endif
+               ldiskfs_truncate(inode);
+
+       /*
+        * For a partial-page truncate, flush the page to disk immediately to
+        * avoid data corruption during direct disk write.  b=17397
+        */
+       size = i_size_read(inode);
+       if ((size & ~PAGE_MASK) != 0)
+               filemap_fdatawrite_range(inode->i_mapping, size, size + 1);
+}
+
+void osd_process_truncates(struct list_head *list)
+{
+       struct osd_access_lock *al;
+
+       LASSERT(journal_current_handle() == NULL);
+
+       list_for_each_entry(al, list, tl_list) {
+               if (al->tl_shared)
+                       continue;
+               if (!al->tl_truncate)
+                       continue;
+               osd_execute_truncate(al->tl_obj);
+       }
+}
index bf4956f..9e3a268 100755 (executable)
@@ -16433,6 +16433,45 @@ test_258b() {
 }
 run_test 258b "verify i_mutex security behavior"
 
+test_259() {
+       local file=$DIR/$tfile
+       local before
+       local after
+
+       [ "$(facet_fstype mds1)" != "ldiskfs" ] &&
+               skip "ldiskfs only test" && return
+
+       stack_trap "rm -f $file" EXIT
+
+       wait_delete_completed
+       before=$(do_facet ost1 "$LCTL get_param -n osd-*.*OST0000.kbytesfree")
+       echo "before: $before"
+
+       $LFS setstripe -i 0 -c 1 $file
+       dd if=/dev/zero of=$file bs=1M count=10 || error "couldn't write"
+       sync_all_data
+       after=$(do_facet ost1 "$LCTL get_param -n osd-*.*OST0000.kbytesfree")
+       echo "after write: $after"
+
+#define OBD_FAIL_OSD_FAIL_AT_TRUNCATE          0x2301
+       do_facet ost1 $LCTL set_param fail_loc=0x2301
+       $TRUNCATE $file 0
+       after=$(do_facet ost1 "$LCTL get_param -n osd-*.*OST0000.kbytesfree")
+       echo "after truncate: $after"
+
+       stop ost1
+       do_facet ost1 $LCTL set_param fail_loc=0
+       start ost1 $(ostdevname 1) $OST_MOUNT_OPTS || error "cannot start ost1"
+       sleep 2
+       after=$(do_facet ost1 "$LCTL get_param -n osd-*.*OST0000.kbytesfree")
+       echo "after restart: $after"
+       [ $((after - before)) -ge $(fs_log_size ost1) ] &&
+               error "missing truncate?"
+
+       return 0
+}
+run_test 259 "crash at delayed truncate"
+
 test_260() {
 #define OBD_FAIL_MDC_CLOSE               0x806
        $LCTL set_param fail_loc=0x80000806