From: Alex Zhuravlev Date: Wed, 7 Jun 2017 13:32:39 +0000 (+0400) Subject: LU-10048 osd: async truncate X-Git-Tag: 2.11.53~46 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=cf29a5e7bfa254ccfcea023028fe7da80503c512 LU-10048 osd: async truncate osd-ldiskfs should execute truncate outside of main transaction handle. This avoids restarting truncate transaction handles in main transaction, and allows "transaction first, locking second" model on OST. Change-Id: Iffe45c42834c26ca72b65e068ad25ac61d0607c8 Signed-off-by: Alex Zhuravlev Reviewed-on: https://review.whamcloud.com/27488 Tested-by: Jenkins Reviewed-by: Andreas Dilger Tested-by: Maloo Reviewed-by: Fan Yong --- diff --git a/ldiskfs/kernel_patches/patches/rhel7/ext4-export-orphan-add.patch b/ldiskfs/kernel_patches/patches/rhel7/ext4-export-orphan-add.patch new file mode 100644 index 0000000..8ccd1cd --- /dev/null +++ b/ldiskfs/kernel_patches/patches/rhel7/ext4-export-orphan-add.patch @@ -0,0 +1,13 @@ +Index: linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c +=================================================================== +--- linux-3.10.0-123.13.2.el7.x86_64.orig/fs/ext4/namei.c ++++ linux-3.10.0-123.13.2.el7.x86_64/fs/ext4/namei.c +@@ -3250,6 +48,7 @@ int ext4_orphan_add(handle_t *handle + ext4_std_error(sb, err); + return err; + } ++EXPORT_SYMBOL(ext4_orphan_add); + + /* + * ext4_orphan_del() removes an unlinked or truncated inode from the list + diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.4.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.4.series index 1947efd..412374d 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.4.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.4.series @@ -44,3 +44,4 @@ rhel6.3/ext4-drop-inode-from-orphan-list-if-ext4_delete_inode-fails.patch rhel6.3/ext4-notalloc_under_idatasem.patch rhel6.3/ext4-dont-check-in-ro.patch rhel6.3/ext4-dont-check-before-replay.patch +rhel7/ext4-export-orphan-add.patch 
diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.5.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.5.series index e662543..aca98ab 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.5.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.5.series @@ -48,3 +48,4 @@ rhel6.5/ext4-fix-journal-quota.patch rhel6.3/ext4-dont-check-in-ro.patch rhel6.3/ext4-dont-check-before-replay.patch rhel6.5/ext4-brackets-in-ext4-remove-blocks.patch +rhel7/ext4-export-orphan-add.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.6.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.6.series index 84962a9..14a2e6c 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.6.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.6.series @@ -48,3 +48,4 @@ rhel6.5/ext4-give-warning-with-dir-htree-growing.patch rhel6.6/ext4_s_max_ext_tree_depth.patch rhel6.5/ext4-fix-journal-quota.patch rhel6.5/ext4-brackets-in-ext4-remove-blocks.patch +rhel7/ext4-export-orphan-add.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.7.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.7.series index 51b6734..e6ef809 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.7.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.7.series @@ -47,3 +47,4 @@ rhel6.5/ext4-give-warning-with-dir-htree-growing.patch rhel6.6/ext4_s_max_ext_tree_depth.patch rhel6.5/ext4-fix-journal-quota.patch rhel6.5/ext4-brackets-in-ext4-remove-blocks.patch +rhel7/ext4-export-orphan-add.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.8.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.8.series index 70b2e61..3783f9a 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.8.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.8.series @@ -45,3 +45,4 @@ rhel6.5/ext4-give-warning-with-dir-htree-growing.patch rhel6.6/ext4_s_max_ext_tree_depth.patch rhel6.5/ext4-fix-journal-quota.patch 
rhel6.5/ext4-brackets-in-ext4-remove-blocks.patch +rhel7/ext4-export-orphan-add.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.9.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.9.series index 0c650e4..dddd020 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.9.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.9.series @@ -44,3 +44,4 @@ rhel6.3/ext4-notalloc_under_idatasem.patch rhel6.5/ext4-give-warning-with-dir-htree-growing.patch rhel6.6/ext4_s_max_ext_tree_depth.patch rhel6.5/ext4-fix-journal-quota.patch +rhel7/ext4-export-orphan-add.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.series index 178e310..5dd8566 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-rhel6.series @@ -43,3 +43,4 @@ rhel6.3/ext4-recalc-percpu-counters-after-journal.patch rhel6.3/ext4-notalloc_under_idatasem.patch rhel6.3/ext4-dont-check-in-ro.patch rhel6.3/ext4-dont-check-before-replay.patch +rhel7/ext4-export-orphan-add.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-2.6-sles11.series b/ldiskfs/kernel_patches/series/ldiskfs-2.6-sles11.series index ffffacc..dcf6977 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-2.6-sles11.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-2.6-sles11.series @@ -43,3 +43,4 @@ rhel6.3/ext4-journal-path-opt.patch rhel6.3/ext4-recalc-percpu-counters-after-journal.patch rhel6.3/ext4-notalloc_under_idatasem.patch rhel6.5/ext4-fix-journal-quota.patch +rhel7/ext4-export-orphan-add.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11.series b/ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11.series index dc96ece..a7ba22a 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11.series @@ -40,3 +40,4 @@ rhel6.3/ext4-journal-path-opt.patch sles11sp3/ext4_s_max_ext_tree_depth.patch 
sles11sp1/ext4-notalloc_under_idatasem.patch rhel6.5/ext4-fix-journal-quota.patch +rhel7/ext4-export-orphan-add.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11sp3.series b/ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11sp3.series index be27380..94be6ac 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11sp3.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11sp3.series @@ -44,3 +44,4 @@ sles11sp1/ext4-notalloc_under_idatasem.patch rhel6.5/ext4-fix-journal-quota.patch sles11sp3/ext4-dont-check-before-replay.patch rhel6.3/ext4-dont-check-in-ro.patch +rhel7/ext4-export-orphan-add.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11sp4.series b/ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11sp4.series index 99096f2..bf645c6 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11sp4.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-3.0-sles11sp4.series @@ -44,3 +44,4 @@ sles11sp1/ext4-notalloc_under_idatasem.patch rhel6.5/ext4-fix-journal-quota.patch sles11sp3/ext4-dont-check-before-replay.patch rhel6.3/ext4-dont-check-in-ro.patch +rhel7/ext4-export-orphan-add.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.2.series b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.2.series index cee90e2..7e01b8a 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.2.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.2.series @@ -34,3 +34,4 @@ rhel7/ext4-cleanup-goto-next-group.patch rhel7/ext4-reduce-lock-contention-in-__ext4_new_inode.patch rhel7.2/ext4-preread-gd.patch rhel7/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch +rhel7/ext4-export-orphan-add.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.3.series b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.3.series index 4e1e863..cc5a856 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.3.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.3.series @@ -34,3 +34,4 @@ 
rhel7/ext4-cleanup-goto-next-group.patch rhel7/ext4-reduce-lock-contention-in-__ext4_new_inode.patch rhel7.2/ext4-preread-gd.patch rhel7/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch +rhel7/ext4-export-orphan-add.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.4.series b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.4.series index e845452..98ebf46 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.4.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.4.series @@ -34,3 +34,4 @@ rhel7/ext4-cleanup-goto-next-group.patch rhel7/ext4-reduce-lock-contention-in-__ext4_new_inode.patch rhel7.2/ext4-preread-gd.patch rhel7/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch +rhel7/ext4-export-orphan-add.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.5.series b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.5.series index 8f2c5b7..6159b07 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.5.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.5.series @@ -33,3 +33,4 @@ rhel7/ext4-cleanup-goto-next-group.patch rhel7/ext4-reduce-lock-contention-in-__ext4_new_inode.patch rhel7.2/ext4-preread-gd.patch rhel7/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch +rhel7/ext4-export-orphan-add.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.series b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.series index 9cc8057..645bb3d 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-3.10-rhel7.series @@ -29,3 +29,4 @@ rhel7/ext4-projid-xfs-ioctls.patch rhel7/ext4-fix-xattr-shifting-when-expanding-inodes.patch rhel7/ext4-cleanup-goto-next-group.patch rhel7/ext4-reduce-lock-contention-in-__ext4_new_inode.patch +rhel7/ext4-export-orphan-add.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-3.12-sles12.series b/ldiskfs/kernel_patches/series/ldiskfs-3.12-sles12.series index 286fbf1..b3c7b95 100644 --- 
a/ldiskfs/kernel_patches/series/ldiskfs-3.12-sles12.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-3.12-sles12.series @@ -20,3 +20,4 @@ sles12/ext4-corrupted-inode-block-bitmaps-handling-patches.patch rhel7/ext4-give-warning-with-dir-htree-growing.patch rhel7/ext4-mmp-brelse.patch rhel7/ext4-jcb-optimization.patch +rhel7/ext4-export-orphan-add.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-3.12-sles12sp1.series b/ldiskfs/kernel_patches/series/ldiskfs-3.12-sles12sp1.series index 03ce6e3..29f1110 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-3.12-sles12sp1.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-3.12-sles12sp1.series @@ -21,3 +21,4 @@ rhel7/ext4-give-warning-with-dir-htree-growing.patch rhel7/ext4-mmp-brelse.patch rhel7/ext4-jcb-optimization.patch sles12sp1/ext4-attach-jinode-in-writepages.patch +rhel7/ext4-export-orphan-add.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-4.4-sles12sp2.series b/ldiskfs/kernel_patches/series/ldiskfs-4.4-sles12sp2.series index 1bbca1a..17c80e6 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-4.4-sles12sp2.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-4.4-sles12sp2.series @@ -25,3 +25,4 @@ sles12sp2/ext4-dont-check-before-replay.patch rhel7.2/ext4-dont-check-in-ro.patch sles12sp2/ext4-fix-xattr-shifting-when-expanding-inodes.patch rhel7/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch +rhel7/ext4-export-orphan-add.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-4.4-sles12sp3.series b/ldiskfs/kernel_patches/series/ldiskfs-4.4-sles12sp3.series index 3d5da29..bbe697c 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-4.4-sles12sp3.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-4.4-sles12sp3.series @@ -25,3 +25,4 @@ sles12sp2/ext4-dont-check-before-replay.patch rhel7.2/ext4-dont-check-in-ro.patch sles12sp2/ext4-fix-xattr-shifting-when-expanding-inodes.patch rhel7/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch +rhel7/ext4-export-orphan-add.patch diff --git 
a/ldiskfs/kernel_patches/series/ldiskfs-4.4.0-45-ubuntu14+16.series b/ldiskfs/kernel_patches/series/ldiskfs-4.4.0-45-ubuntu14+16.series index 8b416d3..8ccdd27 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-4.4.0-45-ubuntu14+16.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-4.4.0-45-ubuntu14+16.series @@ -22,3 +22,4 @@ sles12sp2/ext4-mmp-brelse.patch rhel7/ext4-jcb-optimization.patch sles12sp2/ext4-attach-jinode-in-writepages.patch rhel7/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch +rhel7/ext4-export-orphan-add.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-4.4.0-49-ubuntu14+16.series b/ldiskfs/kernel_patches/series/ldiskfs-4.4.0-49-ubuntu14+16.series index 6d9f4e0..62e3d82 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-4.4.0-49-ubuntu14+16.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-4.4.0-49-ubuntu14+16.series @@ -22,3 +22,4 @@ sles12sp2/ext4-mmp-brelse.patch rhel7/ext4-jcb-optimization.patch sles12sp2/ext4-attach-jinode-in-writepages.patch rhel7/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch +rhel7/ext4-export-orphan-add.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-4.4.0-62-ubuntu14+16.series b/ldiskfs/kernel_patches/series/ldiskfs-4.4.0-62-ubuntu14+16.series index 7151e9a..27f8655 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-4.4.0-62-ubuntu14+16.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-4.4.0-62-ubuntu14+16.series @@ -22,3 +22,4 @@ sles12sp2/ext4-mmp-brelse.patch rhel7/ext4-jcb-optimization.patch sles12sp2/ext4-attach-jinode-in-writepages.patch rhel7/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch +rhel7/ext4-export-orphan-add.patch diff --git a/ldiskfs/kernel_patches/series/ldiskfs-4.4.0-73-ubuntu14+16.series b/ldiskfs/kernel_patches/series/ldiskfs-4.4.0-73-ubuntu14+16.series index 88ebda5..b552a0d 100644 --- a/ldiskfs/kernel_patches/series/ldiskfs-4.4.0-73-ubuntu14+16.series +++ b/ldiskfs/kernel_patches/series/ldiskfs-4.4.0-73-ubuntu14+16.series @@ -22,3 +22,4 @@ 
sles12sp2/ext4-mmp-brelse.patch rhel7/ext4-jcb-optimization.patch sles12sp2/ext4-attach-jinode-in-writepages.patch rhel7/ext4-use-GFP_NOFS-in-ext4_inode_attach_jinode.patch +rhel7/ext4-export-orphan-add.patch diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 0f34475..db8378b 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -665,6 +665,8 @@ extern char obd_jobid_var[]; #define OBD_FAIL_BARRIER_DELAY 0x2202 #define OBD_FAIL_BARRIER_FAILURE 0x2203 +#define OBD_FAIL_OSD_FAIL_AT_TRUNCATE 0x2301 + /* Assign references to moved code to reduce code changes */ #define OBD_FAIL_PRECHECK(id) CFS_FAIL_PRECHECK(id) #define OBD_FAIL_CHECK(id) CFS_FAIL_CHECK(id) diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c index 70ae175..8aaf426 100644 --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -1710,6 +1710,7 @@ static struct thandle *osd_trans_create(const struct lu_env *env, oh->ot_credits = 0; INIT_LIST_HEAD(&oh->ot_commit_dcb_list); INIT_LIST_HEAD(&oh->ot_stop_dcb_list); + INIT_LIST_HEAD(&oh->ot_trunc_locks); osd_th_alloced(oh); memset(oti->oti_declare_ops, 0, @@ -1896,13 +1897,14 @@ static void osd_trans_stop_cb(struct osd_thandle *oth, int result) static int osd_trans_stop(const struct lu_env *env, struct dt_device *dt, struct thandle *th) { - int rc = 0, remove_agents = 0; - struct osd_thandle *oh; struct osd_thread_info *oti = osd_oti_get(env); - struct osd_iobuf *iobuf = &oti->oti_iobuf; - struct osd_device *osd = osd_dt_dev(th->th_dev); - struct qsd_instance *qsd = osd->od_quota_slave; - struct lquota_trans *qtrans; + struct osd_thandle *oh; + struct osd_iobuf *iobuf = &oti->oti_iobuf; + struct osd_device *osd = osd_dt_dev(th->th_dev); + struct qsd_instance *qsd = osd->od_quota_slave; + struct lquota_trans *qtrans; + struct list_head truncates = LIST_HEAD_INIT(truncates); + int rc = 0, remove_agents = 0; ENTRY; oh = container_of0(th, struct osd_thandle, 
ot_super); @@ -1912,6 +1914,9 @@ static int osd_trans_stop(const struct lu_env *env, struct dt_device *dt, qtrans = oh->ot_quota_trans; oh->ot_quota_trans = NULL; + /* move locks to local list, stop tx, execute truncates */ + list_splice(&oh->ot_trunc_locks, &truncates); + if (oh->ot_handle != NULL) { int rc2; handle_t *hdl = oh->ot_handle; @@ -1943,11 +1948,15 @@ static int osd_trans_stop(const struct lu_env *env, struct dt_device *dt, osd_name(osd), rc2); if (!rc) rc = rc2; + + osd_process_truncates(&truncates); } else { osd_trans_stop_cb(oh, th->th_result); OBD_FREE_PTR(oh); } + osd_trunc_unlock_all(&truncates); + /* inform the quota slave device that the transaction is stopping */ qsd_op_end(env, qsd, qtrans); diff --git a/lustre/osd-ldiskfs/osd_internal.h b/lustre/osd-ldiskfs/osd_internal.h index 4b2ca6a..3ea8ccf 100644 --- a/lustre/osd-ldiskfs/osd_internal.h +++ b/lustre/osd-ldiskfs/osd_internal.h @@ -358,6 +358,13 @@ enum osd_op_type { OSD_OT_MAX = 11 }; +struct osd_access_lock { + struct list_head tl_list; + struct osd_object *tl_obj; + bool tl_shared; + bool tl_truncate; +}; + struct osd_thandle { struct thandle ot_super; handle_t *ot_handle; @@ -379,6 +386,7 @@ struct osd_thandle { /** time when this thanle was started */ ktime_t oth_started; #endif + struct list_head ot_trunc_locks; }; /** @@ -1397,4 +1405,10 @@ osd_index_backup(const struct lu_env *env, struct osd_device *osd, bool backup) #endif +int osd_trunc_lock(struct osd_object *obj, struct osd_thandle *oh, + bool shared); +void osd_trunc_unlock_all(struct list_head *list); +void osd_process_truncates(struct list_head *list); +void osd_execute_truncate(struct osd_object *obj); + #endif /* _OSD_INTERNAL_H */ diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c index 102ac81..95dfd8e 100644 --- a/lustre/osd-ldiskfs/osd_io.c +++ b/lustre/osd-ldiskfs/osd_io.c @@ -1308,6 +1308,9 @@ static int osd_declare_write_commit(const struct lu_env *env, if (flags & QUOTA_FL_OVER_PRJQUOTA) 
lnb[0].lnb_flags |= OBD_BRW_OVER_PRJQUOTA; + if (rc == 0) + rc = osd_trunc_lock(osd_dt_obj(dt), oh, true); + RETURN(rc); } @@ -1733,6 +1736,10 @@ out: i_gid_read(inode), i_projid_read(inode), 0, oh, obj, NULL, OSD_QID_BLK); + + if (rc == 0) + rc = osd_trunc_lock(obj, oh, true); + RETURN(rc); } @@ -1913,18 +1920,23 @@ static int osd_declare_punch(const struct lu_env *env, struct dt_object *dt, rc = osd_declare_inode_qid(env, i_uid_read(inode), i_gid_read(inode), i_projid_read(inode), 0, oh, osd_dt_obj(dt), NULL, OSD_QID_BLK); + + if (rc == 0) + rc = osd_trunc_lock(osd_dt_obj(dt), oh, false); + RETURN(rc); } static int osd_punch(const struct lu_env *env, struct dt_object *dt, __u64 start, __u64 end, struct thandle *th) { + struct osd_object *obj = osd_dt_obj(dt); + struct osd_device *osd = osd_obj2dev(obj); + struct inode *inode = obj->oo_inode; + struct osd_access_lock *al; struct osd_thandle *oh; - struct osd_object *obj = osd_dt_obj(dt); - struct inode *inode = obj->oo_inode; - handle_t *h; - tid_t tid; - int rc = 0, rc2 = 0; + int rc = 0, found = 0; + bool grow = false; ENTRY; LASSERT(end == OBD_OBJECT_EOF); @@ -1937,49 +1949,51 @@ static int osd_punch(const struct lu_env *env, struct dt_object *dt, oh = container_of(th, struct osd_thandle, ot_super); LASSERT(oh->ot_handle->h_transaction != NULL); - osd_trans_exec_op(env, th, OSD_OT_PUNCH); + /* we used to skip truncate to current size to + * optimize truncates on OST. with DoM we can + * get attr_set to set specific size (MDS_REINT) + * and then get truncate RPC which essentially + * would be skipped. this is bad.. 
so, disable + * this optimization on MDS until the client stops + * sending MDS_REINT (LU-11033) -bzzz */ + if (osd->od_is_ost && i_size_read(inode) == start) + RETURN(0); - tid = oh->ot_handle->h_transaction->t_tid; + osd_trans_exec_op(env, th, OSD_OT_PUNCH); spin_lock(&inode->i_lock); + if (i_size_read(inode) < start) + grow = true; i_size_write(inode, start); spin_unlock(&inode->i_lock); ll_truncate_pagecache(inode, start); -#ifdef HAVE_INODEOPS_TRUNCATE - if (inode->i_op->truncate) { - inode->i_op->truncate(inode); - } else -#endif - ldiskfs_truncate(inode); - - /* - * For a partial-page truncate, flush the page to disk immediately to - * avoid data corruption during direct disk write. b=17397 - */ - if ((start & ~PAGE_MASK) != 0) - rc = filemap_fdatawrite_range(inode->i_mapping, start, start+1); - h = journal_current_handle(); - LASSERT(h != NULL); - LASSERT(h == oh->ot_handle); + /* optimize grow case */ + if (grow) { + osd_execute_truncate(obj); + GOTO(out, rc); + } - /* do not check credits with osd_trans_exec_check() as the truncate - * can restart the transaction internally and we restart the - * transaction in this case */ + /* add to orphan list to ensure truncate completion + * if this transaction succeeds. ldiskfs_truncate() + * will take the inode out of the list */ + rc = ldiskfs_orphan_add(oh->ot_handle, inode); + if (rc != 0) + GOTO(out, rc); - if (tid != h->h_transaction->t_tid) { - int credits = oh->ot_credits; - /* - * transaction has changed during truncate - * we need to restart the handle with our credits - */ - if (h->h_buffer_credits < credits) { - if (ldiskfs_journal_extend(h, credits)) - rc2 = ldiskfs_journal_restart(h, credits); - } - } + list_for_each_entry(al, &oh->ot_trunc_locks, tl_list) { + if (obj != al->tl_obj) + continue; + LASSERT(al->tl_shared == 0); + found = 1; + /* do actual truncate in osd_trans_stop() */ + al->tl_truncate = 1; + break; + } + LASSERT(found); - RETURN(rc == 0 ? 
rc2 : rc); +out: + RETURN(rc); } static int fiemap_check_ranges(struct inode *inode, @@ -2092,3 +2106,111 @@ const struct dt_body_operations osd_body_ops = { .dbo_fiemap_get = osd_fiemap_get, .dbo_ladvise = osd_ladvise, }; + +/** + * Get a truncate lock + * + * In order to take multi-transaction truncate out of main transaction we let + * the caller grab a lock on the object passed. The lock can be shared (for + * writes) or exclusive (for truncate). It's not allowed to mix truncate + * and write in the same transaction handle (do not confuse with big ldiskfs + * transaction containing lots of handles). + * The lock must be taken at declaration. + * + * \param obj object to lock + * \param oh transaction + * \param shared shared or exclusive + * + * \retval 0 lock is granted + * \retval -ENOMEM no memory to allocate lock + */ +int osd_trunc_lock(struct osd_object *obj, struct osd_thandle *oh, bool shared) +{ + struct osd_access_lock *al, *tmp; + + LASSERT(obj); + LASSERT(oh); + + list_for_each_entry(tmp, &oh->ot_trunc_locks, tl_list) { + if (tmp->tl_obj != obj) + continue; + LASSERT(tmp->tl_shared == shared); + /* found same lock */ + return 0; + } + + OBD_ALLOC_PTR(al); + if (unlikely(al == NULL)) + return -ENOMEM; + al->tl_obj = obj; + al->tl_truncate = false; + if (shared) + down_read(&obj->oo_ext_idx_sem); + else + down_write(&obj->oo_ext_idx_sem); + al->tl_shared = shared; + + list_add(&al->tl_list, &oh->ot_trunc_locks); + + return 0; +} + +void osd_trunc_unlock_all(struct list_head *list) +{ + struct osd_access_lock *al, *tmp; + list_for_each_entry_safe(al, tmp, list, tl_list) { + if (al->tl_shared) + up_read(&al->tl_obj->oo_ext_idx_sem); + else + up_write(&al->tl_obj->oo_ext_idx_sem); + list_del(&al->tl_list); + OBD_FREE_PTR(al); + } +} + +void osd_execute_truncate(struct osd_object *obj) +{ + struct inode *inode = obj->oo_inode; + __u64 size; + + /* simulate crash before (in the middle) of delayed truncate */ + if (OBD_FAIL_CHECK(OBD_FAIL_OSD_FAIL_AT_TRUNCATE)) { + 
struct ldiskfs_inode_info *ei = LDISKFS_I(inode); + struct ldiskfs_sb_info *sbi = LDISKFS_SB(inode->i_sb); + + mutex_lock(&sbi->s_orphan_lock); + list_del_init(&ei->i_orphan); + mutex_unlock(&sbi->s_orphan_lock); + return; + } + +#ifdef HAVE_INODEOPS_TRUNCATE + if (inode->i_op->truncate) + inode->i_op->truncate(inode); + else +#endif + ldiskfs_truncate(inode); + + /* + * For a partial-page truncate, flush the page to disk immediately to + * avoid data corruption during direct disk write. b=17397 + */ + size = i_size_read(inode); + if ((size & ~PAGE_MASK) != 0) + filemap_fdatawrite_range(inode->i_mapping, size, size + 1); +} + +void osd_process_truncates(struct list_head *list) +{ + struct osd_access_lock *al; + + LASSERT(journal_current_handle() == NULL); + + list_for_each_entry(al, list, tl_list) { + if (al->tl_shared) + continue; + if (!al->tl_truncate) + continue; + osd_execute_truncate(al->tl_obj); + } +} diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index bf4956f..9e3a268 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -16433,6 +16433,45 @@ test_258b() { } run_test 258b "verify i_mutex security behavior" +test_259() { + local file=$DIR/$tfile + local before + local after + + [ "$(facet_fstype mds1)" != "ldiskfs" ] && + skip "ldiskfs only test" && return + + stack_trap "rm -f $file" EXIT + + wait_delete_completed + before=$(do_facet ost1 "$LCTL get_param -n osd-*.*OST0000.kbytesfree") + echo "before: $before" + + $LFS setstripe -i 0 -c 1 $file + dd if=/dev/zero of=$file bs=1M count=10 || error "couldn't write" + sync_all_data + after=$(do_facet ost1 "$LCTL get_param -n osd-*.*OST0000.kbytesfree") + echo "after write: $after" + +#define OBD_FAIL_OSD_FAIL_AT_TRUNCATE 0x2301 + do_facet ost1 $LCTL set_param fail_loc=0x2301 + $TRUNCATE $file 0 + after=$(do_facet ost1 "$LCTL get_param -n osd-*.*OST0000.kbytesfree") + echo "after truncate: $after" + + stop ost1 + do_facet ost1 $LCTL set_param fail_loc=0 + start ost1 $(ostdevname 1) 
$OST_MOUNT_OPTS || error "cannot start ost1" + sleep 2 + after=$(do_facet ost1 "$LCTL get_param -n osd-*.*OST0000.kbytesfree") + echo "after restart: $after" + [ $((after - before)) -ge $(fs_log_size ost1) ] && + error "missing truncate?" + + return 0 +} +run_test 259 "crash at delayed truncate" + test_260() { #define OBD_FAIL_MDC_CLOSE 0x806 $LCTL set_param fail_loc=0x80000806