From 5cdd444502f6554588b7cb19ed9f727ca9946b28 Mon Sep 17 00:00:00 2001 From: Alex Zhuravlev Date: Tue, 6 May 2025 16:11:47 -0700 Subject: [PATCH] LU-13132 osd: osd-zfs to cache dbufs for llog objects working set for llog objects is tiny and very predictable. osd-zfs can cache a couple of dbufs (first block storing the header and last block for new records). for sanity/60a (llog test) it gives 5939307 hits and 5776 misses while average osd_write() goes down from 1.09 usec to 0.27 usec, total time for sanity/60a: before - 153s, after - 101s. this approach can be used in a few other cases like last_rcvd. Lustre-change: https://review.whamcloud.com/37222 Lustre-commit: 11a89c5ec16685fda91dd7c052b72012833c2f88 Was-Change-Id: Icc0126658894085d33ef79ae41ac6c1ed4140f4c LU-16479 utils: Add option to manage degraded ZFS OST Add new Lustre specific ZFS dataset user property to control/manage degraded ZFS OSTs, also modify the existing lustre/scripts/statechange-lustre.sh zedlet accordingly. Extend the same to mkfs.lustre utility to add this property by default when creating a new Lustre ZFS server. Lustre-change: https://review.whamcloud.com/49660 Lustre-commit: a2de6af65d21bff0d9357c30e6eb4ba049ff2059 Was-Change-Id: I7032538f507c9ad20d5b109b54e3c3bab8138458 HPE-bug-id: LUS-11447 Signed-off-by: Akash B LU-14918 osd: don't declare similar zfs writes twice in some cases (like overstriping) the same operations can be declared multiple times (new llog records) and this leads to a huge number of credits and performance degradation. we can avoid this by checking for duplicate declarations. notice each declare operation results in an allocation in ZFS. the example for an overstriped file (2000 stripes over 4 OSTs), declare ops before -> after: create: 2001 -> 2, unlink: 10001 -> 10. creation of 1K-stripe files (over 4 OSTs) is 2.5% faster. removal of 1K-stripe files is 44% faster. single-stripe file creation/removal does not degrade. 
Lustre-change: https://review.whamcloud.com/49701 Lustre-commit: c1936c9d294d53ff39741e1b07ffc74f51fcddb6 Was-Change-Id: I5d9e6d3a1574ccd7bf97fd3a67ab4fff0b6a352c LU-14918 osd: don't declare similar ldiskfs writes twice in some cases (like overstriping) the same operations can be declared multiple times (new llog records) and this leads to a huge number of credits and performance degradation. we can avoid this by checking for duplicate declarations. As every declaration would need an allocation, limit the scope of these checks to transactions likely to be large. % of "large" transactions in sanity-benchmark, depending on threshold: creates < 5 && writes < 5: 0.58% (mds1) and 2.97% (mds2) creates < 7 && writes < 7: 0.58% and 2.4% creates < 9 && writes < 9: 0.6% and 1.85% creates < 10 && writes < 10: 0.0004% and 0.000001% thus 10 creates or writes is selected as a threshold to enable this logic. Lustre-change: https://review.whamcloud.com/45765 Lustre-commit: 9e6225b2e7385cbb7be0474df01075fafc4966d5 Was-Change-Id: I7c893fe3b95646b4b813b999bc832659dfcf03ad LU-15642 obdclass: use consistent stats units Use consistent stats units, since some were "usec" and others "usecs". Most stats already use LPROCFS_TYPE_* to encode the stats type, so use this to provide units for those stats, and only explicitly provide strings for the few stats that don't match the commonly-used units. This also reduces the number of repeated static strings in the modules. 
Lustre-change: https://review.whamcloud.com/46833 Lustre-commit: b515c6ec2ab84598c77c65eb78f1afd5e67b1ede Was-Change-Id: I25f31478f238072ddbf9a3918cd43bb08c3ebbe5 Signed-off-by: Andreas Dilger LU-16150 zfs: Fix ZFS(2.1.99-1) build error on CentOS (3.10) ZFS: (2.1.99-1) Lustre: 27723374a38 LU-16073 utils: double snapshot_mount fix CentOS: 3.10.0-1160.15.2.el7.x86_64 This patch fixes build failures seen as below for the above configuration: First: make[4]: Entering directory `/root/lustre01/lustre-release/lustre/utils' gcc -rdynamic -shared -export-dynamic -pthread \ -L/root/zfs/zfs_git_lustre_build/zfs//lib/libzfs/.libs/ -L/root/zfs/zfs_git_lustre_build/zfs//lib/libnvpair/.libs/ -L/root/zfs/zfs_git_lustre_build/zfs//lib/libzpool/.libs/ -o mount_osd_zfs.so \ `ar -t libmount_utils_zfs.a` \ -ldl -lzfs -lnvpair -lzpool /usr/bin/ld: cannot find -lzfs /usr/bin/ld: cannot find -lnvpair /usr/bin/ld: cannot find -lzpool collect2: error: ld returned 1 exit status Lustre-change: https://review.whamcloud.com/48536 Lustre-commit: 448963c9a33dbf0e0988ceeb407027f2488e7f42 Was-Change-Id: I32f270c7912379f7dce940e0aa2bceee5e49ad79 Signed-off-by: Arshad Hussain LU-15611 osd-zfs: Cleanup while mount failed Cleanup is needed in the error-out path in osd-zfs. 
Lustre-change: https://review.whamcloud.com/46678 Lustre-commit: 9b973ad37f66a10eb7db1ced6865708497ecc02b Was-Change-Id: I47d9ee9483acb8e1d60c77e8cfc481902a1535ac Signed-off-by: Yang Sheng Test-Parameters: optional fstype=zfs mdtcount=4 mdscount=2 \ clientdistro=el9.5 serverdistro=el8.10 testgroup=full-dne-zfs-part-1 Test-Parameters: optional fstype=zfs mdtcount=4 mdscount=2 \ clientdistro=el9.5 serverdistro=el8.10 testgroup=full-dne-zfs-part-2 Test-Parameters: optional fstype=zfs mdtcount=4 mdscount=2 \ clientdistro=el9.5 serverdistro=el8.10 testgroup=full-dne-zfs-part-3 Change-Id: Icc0126658894085d33ef79ae41ac6c1ed4140f4c Signed-off-by: Alex Zhuravlev Reviewed-on: https://review.whamcloud.com/c/fs/lustre-release/+/59123 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Yang Sheng Reviewed-by: Oleg Drokin --- config/lustre-build-zfs.m4 | 2 +- contrib/lbuild/lbuild | 6 +- lustre/include/lprocfs_status.h | 49 +++++---- lustre/include/uapi/linux/lustre/lustre_param.h | 1 + lustre/ldlm/ldlm_pool.c | 34 ++++--- lustre/ldlm/ldlm_resource.c | 3 +- lustre/llite/lproc_llite.c | 33 +++--- lustre/mdt/mdt_lproc.c | 7 +- lustre/mgs/lproc_mgs.c | 24 ++--- lustre/mgs/mgs_llog.c | 4 + lustre/obdclass/class_obd.c | 4 +- lustre/obdclass/dt_object.c | 9 +- lustre/obdclass/llog_osd.c | 7 +- lustre/obdclass/lprocfs_status.c | 82 ++++++++++----- lustre/obdclass/lu_object.c | 27 ++--- lustre/obdecho/echo.c | 8 +- lustre/ofd/lproc_ofd.c | 30 +++--- lustre/osd-ldiskfs/osd_handler.c | 73 ++++++++++++++ lustre/osd-ldiskfs/osd_internal.h | 16 +++ lustre/osd-ldiskfs/osd_io.c | 3 + lustre/osd-ldiskfs/osd_lproc.c | 66 ++++++------ lustre/osd-zfs/osd_handler.c | 124 ++++++++++++----------- lustre/osd-zfs/osd_internal.h | 5 + lustre/osd-zfs/osd_io.c | 129 ++++++++++++++++++++++-- lustre/osd-zfs/osd_lproc.c | 44 ++++---- lustre/osd-zfs/osd_object.c | 10 ++ lustre/ptlrpc/lproc_ptlrpc.c | 32 +++--- lustre/scripts/statechange-lustre.sh | 9 +- lustre/tests/recovery-small.sh | 4 +- 
lustre/tests/sanity.sh | 33 ++++++ lustre/utils/mkfs_lustre.c | 7 ++ 31 files changed, 598 insertions(+), 287 deletions(-) diff --git a/config/lustre-build-zfs.m4 b/config/lustre-build-zfs.m4 index 455aa10..02e791f 100644 --- a/config/lustre-build-zfs.m4 +++ b/config/lustre-build-zfs.m4 @@ -331,7 +331,7 @@ AC_DEFUN([LB_ZFS_USER], [ AS_IF([test -z "${zfsinc}"], [ AS_IF([test -e "${zfssrc}/include/libzfs.h" && test -e "${zfssrc}/lib/libspl/include"], [ zfsinc="-I $zfssrc/lib/libspl/include -I $zfssrc/lib/libspl/include/os/linux -I $zfssrc/include" - zfslib="-L$zfssrc/lib/libzfs/.libs/ -L$zfssrc/lib/libnvpair/.libs/ -L$zfssrc/lib/libzpool/.libs/" + zfslib="-L$zfssrc/.libs/ -L$zfssrc/lib/libzfs/.libs/ -L$zfssrc/lib/libnvpair/.libs/ -L$zfssrc/lib/libzpool/.libs/" ], [test -d /usr/include/libzfs && test -d /usr/include/libspl], [ zfsinc="-I/usr/include/libspl -I /usr/include/libzfs" zfslib="" diff --git a/contrib/lbuild/lbuild b/contrib/lbuild/lbuild index 58a2bf8..b08bf12 100755 --- a/contrib/lbuild/lbuild +++ b/contrib/lbuild/lbuild @@ -1066,7 +1066,11 @@ build_spl_zfs() { speclist="$pkg.spec $rpmpkg.spec" else rpmpkg=kmod-$pkg-devel - specdir=rpm/generic + if [[ $DISTROMAJ > "rhel8" ]]; then + specdir=rpm/redhat + else + specdir=rpm/generic + fi speclist="$pkg.spec $pkg-kmod.spec $pkg-dkms.spec" fi diff --git a/lustre/include/lprocfs_status.h b/lustre/include/lprocfs_status.h index 68912fb..7cf9b8e 100644 --- a/lustre/include/lprocfs_status.h +++ b/lustre/include/lprocfs_status.h @@ -140,18 +140,22 @@ struct rename_stats { * multiply per counter increment. 
*/ -enum { +enum lprocfs_counter_config { LPROCFS_CNTR_EXTERNALLOCK = 0x0001, LPROCFS_CNTR_AVGMINMAX = 0x0002, LPROCFS_CNTR_STDDEV = 0x0004, - /* counter data type */ - LPROCFS_TYPE_REQS = 0x0100, + /* counter unit type */ + LPROCFS_TYPE_REQS = 0x0000, /* default if config = 0 */ LPROCFS_TYPE_BYTES = 0x0200, LPROCFS_TYPE_PAGES = 0x0400, - LPROCFS_TYPE_USEC = 0x0800, + LPROCFS_TYPE_LOCKS = 0x0500, + LPROCFS_TYPE_LOCKSPS = 0x0600, + LPROCFS_TYPE_SECS = 0x0700, + LPROCFS_TYPE_USECS = 0x0800, + LPROCFS_TYPE_MASK = 0x0f00, - LPROCFS_TYPE_LATENCY = LPROCFS_TYPE_USEC | + LPROCFS_TYPE_LATENCY = LPROCFS_TYPE_USECS | LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, LPROCFS_TYPE_BYTES_FULL = LPROCFS_TYPE_BYTES | @@ -161,9 +165,9 @@ enum { #define LC_MIN_INIT ((~(__u64)0) >> 1) struct lprocfs_counter_header { - unsigned int lc_config; - const char *lc_name; /* must be static */ - const char *lc_units; /* must be static */ + enum lprocfs_counter_config lc_config; + const char *lc_name; /* must be static */ + const char *lc_units; /* must be static */ }; struct lprocfs_counter { @@ -535,8 +539,11 @@ extern int lprocfs_alloc_obd_stats(struct obd_device *obd, extern int lprocfs_alloc_md_stats(struct obd_device *obd, unsigned int num_private_stats); extern void lprocfs_counter_init(struct lprocfs_stats *stats, int index, - unsigned conf, const char *name, - const char *units); + enum lprocfs_counter_config config, + const char *name); +extern void lprocfs_counter_init_units(struct lprocfs_stats *stats, int index, + enum lprocfs_counter_config config, + const char *name, const char *units); extern void lprocfs_free_obd_stats(struct obd_device *obd); extern void lprocfs_free_md_stats(struct obd_device *obd); struct obd_export; @@ -920,24 +927,26 @@ int lprocfs_wr_nosquash_nids(const char __user *buffer, unsigned long count, #define proc_lustre_root NULL static inline void lprocfs_counter_add(struct lprocfs_stats *stats, - int index, long amount) + int index, long amount) { return; } 
-static inline void lprocfs_counter_incr(struct lprocfs_stats *stats, - int index) +static inline void lprocfs_counter_incr(struct lprocfs_stats *stats, int index) { return; } static inline void lprocfs_counter_sub(struct lprocfs_stats *stats, - int index, long amount) + int index, long amount) +{ return; } +static inline void lprocfs_counter_decr(struct lprocfs_stats *stats, int index) { return; } -static inline void lprocfs_counter_decr(struct lprocfs_stats *stats, - int index) +static inline void lprocfs_counter_init(struct lprocfs_stats *stats, int index, + enum lprocfs_counter_config config, + const char *name) { return; } -static inline void lprocfs_counter_init(struct lprocfs_stats *stats, - int index, unsigned conf, - const char *name, const char *units) +static inline void lprocfs_counter_init_units(struct lprocfs_stats *stats, + int index, enum lprocfs_counter_config config, + const char *name, const char *units) { return; } static inline __u64 lc_read_helper(struct lprocfs_counter *lc, - enum lprocfs_fields_flags field) + enum lprocfs_fields_flags field) { return 0; } /* NB: we return !NULL to satisfy error checker */ diff --git a/lustre/include/uapi/linux/lustre/lustre_param.h b/lustre/include/uapi/linux/lustre/lustre_param.h index 8b91770..e9df823 100644 --- a/lustre/include/uapi/linux/lustre/lustre_param.h +++ b/lustre/include/uapi/linux/lustre/lustre_param.h @@ -70,6 +70,7 @@ #define PARAM_ID_UPCALL "identity_upcall=" /* identity upcall */ #define PARAM_ROOTSQUASH "root_squash=" /* root squash */ #define PARAM_NOSQUASHNIDS "nosquash_nids=" /* no squash nids */ +#define PARAM_AUTODEGRADE "autodegrade=" /* autodegrade OST's */ /* Prefixes for parameters handled by obd's proc methods (XXX_process_config) */ #define PARAM_OST "ost." 
diff --git a/lustre/ldlm/ldlm_pool.c b/lustre/ldlm/ldlm_pool.c index d23240c7..48e32bd 100644 --- a/lustre/ldlm/ldlm_pool.c +++ b/lustre/ldlm/ldlm_pool.c @@ -847,27 +847,37 @@ static int ldlm_pool_debugfs_init(struct ldlm_pool *pl) GOTO(out, rc = -ENOMEM); lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANTED_STAT, - LPROCFS_CNTR_AVGMINMAX, "granted", "locks"); + LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_LOCKS, + "granted"); lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_STAT, - LPROCFS_CNTR_AVGMINMAX, "grant", "locks"); + LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_LOCKS, + "grant"); lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_STAT, - LPROCFS_CNTR_AVGMINMAX, "cancel", "locks"); + LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_LOCKS, + "cancel"); lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT, - LPROCFS_CNTR_AVGMINMAX, "grant_rate", "locks/s"); + LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_LOCKSPS, + "grant_rate"); lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_RATE_STAT, - LPROCFS_CNTR_AVGMINMAX, "cancel_rate", "locks/s"); + LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_LOCKSPS, + "cancel_rate"); lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_PLAN_STAT, - LPROCFS_CNTR_AVGMINMAX, "grant_plan", "locks/s"); - lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SLV_STAT, - LPROCFS_CNTR_AVGMINMAX, "slv", "slv"); + LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_LOCKSPS, + "grant_plan"); + lprocfs_counter_init_units(pl->pl_stats, LDLM_POOL_SLV_STAT, + LPROCFS_CNTR_AVGMINMAX, "slv", "lock.secs"); lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_REQTD_STAT, - LPROCFS_CNTR_AVGMINMAX, "shrink_request", "locks"); + LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_LOCKS, + "shrink_request"); lprocfs_counter_init(pl->pl_stats, LDLM_POOL_SHRINK_FREED_STAT, - LPROCFS_CNTR_AVGMINMAX, "shrink_freed", "locks"); + LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_LOCKS, + "shrink_freed"); lprocfs_counter_init(pl->pl_stats, LDLM_POOL_RECALC_STAT, - LPROCFS_CNTR_AVGMINMAX, "recalc_freed", "locks"); + 
LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_LOCKS, + "recalc_freed"); lprocfs_counter_init(pl->pl_stats, LDLM_POOL_TIMING_STAT, - LPROCFS_CNTR_AVGMINMAX, "recalc_timing", "sec"); + LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_SECS, + "recalc_timing"); debugfs_create_file("stats", 0644, pl->pl_debugfs_entry, pl->pl_stats, &ldebugfs_stats_seq_fops); diff --git a/lustre/ldlm/ldlm_resource.c b/lustre/ldlm/ldlm_resource.c index 92fef33..6233920 100644 --- a/lustre/ldlm/ldlm_resource.c +++ b/lustre/ldlm/ldlm_resource.c @@ -744,7 +744,8 @@ int ldlm_namespace_sysfs_register(struct ldlm_namespace *ns) } lprocfs_counter_init(ns->ns_stats, LDLM_NSS_LOCKS, - LPROCFS_CNTR_AVGMINMAX, "locks", "locks"); + LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_LOCKS, + "locks"); return err; } diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c index 4dac2ae..094b29e 100644 --- a/lustre/llite/lproc_llite.c +++ b/lustre/llite/lproc_llite.c @@ -39,6 +39,7 @@ #include #include "llite_internal.h" +#include "lprocfs_status.h" #include "vvp_internal.h" static struct kobject *llite_kobj; @@ -1812,9 +1813,9 @@ static struct kobj_type sbi_ktype = { }; static const struct llite_file_opcode { - __u32 opcode; - __u32 type; - const char *opname; + __u32 lfo_opcode; + enum lprocfs_counter_config lfo_config; + const char *lfo_opname; } llite_opcode_table[LPROC_LL_FILE_OPCODES] = { /* file operation */ { LPROC_LL_READ_BYTES, LPROCFS_TYPE_BYTES_FULL, "read_bytes" }, @@ -1830,8 +1831,7 @@ static const struct llite_file_opcode { { LPROC_LL_LLSEEK, LPROCFS_TYPE_LATENCY, "seek" }, { LPROC_LL_FSYNC, LPROCFS_TYPE_LATENCY, "fsync" }, { LPROC_LL_READDIR, LPROCFS_TYPE_LATENCY, "readdir" }, - { LPROC_LL_INODE_OCOUNT,LPROCFS_TYPE_REQS | - LPROCFS_CNTR_AVGMINMAX | + { LPROC_LL_INODE_OCOUNT, LPROCFS_TYPE_REQS | LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, "opencount" }, { LPROC_LL_INODE_OPCLTM,LPROCFS_TYPE_LATENCY, "openclosetime" }, /* inode operation */ @@ -1839,7 +1839,7 @@ static const struct llite_file_opcode { { 
LPROC_LL_TRUNC, LPROCFS_TYPE_LATENCY, "truncate" }, { LPROC_LL_FLOCK, LPROCFS_TYPE_LATENCY, "flock" }, { LPROC_LL_GETATTR, LPROCFS_TYPE_LATENCY, "getattr" }, - { LPROC_LL_FALLOCATE, LPROCFS_TYPE_LATENCY, "fallocate"}, + { LPROC_LL_FALLOCATE, LPROCFS_TYPE_LATENCY, "fallocate"}, /* dir inode operation */ { LPROC_LL_CREATE, LPROCFS_TYPE_LATENCY, "create" }, { LPROC_LL_LINK, LPROCFS_TYPE_LATENCY, "link" }, @@ -1933,20 +1933,11 @@ int ll_debugfs_register_super(struct super_block *sb, const char *name) GOTO(out_debugfs, err = -ENOMEM); /* do counter init */ - for (id = 0; id < LPROC_LL_FILE_OPCODES; id++) { - u32 type = llite_opcode_table[id].type; - void *ptr = "unknown"; - - if (type & LPROCFS_TYPE_REQS) - ptr = "reqs"; - else if (type & LPROCFS_TYPE_BYTES) - ptr = "bytes"; - else if (type & LPROCFS_TYPE_USEC) - ptr = "usec"; + for (id = 0; id < LPROC_LL_FILE_OPCODES; id++) lprocfs_counter_init(sbi->ll_stats, - llite_opcode_table[id].opcode, type, - llite_opcode_table[id].opname, ptr); - } + llite_opcode_table[id].lfo_opcode, + llite_opcode_table[id].lfo_config, + llite_opcode_table[id].lfo_opname); debugfs_create_file("stats", 0644, sbi->ll_debugfs_entry, sbi->ll_stats, &ldebugfs_stats_seq_fops); @@ -1957,8 +1948,8 @@ int ll_debugfs_register_super(struct super_block *sb, const char *name) GOTO(out_stats, err = -ENOMEM); for (id = 0; id < ARRAY_SIZE(ra_stat_string); id++) - lprocfs_counter_init(sbi->ll_ra_stats, id, 0, - ra_stat_string[id], "pages"); + lprocfs_counter_init(sbi->ll_ra_stats, id, LPROCFS_TYPE_PAGES, + ra_stat_string[id]); debugfs_create_file("read_ahead_stats", 0644, sbi->ll_debugfs_entry, sbi->ll_ra_stats, &ldebugfs_stats_seq_fops); diff --git a/lustre/mdt/mdt_lproc.c b/lustre/mdt/mdt_lproc.c index ad4e861..9001e37 100644 --- a/lustre/mdt/mdt_lproc.c +++ b/lustre/mdt/mdt_lproc.c @@ -1596,11 +1596,10 @@ void mdt_stats_counter_init(struct lprocfs_stats *stats, unsigned int offset) midx == LPROC_MDT_IO_WRITE_BYTES) lprocfs_counter_init(stats, oidx, 
LPROCFS_TYPE_BYTES_FULL, - mdt_stats[midx], "bytes"); + mdt_stats[midx]); else - lprocfs_counter_init(stats, oidx, - LPROCFS_TYPE_LATENCY, - mdt_stats[midx], "usecs"); + lprocfs_counter_init(stats, oidx, LPROCFS_TYPE_LATENCY, + mdt_stats[midx]); } } diff --git a/lustre/mgs/lproc_mgs.c b/lustre/mgs/lproc_mgs.c index 5344f18..41cf93d 100644 --- a/lustre/mgs/lproc_mgs.c +++ b/lustre/mgs/lproc_mgs.c @@ -346,27 +346,25 @@ void lproc_mgs_cleanup(struct mgs_device *mgs) mgs->mgs_proc_live = NULL; } - lprocfs_free_per_client_stats(obd); + lprocfs_free_per_client_stats(obd); lprocfs_obd_cleanup(obd); - lprocfs_free_obd_stats(obd); - lprocfs_free_md_stats(obd); + lprocfs_free_obd_stats(obd); + lprocfs_free_md_stats(obd); } void mgs_counter_incr(struct obd_export *exp, int opcode) { - lprocfs_counter_incr(exp->exp_obd->obd_stats, opcode); - if (exp->exp_nid_stats && exp->exp_nid_stats->nid_stats != NULL) - lprocfs_counter_incr(exp->exp_nid_stats->nid_stats, opcode); + lprocfs_counter_incr(exp->exp_obd->obd_stats, opcode); + if (exp->exp_nid_stats && exp->exp_nid_stats->nid_stats != NULL) + lprocfs_counter_incr(exp->exp_nid_stats->nid_stats, opcode); } void mgs_stats_counter_init(struct lprocfs_stats *stats) { - lprocfs_counter_init(stats, LPROC_MGS_CONNECT, 0, "connect", "reqs"); - lprocfs_counter_init(stats, LPROC_MGS_DISCONNECT, 0, "disconnect", - "reqs"); - lprocfs_counter_init(stats, LPROC_MGS_EXCEPTION, 0, "exception", - "reqs"); - lprocfs_counter_init(stats, LPROC_MGS_TARGET_REG, 0, "tgtreg", "reqs"); - lprocfs_counter_init(stats, LPROC_MGS_TARGET_DEL, 0, "tgtdel", "reqs"); + lprocfs_counter_init(stats, LPROC_MGS_CONNECT, 0, "connect"); + lprocfs_counter_init(stats, LPROC_MGS_DISCONNECT, 0, "disconnect"); + lprocfs_counter_init(stats, LPROC_MGS_EXCEPTION, 0, "exception"); + lprocfs_counter_init(stats, LPROC_MGS_TARGET_REG, 0, "tgtreg"); + lprocfs_counter_init(stats, LPROC_MGS_TARGET_DEL, 0, "tgtdel"); } #endif diff --git a/lustre/mgs/mgs_llog.c b/lustre/mgs/mgs_llog.c 
index 7a117b9..d66aebe 100644 --- a/lustre/mgs/mgs_llog.c +++ b/lustre/mgs/mgs_llog.c @@ -4385,6 +4385,10 @@ active_err: GOTO(end, rc); } + /* For handling degraded zfs OST */ + if (class_match_param(ptr, PARAM_AUTODEGRADE, NULL) == 0) + GOTO(end, rc); + LCONSOLE_WARN("Ignoring unrecognized param '%s'\n", ptr); end: diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index f0c6118..ddd7fe6 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -694,8 +694,8 @@ static int __init obdclass_init(void) } lprocfs_counter_init(obd_memory, OBD_MEMORY_STAT, - LPROCFS_CNTR_AVGMINMAX, - "memused", "bytes"); + LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_BYTES, + "memused"); #endif err = obd_zombie_impexp_init(); if (err) diff --git a/lustre/obdclass/dt_object.c b/lustre/obdclass/dt_object.c index ee17b36..b7059fb 100644 --- a/lustre/obdclass/dt_object.c +++ b/lustre/obdclass/dt_object.c @@ -1244,11 +1244,11 @@ int dt_tunables_fini(struct dt_device *dt) if (!dt) return -EINVAL; - if (dt->dd_def_attrs) + if (dt->dd_def_attrs) { sysfs_remove_files(&dt->dd_kobj, dt->dd_def_attrs); - - kobject_put(&dt->dd_kobj); - wait_for_completion(&dt->dd_kobj_unregister); + kobject_put(&dt->dd_kobj); + wait_for_completion(&dt->dd_kobj_unregister); + } return 0; } @@ -1273,6 +1273,7 @@ int dt_tunables_init(struct dt_device *dt, struct obd_type *type, rc = sysfs_create_files(&dt->dd_kobj, dt->dd_def_attrs); if (rc) { kobject_put(&dt->dd_kobj); + dt->dd_def_attrs = NULL; return rc; } diff --git a/lustre/obdclass/llog_osd.c b/lustre/obdclass/llog_osd.c index 8354dd5..ddd507e 100644 --- a/lustre/obdclass/llog_osd.c +++ b/lustre/obdclass/llog_osd.c @@ -339,16 +339,15 @@ static int llog_osd_declare_write_rec(const struct lu_env *env, lgi->lgi_buf.lb_len = chunk_size; lgi->lgi_buf.lb_buf = NULL; /* each time we update header */ - rc = dt_declare_record_write(env, o, &lgi->lgi_buf, 0, - th); + rc = dt_declare_record_write(env, o, &lgi->lgi_buf, 0, th); if (rc || idx 
== 0) /* if error or just header */ RETURN(rc); /** * the pad record can be inserted so take into account double - * record size + * record size: pad and the actual record into a new block */ - lgi->lgi_buf.lb_len = chunk_size * 2; + lgi->lgi_buf.lb_len = rec->lrh_len * 2; lgi->lgi_buf.lb_buf = NULL; /* XXX: implement declared window or multi-chunks approach */ rc = dt_declare_record_write(env, o, &lgi->lgi_buf, -1, th); diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index 88c2c4b..0176888 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -1484,8 +1484,34 @@ int lprocfs_register_stats(struct proc_dir_entry *root, const char *name, } EXPORT_SYMBOL(lprocfs_register_stats); -void lprocfs_counter_init(struct lprocfs_stats *stats, int index, - unsigned conf, const char *name, const char *units) +static const char *lprocfs_counter_config_units(const char *name, + enum lprocfs_counter_config config) +{ + const char *units; + + switch (config & LPROCFS_TYPE_MASK) { + default: + units = "reqs"; break; + case LPROCFS_TYPE_BYTES: + units = "bytes"; break; + case LPROCFS_TYPE_PAGES: + units = "pages"; break; + case LPROCFS_TYPE_LOCKS: + units = "locks"; break; + case LPROCFS_TYPE_LOCKSPS: + units = "locks/s"; break; + case LPROCFS_TYPE_SECS: + units = "secs"; break; + case LPROCFS_TYPE_USECS: + units = "usecs"; break; + } + + return units; +} + +void lprocfs_counter_init_units(struct lprocfs_stats *stats, int index, + enum lprocfs_counter_config config, + const char *name, const char *units) { struct lprocfs_counter_header *header; struct lprocfs_counter *percpu_cntr; @@ -1499,9 +1525,9 @@ void lprocfs_counter_init(struct lprocfs_stats *stats, int index, LASSERTF(header != NULL, "Failed to allocate stats header:[%d]%s/%s\n", index, name, units); - header->lc_config = conf; - header->lc_name = name; - header->lc_units = units; + header->lc_config = config; + header->lc_name = name; + header->lc_units = 
units; num_cpu = lprocfs_stats_lock(stats, LPROCFS_GET_NUM_CPU, &flags); for (i = 0; i < num_cpu; ++i) { @@ -1518,6 +1544,15 @@ void lprocfs_counter_init(struct lprocfs_stats *stats, int index, } lprocfs_stats_unlock(stats, LPROCFS_GET_NUM_CPU, &flags); } +EXPORT_SYMBOL(lprocfs_counter_init_units); + +void lprocfs_counter_init(struct lprocfs_stats *stats, int index, + enum lprocfs_counter_config config, + const char *name) +{ + lprocfs_counter_init_units(stats, index, config, name, + lprocfs_counter_config_units(name, config)); +} EXPORT_SYMBOL(lprocfs_counter_init); static const char * const mps_stats[] = { @@ -1563,7 +1598,8 @@ int lprocfs_alloc_md_stats(struct obd_device *obd, return -ENOMEM; for (i = 0; i < ARRAY_SIZE(mps_stats); i++) { - lprocfs_counter_init(stats, i, 0, mps_stats[i], "reqs"); + lprocfs_counter_init(stats, i, LPROCFS_TYPE_REQS, + mps_stats[i]); if (!stats->ls_cnt_header[i].lc_name) { CERROR("Missing md_stat initializer md_op operation at offset %d. Aborting.\n", i); @@ -1595,24 +1631,18 @@ EXPORT_SYMBOL(lprocfs_free_md_stats); void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats) { - lprocfs_counter_init(ldlm_stats, - LDLM_ENQUEUE - LDLM_FIRST_OPC, - 0, "ldlm_enqueue", "reqs"); - lprocfs_counter_init(ldlm_stats, - LDLM_CONVERT - LDLM_FIRST_OPC, - 0, "ldlm_convert", "reqs"); - lprocfs_counter_init(ldlm_stats, - LDLM_CANCEL - LDLM_FIRST_OPC, - 0, "ldlm_cancel", "reqs"); - lprocfs_counter_init(ldlm_stats, - LDLM_BL_CALLBACK - LDLM_FIRST_OPC, - 0, "ldlm_bl_callback", "reqs"); - lprocfs_counter_init(ldlm_stats, - LDLM_CP_CALLBACK - LDLM_FIRST_OPC, - 0, "ldlm_cp_callback", "reqs"); - lprocfs_counter_init(ldlm_stats, - LDLM_GL_CALLBACK - LDLM_FIRST_OPC, - 0, "ldlm_gl_callback", "reqs"); + lprocfs_counter_init(ldlm_stats, LDLM_ENQUEUE - LDLM_FIRST_OPC, + LPROCFS_TYPE_REQS, "ldlm_enqueue"); + lprocfs_counter_init(ldlm_stats, LDLM_CONVERT - LDLM_FIRST_OPC, + LPROCFS_TYPE_REQS, "ldlm_convert"); + lprocfs_counter_init(ldlm_stats, LDLM_CANCEL - 
LDLM_FIRST_OPC, + LPROCFS_TYPE_REQS, "ldlm_cancel"); + lprocfs_counter_init(ldlm_stats, LDLM_BL_CALLBACK - LDLM_FIRST_OPC, + LPROCFS_TYPE_REQS, "ldlm_bl_callback"); + lprocfs_counter_init(ldlm_stats, LDLM_CP_CALLBACK - LDLM_FIRST_OPC, + LPROCFS_TYPE_REQS, "ldlm_cp_callback"); + lprocfs_counter_init(ldlm_stats, LDLM_GL_CALLBACK - LDLM_FIRST_OPC, + LPROCFS_TYPE_REQS, "ldlm_gl_callback"); } EXPORT_SYMBOL(lprocfs_init_ldlm_stats); @@ -1642,7 +1672,9 @@ __s64 lprocfs_read_helper(struct lprocfs_counter *lc, ret = lc->lc_max; break; case LPROCFS_FIELDS_FLAGS_AVG: - ret = (lc->lc_max - lc->lc_min) / 2; + ret = div64_u64((flags & LPROCFS_STATS_FLAG_IRQ_SAFE ? + lc->lc_sum_irq : 0) + lc->lc_sum, + lc->lc_count); break; case LPROCFS_FIELDS_FLAGS_SUMSQUARE: ret = lc->lc_sumsquare; diff --git a/lustre/obdclass/lu_object.c b/lustre/obdclass/lu_object.c index e5ac808..0c40f08 100644 --- a/lustre/obdclass/lu_object.c +++ b/lustre/obdclass/lu_object.c @@ -1160,24 +1160,19 @@ int lu_site_init(struct lu_site *s, struct lu_device *top) return -ENOMEM; } - lprocfs_counter_init(s->ls_stats, LU_SS_CREATED, - 0, "created", "created"); - lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_HIT, - 0, "cache_hit", "cache_hit"); - lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_MISS, - 0, "cache_miss", "cache_miss"); - lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_RACE, - 0, "cache_race", "cache_race"); - lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_DEATH_RACE, - 0, "cache_death_race", "cache_death_race"); - lprocfs_counter_init(s->ls_stats, LU_SS_LRU_PURGED, - 0, "lru_purged", "lru_purged"); + lprocfs_counter_init(s->ls_stats, LU_SS_CREATED, 0, "created"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_HIT, 0, "cache_hit"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_MISS, 0, "cache_miss"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_RACE, 0, "cache_race"); + lprocfs_counter_init(s->ls_stats, LU_SS_CACHE_DEATH_RACE, + 0, "cache_death_race"); + lprocfs_counter_init(s->ls_stats, 
LU_SS_LRU_PURGED, 0, "lru_purged"); INIT_LIST_HEAD(&s->ls_linkage); - s->ls_top_dev = top; - top->ld_site = s; - lu_device_get(top); - lu_ref_add(&top->ld_reference, "site-top", s); + s->ls_top_dev = top; + top->ld_site = s; + lu_device_get(top); + lu_ref_add(&top->ld_reference, "site-top", s); INIT_LIST_HEAD(&s->ls_ld_linkage); spin_lock_init(&s->ls_ld_lock); diff --git a/lustre/obdecho/echo.c b/lustre/obdecho/echo.c index b65d01a..5de02d8 100644 --- a/lustre/obdecho/echo.c +++ b/lustre/obdecho/echo.c @@ -779,11 +779,11 @@ static int echo_srv_init0(const struct lu_env *env, if (!lprocfs_obd_setup(obd, true) && lprocfs_alloc_obd_stats(obd, LPROC_ECHO_LAST) == 0) { lprocfs_counter_init(obd->obd_stats, LPROC_ECHO_READ_BYTES, - LPROCFS_CNTR_AVGMINMAX, - "read_bytes", "bytes"); + LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES, + "read_bytes"); lprocfs_counter_init(obd->obd_stats, LPROC_ECHO_WRITE_BYTES, - LPROCFS_CNTR_AVGMINMAX, - "write_bytes", "bytes"); + LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_BYTES, + "write_bytes"); } ptlrpc_init_client(LDLM_CB_REQUEST_PORTAL, LDLM_CB_REPLY_PORTAL, diff --git a/lustre/ofd/lproc_ofd.c b/lustre/ofd/lproc_ofd.c index 413adc2..068bac5 100644 --- a/lustre/ofd/lproc_ofd.c +++ b/lustre/ofd/lproc_ofd.c @@ -1059,35 +1059,35 @@ void ofd_stats_counter_init(struct lprocfs_stats *stats, unsigned int offset) LASSERT(stats && stats->ls_num >= LPROC_OFD_STATS_LAST); lprocfs_counter_init(stats, LPROC_OFD_STATS_READ_BYTES, - LPROCFS_TYPE_BYTES_FULL, "read_bytes", "bytes"); + LPROCFS_TYPE_BYTES_FULL, "read_bytes"); lprocfs_counter_init(stats, LPROC_OFD_STATS_WRITE_BYTES, - LPROCFS_TYPE_BYTES_FULL, "write_bytes", "bytes"); + LPROCFS_TYPE_BYTES_FULL, "write_bytes"); lprocfs_counter_init(stats, LPROC_OFD_STATS_READ, - LPROCFS_TYPE_LATENCY, "read", "usecs"); + LPROCFS_TYPE_LATENCY, "read"); lprocfs_counter_init(stats, LPROC_OFD_STATS_WRITE, - LPROCFS_TYPE_LATENCY, "write", "usecs"); + LPROCFS_TYPE_LATENCY, "write"); lprocfs_counter_init(stats, 
LPROC_OFD_STATS_GETATTR, - LPROCFS_TYPE_LATENCY, "getattr", "usecs"); + LPROCFS_TYPE_LATENCY, "getattr"); lprocfs_counter_init(stats, LPROC_OFD_STATS_SETATTR, - LPROCFS_TYPE_LATENCY, "setattr", "usecs"); + LPROCFS_TYPE_LATENCY, "setattr"); lprocfs_counter_init(stats, LPROC_OFD_STATS_PUNCH, - LPROCFS_TYPE_LATENCY, "punch", "usecs"); + LPROCFS_TYPE_LATENCY, "punch"); lprocfs_counter_init(stats, LPROC_OFD_STATS_SYNC, - LPROCFS_TYPE_LATENCY, "sync", "usecs"); + LPROCFS_TYPE_LATENCY, "sync"); lprocfs_counter_init(stats, LPROC_OFD_STATS_DESTROY, - LPROCFS_TYPE_LATENCY, "destroy", "usecs"); + LPROCFS_TYPE_LATENCY, "destroy"); lprocfs_counter_init(stats, LPROC_OFD_STATS_CREATE, - LPROCFS_TYPE_LATENCY, "create", "usecs"); + LPROCFS_TYPE_LATENCY, "create"); lprocfs_counter_init(stats, LPROC_OFD_STATS_STATFS, - LPROCFS_TYPE_LATENCY, "statfs", "usecs"); + LPROCFS_TYPE_LATENCY, "statfs"); lprocfs_counter_init(stats, LPROC_OFD_STATS_GET_INFO, - LPROCFS_TYPE_LATENCY, "get_info", "usecs"); + LPROCFS_TYPE_LATENCY, "get_info"); lprocfs_counter_init(stats, LPROC_OFD_STATS_SET_INFO, - LPROCFS_TYPE_LATENCY, "set_info", "usecs"); + LPROCFS_TYPE_LATENCY, "set_info"); lprocfs_counter_init(stats, LPROC_OFD_STATS_QUOTACTL, - LPROCFS_TYPE_LATENCY, "quotactl", "usecs"); + LPROCFS_TYPE_LATENCY, "quotactl"); lprocfs_counter_init(stats, LPROC_OFD_STATS_PREALLOC, - LPROCFS_TYPE_LATENCY, "prealloc", "usecs"); + LPROCFS_TYPE_LATENCY, "prealloc"); } LPROC_SEQ_FOPS(lprocfs_nid_stats_clear); diff --git a/lustre/osd-ldiskfs/osd_handler.c b/lustre/osd-ldiskfs/osd_handler.c index 181cd50..de10d2a 100644 --- a/lustre/osd-ldiskfs/osd_handler.c +++ b/lustre/osd-ldiskfs/osd_handler.c @@ -1764,6 +1764,70 @@ static void __osd_th_check_slow(void *oth, struct osd_device *dev, #endif /* OSD_THANDLE_STATS */ /* + * in some cases (like overstriped files) the same operations on the same + * objects are declared many times and this may lead to huge number of + * credits which can be a problem and/or cause performance 
degradation. + * this function is to remember what declarations have been made within + * a given thandle and then skip duplications. + * limit it's scope so that regular small transactions don't need all + * this overhead with allocations, lists. + * also, limit scope to the specific objects like llogs, etc. + */ +static inline bool osd_check_special_fid(const struct lu_fid *f) +{ + if (fid_seq_is_llog(f->f_seq)) + return true; + if (f->f_seq == FID_SEQ_LOCAL_FILE && + f->f_oid == MDD_LOV_OBJ_OID) + return true; + return false; +} + +bool osd_tx_was_declared(const struct lu_env *env, struct osd_thandle *oth, + struct dt_object *dt, enum dt_txn_op op, loff_t pos) +{ + const struct lu_fid *fid = lu_object_fid(&dt->do_lu); + struct osd_device *osd = osd_obj2dev(osd_dt_obj(dt)); + struct osd_thread_info *oti = osd_oti_get(env); + struct osd_obj_declare *old; + + if (osd->od_is_ost) + return false; + + /* small transactions don't need this overhead */ + if (oti->oti_declare_ops[DTO_OBJECT_CREATE] < 10 && + oti->oti_declare_ops[DTO_WRITE_BASE] < 10) + return false; + + if (osd_check_special_fid(fid) == 0) + return false; + + list_for_each_entry(old, &oth->ot_declare_list, old_list) { + if (old->old_op == op && old->old_pos == pos && + lu_fid_eq(&old->old_fid, fid)) + return true; + } + OBD_ALLOC_PTR(old); + if (unlikely(old == NULL)) + return false; + old->old_fid = *lu_object_fid(&dt->do_lu); + old->old_op = op; + old->old_pos = pos; + list_add(&old->old_list, &oth->ot_declare_list); + return false; +} + +void osd_tx_declaration_free(struct osd_thandle *oth) +{ + struct osd_obj_declare *old, *tmp; + + list_for_each_entry_safe(old, tmp, &oth->ot_declare_list, old_list) { + list_del_init(&old->old_list); + OBD_FREE_PTR(old); + } +} + +/* * Concurrency: doesn't access mutable data. 
*/ static int osd_param_is_not_sane(const struct osd_device *dev, @@ -1848,6 +1912,7 @@ static struct thandle *osd_trans_create(const struct lu_env *env, INIT_LIST_HEAD(&oh->ot_commit_dcb_list); INIT_LIST_HEAD(&oh->ot_stop_dcb_list); INIT_LIST_HEAD(&oh->ot_trunc_locks); + INIT_LIST_HEAD(&oh->ot_declare_list); osd_th_alloced(oh); memset(oti->oti_declare_ops, 0, @@ -1938,6 +2003,9 @@ static int osd_trans_start(const struct lu_env *env, struct dt_device *d, static unsigned long last_printed; static int last_credits; + lprocfs_counter_add(dev->od_stats, + LPROC_OSD_TOO_MANY_CREDITS, 1); + /* * don't make noise on a tiny testing systems * actual credits misuse will be caught anyway @@ -2068,6 +2136,8 @@ static int osd_trans_stop(const struct lu_env *env, struct dt_device *dt, qtrans = oh->ot_quota_trans; oh->ot_quota_trans = NULL; + osd_tx_declaration_free(oh); + /* move locks to local list, stop tx, execute truncates */ list_splice(&oh->ot_trunc_locks, &truncates); @@ -3714,6 +3784,9 @@ static int osd_declare_create(const struct lu_env *env, struct dt_object *dt, oh = container_of(handle, struct osd_thandle, ot_super); LASSERT(oh->ot_handle == NULL); + if (osd_tx_was_declared(env, oh, dt, DTO_OBJECT_CREATE, 0)) + RETURN(0); + /* * EA object consumes more credits than regular object: osd_mk_index * vs. 
osd_mkreg: osd_mk_index will create 2 blocks for root_node and diff --git a/lustre/osd-ldiskfs/osd_internal.h b/lustre/osd-ldiskfs/osd_internal.h index b0b4253..5eba489 100644 --- a/lustre/osd-ldiskfs/osd_internal.h +++ b/lustre/osd-ldiskfs/osd_internal.h @@ -451,6 +451,11 @@ struct osd_thandle { ktime_t oth_started; #endif struct list_head ot_trunc_locks; + /* + * list of declarations, used for large transactions to check + * for duplicates, like llogs + */ + struct list_head ot_declare_list; }; /** @@ -471,6 +476,13 @@ enum dt_txn_op { DTO_NR }; +struct osd_obj_declare { + struct list_head old_list; + struct lu_fid old_fid; + enum dt_txn_op old_op; + loff_t old_pos; +}; + /* * osd dev stats */ @@ -490,6 +502,7 @@ enum { LPROC_OSD_THANDLE_OPEN, LPROC_OSD_THANDLE_CLOSING, #endif + LPROC_OSD_TOO_MANY_CREDITS, LPROC_OSD_LAST, }; #endif @@ -1747,6 +1760,9 @@ static inline bool bdev_integrity_enabled(struct block_device *bdev, int rw) return false; } +bool osd_tx_was_declared(const struct lu_env *env, struct osd_thandle *oth, + struct dt_object *dt, enum dt_txn_op op, loff_t p); + #ifdef HAVE_DQUOT_TRANSFER_WITH_USER_NS #define osd_dquot_transfer(ns, i, a) dquot_transfer((ns), (i), (a)) #else diff --git a/lustre/osd-ldiskfs/osd_io.c b/lustre/osd-ldiskfs/osd_io.c index ad40d6f..e0f34de 100644 --- a/lustre/osd-ldiskfs/osd_io.c +++ b/lustre/osd-ldiskfs/osd_io.c @@ -1955,6 +1955,9 @@ static ssize_t osd_declare_write(const struct lu_env *env, struct dt_object *dt, bits = sb->s_blocksize_bits; bs = 1 << bits; + if (osd_tx_was_declared(env, oh, dt, DTO_WRITE_BASE, _pos)) + RETURN(0); + if (_pos == -1) { /* if this is an append, then we * should expect cross-block record diff --git a/lustre/osd-ldiskfs/osd_lproc.c b/lustre/osd-ldiskfs/osd_lproc.c index 4abf45c..5bc504c 100644 --- a/lustre/osd-ldiskfs/osd_lproc.c +++ b/lustre/osd-ldiskfs/osd_lproc.c @@ -84,32 +84,34 @@ static int osd_stats_init(struct osd_device *osd) ENTRY; osd->od_stats = lprocfs_alloc_stats(LPROC_OSD_LAST, 
0); if (osd->od_stats) { - lprocfs_counter_init(osd->od_stats, LPROC_OSD_GET_PAGE, - LPROCFS_CNTR_AVGMINMAX|LPROCFS_CNTR_STDDEV, - "get_page", "usec"); - lprocfs_counter_init(osd->od_stats, LPROC_OSD_NO_PAGE, - LPROCFS_CNTR_AVGMINMAX, - "get_page_failures", "num"); - lprocfs_counter_init(osd->od_stats, LPROC_OSD_CACHE_ACCESS, - LPROCFS_CNTR_AVGMINMAX, - "cache_access", "pages"); - lprocfs_counter_init(osd->od_stats, LPROC_OSD_CACHE_HIT, - LPROCFS_CNTR_AVGMINMAX, - "cache_hit", "pages"); - lprocfs_counter_init(osd->od_stats, LPROC_OSD_CACHE_MISS, - LPROCFS_CNTR_AVGMINMAX, - "cache_miss", "pages"); + lprocfs_counter_init(osd->od_stats, LPROC_OSD_GET_PAGE, + LPROCFS_TYPE_LATENCY, "get_page"); + lprocfs_counter_init(osd->od_stats, LPROC_OSD_NO_PAGE, + LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_REQS, + "get_page_failures"); + lprocfs_counter_init(osd->od_stats, LPROC_OSD_CACHE_ACCESS, + LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES, + "cache_access"); + lprocfs_counter_init(osd->od_stats, LPROC_OSD_CACHE_HIT, + LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES, + "cache_hit"); + lprocfs_counter_init(osd->od_stats, LPROC_OSD_CACHE_MISS, + LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_PAGES, + "cache_miss"); #if OSD_THANDLE_STATS - lprocfs_counter_init(osd->od_stats, LPROC_OSD_THANDLE_STARTING, - LPROCFS_CNTR_AVGMINMAX, - "thandle starting", "usec"); - lprocfs_counter_init(osd->od_stats, LPROC_OSD_THANDLE_OPEN, - LPROCFS_CNTR_AVGMINMAX, - "thandle open", "usec"); - lprocfs_counter_init(osd->od_stats, LPROC_OSD_THANDLE_CLOSING, - LPROCFS_CNTR_AVGMINMAX, - "thandle closing", "usec"); + lprocfs_counter_init(osd->od_stats, LPROC_OSD_THANDLE_STARTING, + LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_USECS, + "thandle starting"); + lprocfs_counter_init(osd->od_stats, LPROC_OSD_THANDLE_OPEN, + LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_USECS, + "thandle open"); + lprocfs_counter_init(osd->od_stats, LPROC_OSD_THANDLE_CLOSING, + LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_USECS, + "thandle closing"); #endif + 
lprocfs_counter_init(osd->od_stats, LPROC_OSD_TOO_MANY_CREDITS, + LPROCFS_CNTR_AVGMINMAX|LPROCFS_TYPE_REQS, + "many_credits"); result = 0; } @@ -787,14 +789,14 @@ ssize_t index_backup_store(struct kobject *kobj, struct attribute *attr, LUSTRE_RW_ATTR(index_backup); struct ldebugfs_vars ldebugfs_osd_obd_vars[] = { - { .name = "oi_scrub", - .fops = &ldiskfs_osd_oi_scrub_fops }, - { .name = "readcache_max_filesize", - .fops = &ldiskfs_osd_readcache_fops }, - { .name = "readcache_max_io_mb", - .fops = &ldiskfs_osd_readcache_max_io_fops }, - { .name = "writethrough_max_io_mb", - .fops = &ldiskfs_osd_writethrough_max_io_fops }, + { .name = "oi_scrub", + .fops = &ldiskfs_osd_oi_scrub_fops }, + { .name = "readcache_max_filesize", + .fops = &ldiskfs_osd_readcache_fops }, + { .name = "readcache_max_io_mb", + .fops = &ldiskfs_osd_readcache_max_io_fops }, + { .name = "writethrough_max_io_mb", + .fops = &ldiskfs_osd_writethrough_max_io_fops }, { NULL } }; diff --git a/lustre/osd-zfs/osd_handler.c b/lustre/osd-zfs/osd_handler.c index 192ce83..acc2b4f 100644 --- a/lustre/osd-zfs/osd_handler.c +++ b/lustre/osd-zfs/osd_handler.c @@ -1247,12 +1247,10 @@ static int osd_mount(const struct lu_env *env, o->od_posix_acl = 1; osd_unlinked_drain(env, o); -err: - if (rc && o->od_os) { - osd_dmu_objset_disown(o->od_os, B_TRUE, o); - o->od_os = NULL; - } + RETURN(0); + +err: RETURN(rc); } @@ -1331,7 +1329,59 @@ out: } static struct lu_device *osd_device_fini(const struct lu_env *env, - struct lu_device *dev); + struct lu_device *d) +{ + struct osd_device *o = osd_dev(d); + int rc; + + ENTRY; + osd_index_backup(env, o, false); + if (o->od_os) { + osd_objset_unregister_callbacks(o); + if (!o->od_dt_dev.dd_rdonly) { + osd_sync(env, lu2dt_dev(d)); + txg_wait_callbacks( + spa_get_dsl(dmu_objset_spa(o->od_os))); + } + } + + /* now with all the callbacks completed we can cleanup the remainings */ + osd_shutdown(env, o); + osd_scrub_cleanup(env, o); + + rc = osd_procfs_fini(o); + if (rc) { + 
CERROR("proc fini error %d\n", rc); + RETURN(ERR_PTR(rc)); + } + + if (o->od_os) + osd_umount(env, o); + + RETURN(NULL); +} + + +static struct lu_device *osd_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct osd_device *o = osd_dev(d); + + ENTRY; + /* XXX: make osd top device in order to release reference */ + if (d->ld_site) { + d->ld_site->ls_top_dev = d; + lu_site_purge(env, d->ld_site, -1); + lu_site_print(env, d->ld_site, &d->ld_site->ls_obj_hash.nelems, + D_ERROR, lu_cdebug_printer); + } + if (o->od_site.ls_bottom_dev) + lu_site_fini(&o->od_site); + dt_device_fini(&o->od_dt_dev); + OBD_FREE_PTR(o); + + RETURN(NULL); +} static struct lu_device *osd_device_alloc(const struct lu_env *env, struct lu_device_type *type, @@ -1359,70 +1409,22 @@ static struct lu_device *osd_device_alloc(const struct lu_env *env, rc = osd_device_init0(env, dev, cfg); if (rc == 0) { rc = osd_mount(env, dev, cfg); - if (rc) + if (rc) { osd_device_fini(env, osd2lu_dev(dev)); - } - if (rc) + osd_device_free(env, osd2lu_dev(dev)); + dev = NULL; + } + } else { dt_device_fini(&dev->od_dt_dev); + } } - if (unlikely(rc != 0)) + if (unlikely(rc != 0) && dev) OBD_FREE_PTR(dev); return rc == 0 ? 
osd2lu_dev(dev) : ERR_PTR(rc); } -static struct lu_device *osd_device_free(const struct lu_env *env, - struct lu_device *d) -{ - struct osd_device *o = osd_dev(d); - ENTRY; - - /* XXX: make osd top device in order to release reference */ - d->ld_site->ls_top_dev = d; - lu_site_purge(env, d->ld_site, -1); - lu_site_print(env, d->ld_site, &d->ld_site->ls_obj_hash.nelems, - D_ERROR, lu_cdebug_printer); - lu_site_fini(&o->od_site); - dt_device_fini(&o->od_dt_dev); - OBD_FREE_PTR(o); - - RETURN (NULL); -} - -static struct lu_device *osd_device_fini(const struct lu_env *env, - struct lu_device *d) -{ - struct osd_device *o = osd_dev(d); - int rc; - ENTRY; - - osd_index_backup(env, o, false); - if (o->od_os) { - osd_objset_unregister_callbacks(o); - if (!o->od_dt_dev.dd_rdonly) { - osd_sync(env, lu2dt_dev(d)); - txg_wait_callbacks( - spa_get_dsl(dmu_objset_spa(o->od_os))); - } - } - - /* now with all the callbacks completed we can cleanup the remainings */ - osd_shutdown(env, o); - osd_scrub_cleanup(env, o); - - rc = osd_procfs_fini(o); - if (rc) { - CERROR("proc fini error %d\n", rc); - RETURN(ERR_PTR(rc)); - } - - if (o->od_os) - osd_umount(env, o); - - RETURN(NULL); -} - static int osd_device_init(const struct lu_env *env, struct lu_device *d, const char *name, struct lu_device *next) { diff --git a/lustre/osd-zfs/osd_internal.h b/lustre/osd-zfs/osd_internal.h index 176ef7b..4e20832 100644 --- a/lustre/osd-zfs/osd_internal.h +++ b/lustre/osd-zfs/osd_internal.h @@ -422,6 +422,8 @@ enum osd_destroy_type { OSD_DESTROY_ASYNC = 2, }; +#define OSD_MAX_DBUFS 2 /* how many dbufs to cache in object */ + struct osd_object { struct dt_object oo_dt; /* @@ -475,6 +477,7 @@ struct osd_object { uint64_t oo_parent; /* used only at object creation */ }; struct lu_object_header *oo_header; + dmu_buf_t *oo_dbs[OSD_MAX_DBUFS]; }; int osd_statfs(const struct lu_env *, struct dt_device *, struct obd_statfs *, @@ -1175,4 +1178,6 @@ osd_index_backup(const struct lu_env *env, struct osd_device 
*osd, bool backup) #define osd_dmu_offset_next(os, obj, hole, res) (EOPNOTSUPP) #endif +extern char osd_0copy_tag[]; + #endif /* _OSD_INTERNAL_H */ diff --git a/lustre/osd-zfs/osd_io.c b/lustre/osd-zfs/osd_io.c index f267367..d206acd 100644 --- a/lustre/osd-zfs/osd_io.c +++ b/lustre/osd-zfs/osd_io.c @@ -60,7 +60,7 @@ #include #include -static char osd_0copy_tag[] = "zerocopy"; +char osd_0copy_tag[] = "zerocopy"; static void dbuf_set_pending_evict(dmu_buf_t *db) { @@ -170,6 +170,7 @@ static ssize_t osd_declare_write(const struct lu_env *env, struct dt_object *dt, { struct osd_object *obj = osd_dt_obj(dt); struct osd_device *osd = osd_obj2dev(obj); + loff_t _pos = pos, max = 0; struct osd_thandle *oh; uint64_t oid; ENTRY; @@ -190,9 +191,44 @@ static ssize_t osd_declare_write(const struct lu_env *env, struct dt_object *dt, /* XXX: we still miss for append declaration support in ZFS * -1 means append which is used by llog mostly, llog * can grow upto LLOG_MIN_CHUNK_SIZE*8 records */ + max = max_t(loff_t, 256 * 8 * LLOG_MIN_CHUNK_SIZE, + obj->oo_attr.la_size + (2 << 20)); if (pos == -1) - pos = max_t(loff_t, 256 * 8 * LLOG_MIN_CHUNK_SIZE, - obj->oo_attr.la_size + (2 << 20)); + pos = max; + if (obj->oo_dn) { + loff_t tstart, tend, end = pos + buf->lb_len; + dmu_tx_hold_t *txh; + + /* try to find a close declared window to fit/extend */ + for (txh = list_head(&oh->ot_tx->tx_holds); txh != NULL; + txh = list_next(&oh->ot_tx->tx_holds, txh)) { + if (obj->oo_dn != txh->txh_dnode) + continue; + if (txh->txh_type != THT_WRITE) + continue; + + /* bytes already declared in this handle */ + tstart = txh->txh_arg1; + tend = txh->txh_arg1 + txh->txh_arg2; + + if (pos < tstart) + tstart = pos; + if (tend < end) + tend = end; + /* if this is an append, then extend it */ + if (_pos == -1 && txh->txh_arg1 == max) + tend += buf->lb_len; + /* don't let too big appends */ + if (tend - tstart > 4*1024*1024) + continue; + if (pos >= tend || end <= tstart) + continue; + + txh->txh_arg1 = 
tstart;
+			txh->txh_arg2 = tend - tstart;
+			RETURN(0);
+		}
+	}
 	osd_tx_hold_write(oh->ot_tx, oid, obj->oo_dn, pos, buf->lb_len);
 
 	/* dt_declare_write() is usually called for system objects, such
@@ -203,9 +239,107 @@ static ssize_t osd_declare_write(const struct lu_env *env, struct dt_object *dt,
 				 0, oh, NULL, OSD_QID_BLK));
 }
 
+/*
+ * Find a held dbuf backing @offset in @obj. A dbuf cached in obj->oo_dbs is
+ * returned when its block id matches; otherwise the result of a fresh
+ * dbuf_hold() is returned, which the caller has to check for NULL.
+ */
+static dmu_buf_t *osd_get_dbuf(struct osd_object *obj, uint64_t offset)
+{
+	dmu_buf_t **dbs = obj->oo_dbs;
+	uint64_t blkid;
+	int i;
+
+	blkid = dbuf_whichblock(obj->oo_dn, 0, offset);
+	for (i = 0; i < OSD_MAX_DBUFS; i++) {
+		dmu_buf_impl_t *dbi = (void *)dbs[i];
+		if (!dbs[i])
+			continue;
+		if (dbi->db_blkid == blkid)
+			return dbs[i];
+	}
+	return (dmu_buf_t *)dbuf_hold(obj->oo_dn, blkid, osd_0copy_tag);
+}
+
+/*
+ * Keep @db in obj->oo_dbs for reuse by later writes, unless it is cached
+ * already. Slots are scanned in order: an empty slot takes @db, a slot whose
+ * dbuf has db_offset > 0 is released and reused. A dbuf at offset 0 is
+ * never evicted.
+ */
+static void osd_put_dbuf(struct osd_object *obj, dmu_buf_t *db)
+{
+	dmu_buf_t **dbs = obj->oo_dbs;
+	int i;
+
+	for (i = 0; i < OSD_MAX_DBUFS; i++) {
+		if (dbs[i] == db)
+			return;
+	}
+	/* take an empty slot or evict a dbuf with offset > 0 */
+	for (i = 0; i < OSD_MAX_DBUFS; i++) {
+		if (dbs[i] == NULL) {
+			dbs[i] = db;
+			return;
+		}
+		if (dbs[i]->db_offset > 0) {
+			/* replace this one */
+			dbuf_rele((dmu_buf_impl_t *)dbs[i], osd_0copy_tag);
+			dbs[i] = db;
+			return;
+		}
+	}
+	LBUG();
+}
+
+/*
+ * Copy @buf into @obj at *@pos one block at a time, going through the dbufs
+ * cached in the object. *@pos is left untouched.
+ *
+ * Returns 0 on success, -EIO if a dbuf can't be held.
+ */
+static ssize_t osd_write_llog_header(struct osd_object *obj,
+				     const struct lu_buf *buf, loff_t *pos,
+				     struct osd_thandle *oh)
+{
+	int bufoff, tocpy;
+	int len = buf->lb_len;
+	loff_t offset = *pos;
+	char *data = buf->lb_buf;
+
+	while (len > 0) {
+		dmu_buf_t *db = osd_get_dbuf(obj, offset);
+
+		/* dbuf_hold() failed, there is nothing to copy into */
+		if (unlikely(db == NULL))
+			return -EIO;
+
+		bufoff = offset - db->db_offset;
+		tocpy = MIN(db->db_size - bufoff, len);
+		if (tocpy == db->db_size)
+			dmu_buf_will_fill(db, oh->ot_tx);
+		else
+			dmu_buf_will_dirty(db, oh->ot_tx);
+		LASSERT(offset >= db->db_offset);
+		LASSERT(offset + tocpy <= db->db_offset + db->db_size);
+		(void) memcpy((char *)db->db_data + bufoff, data, tocpy);
+
+		if (tocpy == db->db_size)
+			dmu_buf_fill_done(db, oh->ot_tx);
+
+		offset += tocpy;
+		data += tocpy;
+		len -= tocpy;
+
+		osd_put_dbuf(obj, db);
+	}
+
+	return 0;
+}
+
 static ssize_t osd_write(const struct lu_env *env, struct dt_object *dt,
 			const struct lu_buf *buf, loff_t *pos,
-			struct thandle *th)
+			 struct thandle *th)
 {
 	struct osd_object *obj = osd_dt_obj(dt);
 	struct osd_device *osd = osd_obj2dev(obj);
@@ -225,8 +359,14 @@ static ssize_t osd_write(const struct lu_env *env, struct dt_object *dt,
 	if (obj->oo_destroyed)
 		GOTO(out, rc = -ENOENT);
 
-	osd_dmu_write(osd, obj->oo_dn, offset, (uint64_t)buf->lb_len,
-		      buf->lb_buf, oh->ot_tx);
+	if (fid_is_llog(lu_object_fid(&dt->do_lu))) {
+		rc = osd_write_llog_header(obj, buf, pos, oh);
+		if (rc < 0)
+			GOTO(out, rc);
+	} else {
+		osd_dmu_write(osd, obj->oo_dn, offset, (uint64_t)buf->lb_len,
+			      buf->lb_buf, oh->ot_tx);
+	}
 	write_lock(&obj->oo_attr_lock);
 	if (obj->oo_attr.la_size < offset + buf->lb_len) {
 		obj->oo_attr.la_size = offset + buf->lb_len;
diff --git a/lustre/osd-zfs/osd_lproc.c b/lustre/osd-zfs/osd_lproc.c
index 45936e8..9fa7a7a 100644
--- a/lustre/osd-zfs/osd_lproc.c
+++ b/lustre/osd-zfs/osd_lproc.c
@@ -51,39 +51,39 @@ static int osd_stats_init(struct osd_device *osd)
 	osd->od_stats = lprocfs_alloc_stats(LPROC_OSD_LAST, 0);
 	if (osd->od_stats) {
 		lprocfs_counter_init(osd->od_stats, LPROC_OSD_GET_PAGE,
-				LPROCFS_CNTR_AVGMINMAX|LPROCFS_CNTR_STDDEV,
-				"get_page", "usec");
+				LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV |
+				LPROCFS_TYPE_USECS, "get_page");
 		lprocfs_counter_init(osd->od_stats, LPROC_OSD_NO_PAGE,
-				LPROCFS_CNTR_AVGMINMAX,
-				"get_page_failures", "num");
+				LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_REQS,
+				"get_page_failures");
 		lprocfs_counter_init(osd->od_stats, LPROC_OSD_CACHE_ACCESS,
-				LPROCFS_CNTR_AVGMINMAX,
-				"cache_access", "pages");
+				LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_PAGES,
+				"cache_access");
 		lprocfs_counter_init(osd->od_stats, LPROC_OSD_CACHE_HIT,
-				LPROCFS_CNTR_AVGMINMAX,
-				"cache_hit", "pages");
+				LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_PAGES,
+				"cache_hit");
 		lprocfs_counter_init(osd->od_stats, LPROC_OSD_CACHE_MISS,
-				LPROCFS_CNTR_AVGMINMAX,
-				"cache_miss", "pages");
+
LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_PAGES, + "cache_miss"); lprocfs_counter_init(osd->od_stats, LPROC_OSD_COPY_IO, - LPROCFS_CNTR_AVGMINMAX, - "copy", "pages"); + LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_PAGES, + "copy"); lprocfs_counter_init(osd->od_stats, LPROC_OSD_ZEROCOPY_IO, - LPROCFS_CNTR_AVGMINMAX, - "zerocopy", "pages"); + LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_PAGES, + "zerocopy"); lprocfs_counter_init(osd->od_stats, LPROC_OSD_TAIL_IO, - LPROCFS_CNTR_AVGMINMAX, - "tail", "pages"); + LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_PAGES, + "tail"); #ifdef OSD_THANDLE_STATS lprocfs_counter_init(osd->od_stats, LPROC_OSD_THANDLE_STARTING, - LPROCFS_CNTR_AVGMINMAX, - "thandle_starting", "usec"); + LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_USECS, + "thandle_starting"); lprocfs_counter_init(osd->od_stats, LPROC_OSD_THANDLE_OPEN, - LPROCFS_CNTR_AVGMINMAX, - "thandle_open", "usec"); + LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_USECS, + "thandle_open"); lprocfs_counter_init(osd->od_stats, LPROC_OSD_THANDLE_CLOSING, - LPROCFS_CNTR_AVGMINMAX, - "thandle_closing", "usec"); + LPROCFS_CNTR_AVGMINMAX | LPROCFS_TYPE_USECS, + "thandle_closing"); #endif result = 0; } diff --git a/lustre/osd-zfs/osd_object.c b/lustre/osd-zfs/osd_object.c index 4a04370..56c31c9 100644 --- a/lustre/osd-zfs/osd_object.c +++ b/lustre/osd-zfs/osd_object.c @@ -310,6 +310,7 @@ struct lu_object *osd_object_alloc(const struct lu_env *env, struct lu_object *l; struct lu_object_header *h; struct osd_device *o = osd_dev(d); + int i; l = &mo->oo_dt.do_lu; if (unlikely(o->od_in_init)) { @@ -336,6 +337,8 @@ struct lu_object *osd_object_alloc(const struct lu_env *env, init_rwsem(&mo->oo_guard); rwlock_init(&mo->oo_attr_lock); mo->oo_destroy = OSD_DESTROY_NONE; + for (i = 0; i < OSD_MAX_DBUFS; i++) + mo->oo_dbs[i] = NULL; return l; } else { return NULL; @@ -902,6 +905,13 @@ static void osd_object_delete(const struct lu_env *env, struct lu_object *l) { struct osd_object *obj = osd_obj(l); const struct lu_fid *fid = 
lu_object_fid(l); + dmu_buf_t **dbs = obj->oo_dbs; + int i; + + for (i = 0; i < OSD_MAX_DBUFS; i++) { + if (dbs[i]) + dbuf_rele((dmu_buf_impl_t *)dbs[i], osd_0copy_tag); + } if (obj->oo_dn) { if (likely(!fid_is_acct(fid))) { diff --git a/lustre/ptlrpc/lproc_ptlrpc.c b/lustre/ptlrpc/lproc_ptlrpc.c index 0e0b170..40b733f 100644 --- a/lustre/ptlrpc/lproc_ptlrpc.c +++ b/lustre/ptlrpc/lproc_ptlrpc.c @@ -207,9 +207,9 @@ ptlrpc_ldebugfs_register(struct dentry *root, char *dir, char *name, { struct dentry *svc_debugfs_entry; struct lprocfs_stats *svc_stats; + enum lprocfs_counter_config config = LPROCFS_CNTR_AVGMINMAX | + LPROCFS_CNTR_STDDEV; int i; - unsigned int svc_counter_config = LPROCFS_CNTR_AVGMINMAX | - LPROCFS_CNTR_STDDEV; LASSERT(!*debugfs_root_ret); LASSERT(!*stats_ret); @@ -225,37 +225,33 @@ ptlrpc_ldebugfs_register(struct dentry *root, char *dir, char *name, svc_debugfs_entry = root; lprocfs_counter_init(svc_stats, PTLRPC_REQWAIT_CNTR, - svc_counter_config, "req_waittime", "usec"); + config | LPROCFS_TYPE_USECS, "req_waittime"); lprocfs_counter_init(svc_stats, PTLRPC_REQQDEPTH_CNTR, - svc_counter_config, "req_qdepth", "reqs"); + config | LPROCFS_TYPE_REQS, "req_qdepth"); lprocfs_counter_init(svc_stats, PTLRPC_REQACTIVE_CNTR, - svc_counter_config, "req_active", "reqs"); + config | LPROCFS_TYPE_REQS, "req_active"); lprocfs_counter_init(svc_stats, PTLRPC_TIMEOUT, - svc_counter_config, "req_timeout", "sec"); - lprocfs_counter_init(svc_stats, PTLRPC_REQBUF_AVAIL_CNTR, - svc_counter_config, "reqbuf_avail", "bufs"); + config | LPROCFS_TYPE_SECS, "req_timeout"); + lprocfs_counter_init_units(svc_stats, PTLRPC_REQBUF_AVAIL_CNTR, + config, "reqbuf_avail", "bufs"); for (i = 0; i < EXTRA_LAST_OPC; i++) { - char *units; + enum lprocfs_counter_config extra_type = LPROCFS_TYPE_REQS; switch (i) { case BRW_WRITE_BYTES: case BRW_READ_BYTES: - units = "bytes"; - break; - default: - units = "reqs"; + extra_type = LPROCFS_TYPE_BYTES; break; } lprocfs_counter_init(svc_stats, 
PTLRPC_LAST_CNTR + i, - svc_counter_config, - ll_eopcode2str(i), units); + config | extra_type, ll_eopcode2str(i)); } for (i = 0; i < LUSTRE_MAX_OPCODES; i++) { __u32 opcode = ll_rpc_opcode_table[i].opcode; - lprocfs_counter_init(svc_stats, - EXTRA_MAX_OPCODES + i, svc_counter_config, - ll_opcode2str(opcode), "usec"); + lprocfs_counter_init(svc_stats, EXTRA_MAX_OPCODES + i, + config | LPROCFS_TYPE_USECS, + ll_opcode2str(opcode)); } debugfs_create_file(name, 0644, svc_debugfs_entry, svc_stats, diff --git a/lustre/scripts/statechange-lustre.sh b/lustre/scripts/statechange-lustre.sh index b06e490..f7b08dd 100755 --- a/lustre/scripts/statechange-lustre.sh +++ b/lustre/scripts/statechange-lustre.sh @@ -53,14 +53,17 @@ sync_degrade_state() local dataset="$1" local state="$2" local service=$($ZFS list -H -o lustre:svname ${dataset}) - - zed_log_msg "Lustre:sync_degrade_state pool:${dataset} degraded:${state}" + local autodegrade=$($ZFS get -rH -s local -t filesystem -o value \ + lustre:autodegrade ${dataset}) if [ -n "${service}" ] && [ "${service}" != "-" ] ; then local current=$($LCTL get_param -n obdfilter.${service}.degraded) - if [ "${current}" != "${state}" ] ; then + if [ "${current}" != "${state}" ] && + [ "${autodegrade}" == "on" ] ; then $LCTL set_param obdfilter.${service}.degraded=${state} + ds_state="pool:${dataset} degraded:${state}" + zed_log_msg "Lustre:sync_degrade_state $ds_state" fi fi } diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index e304f76..beb9f0d 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -3119,9 +3119,9 @@ test_144a() { kill -9 $pid >/dev/null 2>&1 done - before=$(date +%s) + before=$SECONDS fail mds1 - after=$(date +%s) + after=$SECONDS # here we measure MDT stop + MDT start time. For error case MDT stop takes # about obd_timeout-60 (240) seconds. Without error - less than 30s. 
# MDT start takes different time depends on a configuration, let's check diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 3279334..9f968b6 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -1728,6 +1728,39 @@ test_27cf() { } run_test 27cf "'setstripe -o' on inactive OSTs should return error" +test_27cg() { + [[ $($LCTL get_param mdc.*.import) =~ connect_flags.*overstriping ]] || + skip "server does not support overstriping" + [[ $mds1_FSTYPE != "ldiskfs" ]] && skip_env "ldiskfs only test" + large_xattr_enabled || skip_env "ea_inode feature disabled" + + local osts="0" + + for ((i=1;i<1000;i++)); do + osts+=",$((i % OSTCOUNT))" + done + + local mdts=$(comma_list $(mdts_nodes)) + local before=$(do_nodes $mdts \ + "$LCTL get_param -n osd-ldiskfs.*MDT*.stats" | + awk '/many credits/{print $3}' | + calc_sum) + + $LFS setstripe -o $osts $DIR/$tfile || error "setstripe failed" + $LFS getstripe $DIR/$tfile | grep stripe + + rm -f $DIR/$tfile || error "can't unlink" + + after=$(do_nodes $mdts \ + "$LCTL get_param -n osd-ldiskfs.*MDT*.stats" | + awk '/many credits/{print $3}' | + calc_sum) + + (( before == after )) || + error "too many credits happened: $after > $before" +} +run_test 27cg "1000 shouldn't cause too many credits" + test_27d() { test_mkdir $DIR/$tdir $LFS setstripe -c 0 -i -1 -S 0 $DIR/$tdir/$tfile || diff --git a/lustre/utils/mkfs_lustre.c b/lustre/utils/mkfs_lustre.c index 8caf62d..b37f519 100644 --- a/lustre/utils/mkfs_lustre.c +++ b/lustre/utils/mkfs_lustre.c @@ -765,6 +765,13 @@ int parse_opts(int argc, char *const argv[], struct mkfs_opts *mop, } } + if (ldd->ldd_mount_type == LDD_MT_ZFS && + (ldd->ldd_flags & LDD_F_SV_TYPE_OST)) { + rc = add_param(ldd->ldd_params, PARAM_AUTODEGRADE, "on"); + if (rc) + return rc; + } + if (strlen(new_fsname) > 0) { if (!(mop->mo_flags & (MO_FORCEFORMAT | MO_RENAME)) && (!(ldd->ldd_flags & (LDD_F_VIRGIN | LDD_F_WRITECONF)))) { -- 1.8.3.1