From: nathan Date: Thu, 11 Oct 2007 19:52:12 +0000 (+0000) Subject: b=10676 X-Git-Tag: v1_7_0_51~634 X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=commitdiff_plain;h=c5a986fbb74a2d4d2a89861d7aa2ae0deebee775 b=10676 i=adilger i=wangdi enhanced obd_fail functionality --- diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 8452374..ca70d84 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -29,6 +29,7 @@ extern atomic_t obd_memory; extern int obd_memmax; extern unsigned int obd_fail_loc; +extern unsigned int obd_fail_val; extern unsigned int obd_debug_peer_on_timeout; extern unsigned int obd_dump_on_timeout; extern unsigned int obd_dump_on_eviction; @@ -225,39 +226,87 @@ extern int obd_race_state; #define OBD_FAIL_SEC_CTX_INIT_CONT_NET 0x1202 #define OBD_FAIL_SEC_CTX_FINI_NET 0x1203 -/* preparation for a more advanced failure testbed (not functional yet) */ +/* Failure injection control */ #define OBD_FAIL_MASK_SYS 0x0000FF00 -#define OBD_FAIL_MASK_LOC (0x000000FF | OBD_FAIL_MASK_SYS) +#define OBD_FAIL_MASK_LOC (0x000000FF | OBD_FAIL_MASK_SYS) #define OBD_FAIL_ONCE 0x80000000 #define OBD_FAILED 0x40000000 -#define OBD_FAIL_CHECK(id) (((obd_fail_loc & OBD_FAIL_MASK_LOC) == \ - ((id) & OBD_FAIL_MASK_LOC)) && \ - ((obd_fail_loc & (OBD_FAILED | OBD_FAIL_ONCE))!= \ - (OBD_FAILED | OBD_FAIL_ONCE))) +/* The following flags aren't made to be combined */ +#define OBD_FAIL_SKIP 0x20000000 /* skip N then fail */ +#define OBD_FAIL_SOME 0x10000000 /* fail N times */ +#define OBD_FAIL_RAND 0x08000000 /* fail 1/N of the time */ +#define OBD_FAIL_USR1 0x04000000 /* user flag */ -#define OBD_FAIL_CHECK_ONCE(id) \ -({ int _ret_ = 0; \ - if (unlikely(OBD_FAIL_CHECK(id))) { \ - CERROR("*** obd_fail_loc=0x%x ***\n", id); \ - obd_fail_loc |= OBD_FAILED; \ - if ((id) & OBD_FAIL_ONCE) \ - obd_fail_loc |= OBD_FAIL_ONCE; \ - _ret_ = 1; \ +static inline int obd_fail_check(__u32 id) +{ + static int count = 0; + if 
(likely((obd_fail_loc & OBD_FAIL_MASK_LOC) != + (id & OBD_FAIL_MASK_LOC))) + return 0; + + if ((obd_fail_loc & (OBD_FAILED | OBD_FAIL_ONCE)) == + (OBD_FAILED | OBD_FAIL_ONCE)) { + count = 0; /* paranoia */ + return 0; + } + + if (obd_fail_loc & OBD_FAIL_RAND) { + unsigned int ll_rand(void); + if (obd_fail_val < 2) + return 0; + if (ll_rand() % obd_fail_val > 0) + return 0; + } + + if (obd_fail_loc & OBD_FAIL_SKIP) { + count++; + if (count < obd_fail_val) + return 0; + count = 0; + } + + /* Overridden by FAIL_ONCE */ + if (obd_fail_loc & OBD_FAIL_SOME) { + count++; + if (count >= obd_fail_val) { + count = 0; + /* Don't fail anymore */ + obd_fail_loc |= OBD_FAIL_ONCE; + } + } + + obd_fail_loc |= OBD_FAILED; + /* Handle old checks that OR in this */ + if (id & OBD_FAIL_ONCE) + obd_fail_loc |= OBD_FAIL_ONCE; + + return 1; +} + +#define OBD_FAIL_CHECK(id) \ +({ \ + int _ret_ = 0; \ + if (unlikely(obd_fail_loc && (_ret_ = obd_fail_check(id)))) { \ + CERROR("*** obd_fail_loc=%x ***\n", id); \ } \ _ret_; \ }) +/* deprecated - just use OBD_FAIL_CHECK */ +#define OBD_FAIL_CHECK_ONCE OBD_FAIL_CHECK + #define OBD_FAIL_RETURN(id, ret) \ do { \ - if (unlikely(OBD_FAIL_CHECK_ONCE(id))) { \ + if (unlikely(obd_fail_loc && obd_fail_check(id))) { \ + CERROR("*** obd_fail_return=%x rc=%d ***\n", id, ret); \ RETURN(ret); \ } \ } while(0) #define OBD_FAIL_TIMEOUT(id, secs) \ -do { \ - if (unlikely(OBD_FAIL_CHECK_ONCE(id))) { \ +({ int _ret_ = 0; \ + if (unlikely(obd_fail_loc && (_ret_ = obd_fail_check(id)))) { \ CERROR("obd_fail_timeout id %x sleeping for %d secs\n", \ (id), (secs)); \ set_current_state(TASK_UNINTERRUPTIBLE); \ @@ -265,8 +314,23 @@ do { \ cfs_time_seconds(secs)); \ set_current_state(TASK_RUNNING); \ CERROR("obd_fail_timeout id %x awake\n", (id)); \ - } \ -} while(0) + } \ + _ret_; \ +}) + +#define OBD_FAIL_TIMEOUT_MS(id, ms) \ +({ int _ret_ = 0; \ + if (unlikely(obd_fail_loc && (_ret_ = obd_fail_check(id)))) { \ + CERROR("obd_fail_timeout id %x sleeping for %d ms\n", \ 
+ (id), (ms)); \ + set_current_state(TASK_UNINTERRUPTIBLE); \ + cfs_schedule_timeout(CFS_TASK_UNINT, \ + cfs_time_seconds(ms)/1000); \ + set_current_state(TASK_RUNNING); \ + CERROR("obd_fail_timeout id %x awake\n", (id)); \ + } \ + _ret_; \ +}) #ifdef __KERNEL__ /* The idea here is to synchronise two threads to force a race. The @@ -275,7 +339,7 @@ do { \ * the first and continues. */ #define OBD_RACE(id) \ do { \ - if (unlikely(OBD_FAIL_CHECK_ONCE(id))) { \ + if (unlikely(obd_fail_loc && obd_fail_check(id))) { \ obd_race_state = 0; \ CERROR("obd_race id %x sleeping\n", (id)); \ OBD_SLEEP_ON(obd_race_waitq, obd_race_state != 0); \ diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c index 54fd4ab..6ea17ac 100644 --- a/lustre/mdt/mdt_handler.c +++ b/lustre/mdt/mdt_handler.c @@ -2024,7 +2024,7 @@ static int mdt_req_handle(struct mdt_thread_info *info, LASSERT(current->journal_info == NULL); /* - * Do not use *_FAIL_CHECK_ONCE() macros, because they will stop + * Mask out OBD_FAIL_ONCE, because that will stop * correct handling of failed req later in ldlm due to doing * obd_fail_loc |= OBD_FAIL_ONCE | OBD_FAILED without actually * correct actions like it is done in target_send_reply_msg(). @@ -2034,7 +2034,7 @@ static int mdt_req_handle(struct mdt_thread_info *info, * Set to info->mti_fail_id to handler fail_id, it will be used * later, and better than use default fail_id. */ - if (OBD_FAIL_CHECK(h->mh_fail_id)) { + if (OBD_FAIL_CHECK(h->mh_fail_id & OBD_FAIL_MASK_LOC)) { info->mti_fail_id = h->mh_fail_id; RETURN(0); } diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index a429e15..84753fb 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -58,6 +58,7 @@ int obd_memmax; /* The following are visible and mutable through /proc/sys/lustre/. 
*/ unsigned int obd_fail_loc; +unsigned int obd_fail_val; unsigned int obd_debug_peer_on_timeout; unsigned int obd_dump_on_timeout; unsigned int obd_dump_on_eviction; @@ -376,6 +377,7 @@ void *obd_psdev = NULL; EXPORT_SYMBOL(obd_devs); EXPORT_SYMBOL(obd_fail_loc); +EXPORT_SYMBOL(obd_fail_val); EXPORT_SYMBOL(obd_print_fail_loc); EXPORT_SYMBOL(obd_race_waitq); EXPORT_SYMBOL(obd_race_state); diff --git a/lustre/obdclass/linux/linux-sysctl.c b/lustre/obdclass/linux/linux-sysctl.c index 4fd277c..ba3d186 100644 --- a/lustre/obdclass/linux/linux-sysctl.c +++ b/lustre/obdclass/linux/linux-sysctl.c @@ -49,6 +49,7 @@ cfs_sysctl_table_header_t *obd_table_header = NULL; enum { OBD_FAIL_LOC = 1, /* control test failures instrumentation */ + OBD_FAIL_VAL, /* userdata for fail loc */ OBD_TIMEOUT, /* RPC timeout before recovery/intr */ OBD_DUMP_ON_TIMEOUT, /* dump kernel debug log upon eviction */ OBD_MEMUSED, /* bytes currently OBD_ALLOCated */ @@ -89,6 +90,14 @@ static cfs_sysctl_table_t obd_table[] = { .proc_handler = &proc_fail_loc }, { + .ctl_name = OBD_FAIL_VAL, + .procname = "fail_val", + .data = &obd_fail_val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { .ctl_name = OBD_TIMEOUT, .procname = "timeout", .data = &obd_timeout, diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index c469ef0..3a244b0 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -1636,8 +1636,7 @@ static int filter_intent_policy(struct ldlm_namespace *ns, * list (and potentially being added to l_pending_list by an * AST) when we are going to drop this lock ASAP. 
*/ if (lock->l_export->exp_libclient || - OBD_FAIL_CHECK(OBD_FAIL_LDLM_GLIMPSE)) { - OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_GLIMPSE, 2); + OBD_FAIL_TIMEOUT(OBD_FAIL_LDLM_GLIMPSE, 2)) { ldlm_resource_unlink_lock(lock); err = ELDLM_LOCK_ABORTED; } else { diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 481022c..7f3a1f6 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -227,6 +227,7 @@ unload_modules() { echo "$LEAK_PORTALS" 1>&2 mv $TMP/debug $TMP/debug-leak.`date +%s` || true echo "Memory leaks detected" + [ -n "$IGNORE_LEAK" ] && echo "ignoring leaks" && return 0 return 254 fi echo "modules unloaded." diff --git a/lustre/utils/mount_lustre.c b/lustre/utils/mount_lustre.c index 74d2910..0589431 100644 --- a/lustre/utils/mount_lustre.c +++ b/lustre/utils/mount_lustre.c @@ -37,6 +37,7 @@ #include "obdctl.h" #include #include +#include #define MAX_HW_SECTORS_KB_PATH "queue/max_hw_sectors_kb" #define MAX_SECTORS_KB_PATH "queue/max_sectors_kb" @@ -186,6 +187,7 @@ static const struct opt_map opt_map[] = { { "nosuid", 0, MS_NOSUID }, /* don't honor suid executables */ { "dev", 1, MS_NODEV }, /* interpret device files */ { "nodev", 0, MS_NODEV }, /* don't interpret devices */ + { "sync", 0, MS_SYNCHRONOUS}, /* synchronous I/O */ { "async", 1, MS_SYNCHRONOUS}, /* asynchronous I/O */ { "auto", 0, 0 }, /* Can be mounted using -a */ { "noauto", 0, 0 }, /* Can only be mounted explicitly */ @@ -328,8 +330,9 @@ int set_tunables(char *source, int src_len) rc = stat(dev, &stat_buf); if (rc) { - fprintf(stderr, "warning: %s, device %s stat failed\n", - strerror(errno), dev); + if (verbose) + fprintf(stderr, "warning: %s, device %s stat failed\n", + strerror(errno), dev); return rc; } @@ -337,8 +340,9 @@ int set_tunables(char *source, int src_len) minor = minor(stat_buf.st_rdev); rc = glob("/sys/block/*", GLOB_NOSORT, NULL, &glob_info); if (rc) { - fprintf(stderr, "warning: failed to read entries under " - "/sys/block\n"); + 
if (verbose) + fprintf(stderr, "warning: failed to read entries under " + "/sys/block\n"); return rc; }