Whamcloud - gitweb
b=2898
author rread <rread>
Sat, 13 Mar 2004 02:13:23 +0000 (02:13 +0000)
committer rread <rread>
Sat, 13 Mar 2004 02:13:23 +0000 (02:13 +0000)
Protect conn_cnt check and update with exp_lock in
target_handle_connect.

Add a test to replay-single.sh, and add an OBD_FAIL-style macro,
OBD_RACE, that creates a race condition at a specific location.

lustre/include/linux/obd_support.h
lustre/ldlm/ldlm_lib.c
lustre/obdclass/class_obd.c
lustre/obdclass/lprocfs_status.c
lustre/obdclass/sysctl.c
lustre/tests/replay-single.sh

index 41fb301..c39cb6f 100644 (file)
@@ -39,6 +39,7 @@ extern unsigned int obd_fail_loc;
 extern unsigned int obd_timeout;
 extern char obd_lustre_upcall[128];
 extern unsigned int obd_sync_filter;
+extern wait_queue_head_t obd_race_waitq;
 
 #define OBD_FAIL_MDS                     0x100
 #define OBD_FAIL_MDS_HANDLE_UNPACK       0x101
@@ -129,6 +130,7 @@ extern unsigned int obd_sync_filter;
 #define OBD_FAIL_OBD_LOGD_NET            0x602
 
 #define OBD_FAIL_TGT_REPLY_NET           0x700
+#define OBD_FAIL_TGT_CONN_RACE           0x701
 
 /* preparation for a more advanced failure testbed (not functional yet) */
 #define OBD_FAIL_MASK_SYS    0x0000FF00
@@ -174,6 +176,22 @@ do {                                                                         \
        }                                                                     \
 } while(0)
 
+/* The idea here is to synchronise two threads to force a race. The
+ * first thread that calls this with a matching fail_loc is put to
+ * sleep. The next thread that calls with the same fail_loc wakes up
+ * the first and continues. */
+#define OBD_RACE(id)                                            \
+do {                                                            \
+        if  (OBD_FAIL_CHECK_ONCE(id)) {                         \
+                CERROR("obd_race id %x sleeping\n", (id));      \
+                sleep_on(&obd_race_waitq);                      \
+                CERROR("obd_fail_race id %x awake\n", (id));    \
+        } else if ((obd_fail_loc & OBD_FAIL_MASK_LOC) ==        \
+                    ((id) & OBD_FAIL_MASK_LOC)) {               \
+                wake_up(&obd_race_waitq);                       \
+        }                                                       \
+} while(0)
+
 #define fixme() CDEBUG(D_OTHER, "FIXME\n");
 
 #ifdef __KERNEL__
index bcaed00..0d514db 100644 (file)
@@ -372,8 +372,11 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
         struct list_head *p;
         char *str, *tmp;
         int rc = 0, abort_recovery;
+        unsigned long flags;
         ENTRY;
 
+        OBD_RACE(OBD_FAIL_TGT_CONN_RACE); 
+
         LASSERT_REQSWAB (req, 0);
         str = lustre_msg_string(req->rq_reqmsg, 0, sizeof(tgtuuid) - 1);
         if (str == NULL) {
@@ -386,7 +389,7 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
         if (!target) {
                 target = class_name2obd(str);
         }
-
+        
         if (!target || target->obd_stopping || !target->obd_set_up) {
                 CERROR("UUID '%s' is not available for connect\n", str);
                 GOTO(out, rc = -ENODEV);
@@ -498,6 +501,17 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
         export = req->rq_export = class_conn2export(&conn);
         LASSERT(export != NULL);
 
+        spin_lock_irqsave(&export->exp_lock, flags);
+        if (export->exp_conn_cnt >= req->rq_reqmsg->conn_cnt) {
+                CERROR("%s: already connected at a higher conn_cnt: %d > %d\n",
+                       cluuid.uuid, export->exp_conn_cnt, 
+                       req->rq_reqmsg->conn_cnt);
+                spin_unlock_irqrestore(&export->exp_lock, flags);
+                GOTO(out, rc = -EALREADY);
+        }
+        export->exp_conn_cnt = req->rq_reqmsg->conn_cnt;
+        spin_unlock_irqrestore(&export->exp_lock, flags);
+
         /* request from liblustre? */
         if (lustre_msg_get_op_flags(req->rq_reqmsg) & MSG_CONNECT_LIBCLIENT)
                 export->exp_libclient = 1;
@@ -507,9 +521,6 @@ int target_handle_connect(struct ptlrpc_request *req, svc_handler_t handler)
         export->exp_connection = ptlrpc_get_connection(&req->rq_peer,
                                                        &remote_uuid);
 
-        LASSERT(export->exp_conn_cnt < req->rq_reqmsg->conn_cnt);
-        export->exp_conn_cnt = req->rq_reqmsg->conn_cnt;
-
         if (rc == EALREADY) {
                 /* We indicate the reconnection in a flag, not an error code. */
                 lustre_msg_add_op_flags(req->rq_repmsg, MSG_CONNECT_RECONNECT);
index 5e2c305..e3351a6 100644 (file)
@@ -88,6 +88,8 @@ unsigned int obd_timeout = 100;
 char obd_lustre_upcall[128] = "DEFAULT"; /* or NONE or /full/path/to/upcall  */
 unsigned int obd_sync_filter; /* = 0, don't sync by default */
 
+DECLARE_WAIT_QUEUE_HEAD(obd_race_waitq);
+
 #ifdef __KERNEL__
 /*  opening /dev/obd */
 static int obd_class_open(struct inode * inode, struct file * file)
@@ -375,6 +377,7 @@ void *obd_psdev = NULL;
 EXPORT_SYMBOL(obd_dev);
 EXPORT_SYMBOL(obdo_cachep);
 EXPORT_SYMBOL(obd_fail_loc);
+EXPORT_SYMBOL(obd_race_waitq);
 EXPORT_SYMBOL(obd_timeout);
 EXPORT_SYMBOL(obd_lustre_upcall);
 EXPORT_SYMBOL(obd_sync_filter);
index 119ca99..a90a6e1 100644 (file)
@@ -138,7 +138,7 @@ void lprocfs_remove(struct proc_dir_entry *root)
         LASSERT(root != NULL);
         parent = root->parent;
         LASSERT(parent != NULL);
-
         while (1) {
                 while (temp->subdir != NULL)
                         temp = temp->subdir;
index f474985..8c93a48 100644 (file)
@@ -54,11 +54,14 @@ enum {
         OBD_SYNCFILTER,         /* XXX temporary, as we play with sync osts.. */
 };
 
+int proc_fail_loc(ctl_table *table, int write, struct file *filp,
+                  void *buffer, size_t *lenp);
+
 static ctl_table obd_table[] = {
         {OBD_FAIL_LOC, "fail_loc", &obd_fail_loc, sizeof(int), 0644, NULL,
                 &proc_dointvec},
         {OBD_TIMEOUT, "timeout", &obd_timeout, sizeof(int), 0644, NULL,
-                &proc_dointvec},
+                &proc_fail_loc},
         /* XXX need to lock so we avoid update races with recovery upcall! */
         {OBD_UPCALL, "upcall", obd_lustre_upcall, 128, 0644, NULL,
                 &proc_dostring, &sysctl_string },
@@ -88,3 +91,15 @@ void obd_sysctl_clean (void)
         obd_table_header = NULL;
 #endif
 }
+
+/* Run proc_dointvec, then wake obd_race_waitq if obd_fail_loc changed. */
+int proc_fail_loc(ctl_table *table, int write, struct file *filp,
+                  void *buffer, size_t *lenp)
+{
+        int prev_fail_loc = obd_fail_loc;
+        int rc = proc_dointvec(table, write, filp, buffer, lenp);
+
+        if (obd_fail_loc != prev_fail_loc)
+                wake_up(&obd_race_waitq);
+        return rc;
+}
index ef241b2..8b1c6e3 100755 (executable)
@@ -854,6 +854,16 @@ test_43() {
 }
 run_test 43 "mds osc import failure during recovery; don't LBUG"
 
+test_44() {
+    mdcdev=`awk '/mds_svc_MNT/ {print $1}' < /proc/fs/lustre/devices`
+    do_facet mds "sysctl -w lustre.fail_loc=0x80000701"
+    $LCTL --device $mdcdev recover
+    df $MOUNT
+    do_facet mds "sysctl -w lustre.fail_loc=0"
+    return 0
+}
+run_test 44 "race in target handle connect"
+
 equals_msg test complete, cleaning up
 $CLEANUP