Whamcloud - gitweb
* Fix interrupt-pending-when-timeout-occurs handling in l_wait_event.
author shaver <shaver>
Fri, 16 Aug 2002 01:51:03 +0000 (01:51 +0000)
committer shaver <shaver>
Fri, 16 Aug 2002 01:51:03 +0000 (01:51 +0000)
* If a timeout is specified, but no handler, wake up with -ETIMEDOUT instead of
  going back to sleep.
* Export a class_signal_client_failure hook-symbol from obdclass, to be filled
  in by recovd.o and used by various obdclass bits (avoiding sour dependencies
  on recovd.o).
* Add OBD_FAIL_OST_BRW_{READ,WRITE}_BULK fail_loc values, for testing of
  bulk-xfer timeouts and interrupts.
* Fix the timeout in ll_sync_io_cb to scale by HZ.
* Rip out some leftovers from ptlrpc_check_reply.

lustre/include/linux/lustre_lib.h
lustre/include/linux/obd_class.h
lustre/include/linux/obd_support.h
lustre/lib/page.c
lustre/obdclass/class_obd.c
lustre/ost/ost_handler.c
lustre/ptlrpc/client.c
lustre/ptlrpc/recovd.c

index 2cdaddb..c1db457 100644 (file)
@@ -454,36 +454,49 @@ do {
         add_wait_queue(&wq, &__wait);                                           \
         __state = TASK_UNINTERRUPTIBLE;                                         \
         for (;;) {                                                              \
-                set_current_state(__state);                                     \
-                if (condition)                                                  \
+            set_current_state(__state);                                         \
+            if (condition)                                                      \
+                    break;                                                      \
+            /* We only become INTERRUPTIBLE if a timeout has fired, and         \
+             * the caller has given us some signals to care about.              \
+             *                                                                  \
+             * XXXshaver we should check against info->wli_signals here,        \
+             * XXXshaver instead of just using l_killable_pending, perhaps.     \
+             */                                                                 \
+            if (__state == TASK_INTERRUPTIBLE &&                                \
+                l_killable_pending(current)) {                                  \
+                    CERROR("lwe: interrupt for %d\n", current->pid);            \
+                    if (info->lwi_on_signal)                                    \
+                            info->lwi_on_signal(info->lwi_cb_data);             \
+                    ret = -EINTR;                                               \
+                    break;                                                      \
+            }                                                                   \
+            if (info->lwi_timeout) {                                            \
+                if (schedule_timeout(info->lwi_timeout) == 0) {                 \
+                    CERROR("lwe: timeout for %d\n", current->pid);              \
+                    if (!info->lwi_on_timeout ||                                \
+                        info->lwi_on_timeout(info->lwi_cb_data)) {              \
+                        ret = -ETIMEDOUT;                                       \
                         break;                                                  \
-                /* We only become INTERRUPTIBLE if a timeout has fired, and     \
-                 * the caller has given us some signals to care about.          \
-                 *                                                              \
-                 * XXXshaver we should check against info->wli_signals here,    \
-                 * XXXshaver instead of just using l_killable_pending, perhaps. \
-                 */                                                             \
-                if (__state == TASK_INTERRUPTIBLE &&                            \
-                    l_killable_pending(current)) {                              \
-                        if (info->lwi_on_signal)                                \
-                                info->lwi_on_signal(info->lwi_cb_data);         \
-                        ret = -EINTR;                                           \
-                        break;                                                  \
-                }                                                               \
-                if (info->lwi_timeout) {                                        \
-                        if (schedule_timeout(info->lwi_timeout) == 0) {         \
-                                /* We'll take signals only after a timeout. */  \
-                                if (info->lwi_signals)                          \
-                                        __state = TASK_INTERRUPTIBLE;           \
-                                if (info->lwi_on_timeout &&                     \
-                                    info->lwi_on_timeout(info->lwi_cb_data)) {  \
-                                        ret = -ETIMEDOUT;                       \
-                                        break;                                  \
-                                }                                               \
+                    }                                                           \
+                    /* We'll take signals only after a timeout. */              \
+                    if (info->lwi_signals) {                                    \
+                        __state = TASK_INTERRUPTIBLE;                           \
+                        /* Check for a pending interrupt. */                    \
+                        if (info->lwi_signals &&                                \
+                            l_killable_pending(current)) {                      \
+                             CERROR("lwe: pending interrupt for %d\n",          \
+                                    current->pid);                              \
+                             if (info->lwi_on_signal)                           \
+                                 info->lwi_on_signal(info->lwi_cb_data);        \
+                             ret = -EINTR;                                      \
+                             break;                                             \
                         }                                                       \
-                } else {                                                        \
-                        schedule();                                             \
+                    }                                                           \
                 }                                                               \
+            } else {                                                            \
+                schedule();                                                     \
+            }                                                                   \
         }                                                                       \
         current->state = TASK_RUNNING;                                          \
         remove_wait_queue(&wq, &__wait);                                        \
index bcd8c3b..e631078 100644 (file)
@@ -717,6 +717,7 @@ struct obd_export *class_conn2export(struct lustre_handle *);
 int class_multi_setup(struct obd_device *obddev, uint32_t len, void *data);
 int class_multi_cleanup(struct obd_device *obddev);
 
+extern void (*class_signal_client_failure)(struct ptlrpc_client *);
 
 #endif
 
index 3c0a9e1..72f3a94 100644 (file)
@@ -77,6 +77,8 @@ extern unsigned long obd_fail_loc;
 #define OBD_FAIL_OST_PUNCH_NET           0x20b
 #define OBD_FAIL_OST_STATFS_NET          0x20c
 #define OBD_FAIL_OST_HANDLE_UNPACK       0x20d
+#define OBD_FAIL_OST_BRW_WRITE_BULK      0x20e
+#define OBD_FAIL_OST_BRW_READ_BULK       0x20f
 
 #define OBB_FAIL_LDLM                    0x300
 #define OBD_FAIL_LDLM_NAMESPACE_NEW      0x301
index 54ed7db..ddcd5de 100644 (file)
@@ -62,9 +62,8 @@ static int sync_io_timeout(void *data)
                 /* XXXshaver Do we need a resend strategy, or do we just
                  * XXXshaver return -ERESTARTSYS and punt it?
                  */
-#if 0
-                recovd_cli_fail(desc->b_client);
-#endif
+                CERROR("signalling failure of client %p\n", desc->b_client);
+                class_signal_client_failure(desc->b_client);
         }
 
         /* We go back to sleep, until we're resumed or interrupted. */
@@ -87,9 +86,9 @@ int ll_sync_io_cb(struct io_cb_data *data, int err, int phase)
         ENTRY; 
 
         if (phase == CB_PHASE_START) { 
-#warning shaver hardcoded timeout
+#warning shaver hardcoded timeout (/proc/sys/lustre/timeout)
                 struct l_wait_info lwi;
-                lwi = LWI_TIMEOUT_INTR(100, sync_io_timeout,
+                lwi = LWI_TIMEOUT_INTR(100 * HZ, sync_io_timeout,
                                        SIGTERM | SIGKILL | SIGINT, sync_io_intr,
                                        data);
                 ret = l_wait_event(data->waitq, data->complete, &lwi);
index 560ec6e..0cd05d3 100644 (file)
@@ -566,6 +566,7 @@ static struct miscdevice obd_psdev = {
         &obd_psdev_fops
 };
 
+void (*class_signal_client_failure)(struct ptlrpc_client *);
 
 EXPORT_SYMBOL(obd_dev);
 EXPORT_SYMBOL(obdo_cachep);
@@ -587,6 +588,8 @@ EXPORT_SYMBOL(class_disconnect_all);
 //EXPORT_SYMBOL(class_multi_setup);
 //EXPORT_SYMBOL(class_multi_cleanup);
 
+EXPORT_SYMBOL(class_signal_client_failure);
+
 static int __init init_obdclass(void)
 {
         int err;
index bed58f1..ccbf640 100644 (file)
@@ -337,6 +337,9 @@ static int ost_brw_write(struct ptlrpc_request *req)
         if (req->rq_status)
                 GOTO(out_free, rc = 0); /* XXX is this correct? */
 
+        if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_WRITE_BULK))
+                GOTO(fail_preprw, rc = 0);
+
         desc = ptlrpc_prep_bulk(req->rq_connection);
         if (desc == NULL)
                 GOTO(fail_preprw, rc = -ENOMEM);
index 31d4e91..f276238 100644 (file)
@@ -269,49 +269,11 @@ static int ptlrpc_check_reply(struct ptlrpc_request *req)
                 GOTO(out, rc = 1);
         }
 
-#if 0
-        if (req->rq_flags & PTL_RPC_FL_RESEND) { 
-                if (l_killable_pending(current)) {
-                        CERROR("-- INTR --\n");
-                        req->rq_flags |= PTL_RPC_FL_INTR;
-                        GOTO(out, rc = 1);
-                }
-                CERROR("-- RESEND --\n");
-                GOTO(out, rc = 1);
-        }
-#endif
-
         if (req->rq_flags & PTL_RPC_FL_RECOVERY) { 
                 CERROR("-- RESTART --\n");
                 GOTO(out, rc = 1);
         }
 
-        if (req->rq_flags & PTL_RPC_FL_TIMEOUT && l_killable_pending(current)) {
-                req->rq_flags |= PTL_RPC_FL_INTR;
-                GOTO(out, rc = 1);
-        }
-
-        if (req->rq_timeout &&
-            (CURRENT_TIME - req->rq_time >= req->rq_timeout)) {
-                CERROR("-- REQ TIMEOUT ON CONNID %d XID %Ld --\n",
-                       req->rq_connid, (unsigned long long)req->rq_xid);
-                /* clear the timeout */
-                req->rq_timeout = 0;
-                req->rq_connection->c_level = LUSTRE_CONN_RECOVD;
-                req->rq_flags |= PTL_RPC_FL_TIMEOUT;
-                if (req->rq_client && req->rq_client->cli_recovd)
-                        recovd_cli_fail(req->rq_client);
-                if (req->rq_level < LUSTRE_CONN_FULL) {
-                        rc = 1;
-                } else if (l_killable_pending(current)) {
-                        req->rq_flags |= PTL_RPC_FL_INTR;
-                        rc = 1;
-                } else {
-                        rc = 0;
-                }
-                GOTO(out, rc);
-        }
-
  out:
         CDEBUG(D_NET, "req = %p, rc = %d\n", req, rc);
         return rc;
@@ -477,6 +439,8 @@ static int expired_request(void *data)
         struct ptlrpc_request *req = data;
         
         ENTRY;
+        CERROR("req timeout on connid %d xid %Ld\n", req->rq_connid,
+               (unsigned long long)req->rq_xid);
         req->rq_timeout = 0;
         req->rq_connection->c_level = LUSTRE_CONN_RECOVD;
         req->rq_flags |= PTL_RPC_FL_TIMEOUT;
index 6875531..1c8037c 100644 (file)
@@ -194,6 +194,8 @@ static int recovd_main(void *arg)
 int recovd_setup(struct recovd_obd *recovd)
 {
         int rc;
+        extern void (*class_signal_client_failure)(struct ptlrpc_client *);
+
         ENTRY;
 
         INIT_LIST_HEAD(&recovd->recovd_clients_lh);
@@ -212,6 +214,9 @@ int recovd_setup(struct recovd_obd *recovd)
         }
         wait_event(recovd->recovd_ctl_waitq, recovd->recovd_flags & RECOVD_IDLE);
 
+        /* exported and called by obdclass timeout handlers */
+        class_signal_client_failure = recovd_cli_fail;
+
         RETURN(0);
 }