From 5fd7183a77a063f3c00285e79b57f37b77ef77ea Mon Sep 17 00:00:00 2001 From: shaver Date: Fri, 16 Aug 2002 01:51:03 +0000 Subject: [PATCH] * Fix interrupt-pending-when-timeout-occurs handling in l_wait_event. * If timeout specified, but no handler, wake up with -ETIMEDOUT instead of going back to sleep. * Export a class_signal_client_failure hook-symbol from obdclass, to be filled in by recovd.o and used by various obdclass bits (avoiding sour dependencies on recovd.o). * Add OBD_FAIL_OST_BRW_{READ,WRITE}_BULK fail_loc values, for testing of bulk-xfer timeouts and interrupts. * Fix the timeout in ll_sync_io_cb to scale by HZ. * Rip out some leftovers from ptlrpc_check_reply. --- lustre/include/linux/lustre_lib.h | 67 +++++++++++++++++++++++--------------- lustre/include/linux/obd_class.h | 1 + lustre/include/linux/obd_support.h | 2 ++ lustre/lib/page.c | 9 +++-- lustre/obdclass/class_obd.c | 3 ++ lustre/ost/ost_handler.c | 3 ++ lustre/ptlrpc/client.c | 40 ++--------------------- lustre/ptlrpc/recovd.c | 5 +++ 8 files changed, 60 insertions(+), 70 deletions(-) diff --git a/lustre/include/linux/lustre_lib.h b/lustre/include/linux/lustre_lib.h index 2cdaddb..c1db457 100644 --- a/lustre/include/linux/lustre_lib.h +++ b/lustre/include/linux/lustre_lib.h @@ -454,36 +454,49 @@ do { add_wait_queue(&wq, &__wait); \ __state = TASK_UNINTERRUPTIBLE; \ for (;;) { \ - set_current_state(__state); \ - if (condition) \ + set_current_state(__state); \ + if (condition) \ + break; \ + /* We only become INTERRUPTIBLE if a timeout has fired, and \ + * the caller has given us some signals to care about. \ + * \ + * XXXshaver we should check against info->wli_signals here, \ + * XXXshaver instead of just using l_killable_pending, perhaps. 
\ + */ \ + if (__state == TASK_INTERRUPTIBLE && \ + l_killable_pending(current)) { \ + CERROR("lwe: interrupt for %d\n", current->pid); \ + if (info->lwi_on_signal) \ + info->lwi_on_signal(info->lwi_cb_data); \ + ret = -EINTR; \ + break; \ + } \ + if (info->lwi_timeout) { \ + if (schedule_timeout(info->lwi_timeout) == 0) { \ + CERROR("lwe: timeout for %d\n", current->pid); \ + if (!info->lwi_on_timeout || \ + info->lwi_on_timeout(info->lwi_cb_data)) { \ + ret = -ETIMEDOUT; \ break; \ - /* We only become INTERRUPTIBLE if a timeout has fired, and \ - * the caller has given us some signals to care about. \ - * \ - * XXXshaver we should check against info->wli_signals here, \ - * XXXshaver instead of just using l_killable_pending, perhaps. \ - */ \ - if (__state == TASK_INTERRUPTIBLE && \ - l_killable_pending(current)) { \ - if (info->lwi_on_signal) \ - info->lwi_on_signal(info->lwi_cb_data); \ - ret = -EINTR; \ - break; \ - } \ - if (info->lwi_timeout) { \ - if (schedule_timeout(info->lwi_timeout) == 0) { \ - /* We'll take signals only after a timeout. */ \ - if (info->lwi_signals) \ - __state = TASK_INTERRUPTIBLE; \ - if (info->lwi_on_timeout && \ - info->lwi_on_timeout(info->lwi_cb_data)) { \ - ret = -ETIMEDOUT; \ - break; \ - } \ + } \ + /* We'll take signals only after a timeout. */ \ + if (info->lwi_signals) { \ + __state = TASK_INTERRUPTIBLE; \ + /* Check for a pending interrupt. 
*/ \ + if (info->lwi_signals && \ + l_killable_pending(current)) { \ + CERROR("lwe: pending interrupt for %d\n", \ + current->pid); \ + if (info->lwi_on_signal) \ + info->lwi_on_signal(info->lwi_cb_data); \ + ret = -EINTR; \ + break; \ } \ - } else { \ - schedule(); \ + } \ } \ + } else { \ + schedule(); \ + } \ } \ current->state = TASK_RUNNING; \ remove_wait_queue(&wq, &__wait); \ diff --git a/lustre/include/linux/obd_class.h b/lustre/include/linux/obd_class.h index bcd8c3b..e631078 100644 --- a/lustre/include/linux/obd_class.h +++ b/lustre/include/linux/obd_class.h @@ -717,6 +717,7 @@ struct obd_export *class_conn2export(struct lustre_handle *); int class_multi_setup(struct obd_device *obddev, uint32_t len, void *data); int class_multi_cleanup(struct obd_device *obddev); +extern void (*class_signal_client_failure)(struct ptlrpc_client *); #endif diff --git a/lustre/include/linux/obd_support.h b/lustre/include/linux/obd_support.h index 3c0a9e1..72f3a94 100644 --- a/lustre/include/linux/obd_support.h +++ b/lustre/include/linux/obd_support.h @@ -77,6 +77,8 @@ extern unsigned long obd_fail_loc; #define OBD_FAIL_OST_PUNCH_NET 0x20b #define OBD_FAIL_OST_STATFS_NET 0x20c #define OBD_FAIL_OST_HANDLE_UNPACK 0x20d +#define OBD_FAIL_OST_BRW_WRITE_BULK 0x20e +#define OBD_FAIL_OST_BRW_READ_BULK 0x20f #define OBB_FAIL_LDLM 0x300 #define OBD_FAIL_LDLM_NAMESPACE_NEW 0x301 diff --git a/lustre/lib/page.c b/lustre/lib/page.c index 54ed7db..ddcd5de 100644 --- a/lustre/lib/page.c +++ b/lustre/lib/page.c @@ -62,9 +62,8 @@ static int sync_io_timeout(void *data) /* XXXshaver Do we need a resend strategy, or do we just * XXXshaver return -ERESTARTSYS and punt it? */ -#if 0 - recovd_cli_fail(desc->b_client); -#endif + CERROR("signalling failure of client %p\n", desc->b_client); + class_signal_client_failure(desc->b_client); } /* We go back to sleep, until we're resumed or interrupted. 
*/ @@ -87,9 +86,9 @@ int ll_sync_io_cb(struct io_cb_data *data, int err, int phase) ENTRY; if (phase == CB_PHASE_START) { -#warning shaver hardcoded timeout +#warning shaver hardcoded timeout (/proc/sys/lustre/timeout) struct l_wait_info lwi; - lwi = LWI_TIMEOUT_INTR(100, sync_io_timeout, + lwi = LWI_TIMEOUT_INTR(100 * HZ, sync_io_timeout, SIGTERM | SIGKILL | SIGINT, sync_io_intr, data); ret = l_wait_event(data->waitq, data->complete, &lwi); diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index 560ec6e..0cd05d3 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -566,6 +566,7 @@ static struct miscdevice obd_psdev = { &obd_psdev_fops }; +void (*class_signal_client_failure)(struct ptlrpc_client *); EXPORT_SYMBOL(obd_dev); EXPORT_SYMBOL(obdo_cachep); @@ -587,6 +588,8 @@ EXPORT_SYMBOL(class_disconnect_all); //EXPORT_SYMBOL(class_multi_setup); //EXPORT_SYMBOL(class_multi_cleanup); +EXPORT_SYMBOL(class_signal_client_failure); + static int __init init_obdclass(void) { int err; diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index bed58f1..ccbf640 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -337,6 +337,9 @@ static int ost_brw_write(struct ptlrpc_request *req) if (req->rq_status) GOTO(out_free, rc = 0); /* XXX is this correct? 
*/ + if (OBD_FAIL_CHECK(OBD_FAIL_OST_BRW_WRITE_BULK)) + GOTO(fail_preprw, rc = 0); + desc = ptlrpc_prep_bulk(req->rq_connection); if (desc == NULL) GOTO(fail_preprw, rc = -ENOMEM); diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 31d4e91..f276238 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -269,49 +269,11 @@ static int ptlrpc_check_reply(struct ptlrpc_request *req) GOTO(out, rc = 1); } -#if 0 - if (req->rq_flags & PTL_RPC_FL_RESEND) { - if (l_killable_pending(current)) { - CERROR("-- INTR --\n"); - req->rq_flags |= PTL_RPC_FL_INTR; - GOTO(out, rc = 1); - } - CERROR("-- RESEND --\n"); - GOTO(out, rc = 1); - } -#endif - if (req->rq_flags & PTL_RPC_FL_RECOVERY) { CERROR("-- RESTART --\n"); GOTO(out, rc = 1); } - if (req->rq_flags & PTL_RPC_FL_TIMEOUT && l_killable_pending(current)) { - req->rq_flags |= PTL_RPC_FL_INTR; - GOTO(out, rc = 1); - } - - if (req->rq_timeout && - (CURRENT_TIME - req->rq_time >= req->rq_timeout)) { - CERROR("-- REQ TIMEOUT ON CONNID %d XID %Ld --\n", - req->rq_connid, (unsigned long long)req->rq_xid); - /* clear the timeout */ - req->rq_timeout = 0; - req->rq_connection->c_level = LUSTRE_CONN_RECOVD; - req->rq_flags |= PTL_RPC_FL_TIMEOUT; - if (req->rq_client && req->rq_client->cli_recovd) - recovd_cli_fail(req->rq_client); - if (req->rq_level < LUSTRE_CONN_FULL) { - rc = 1; - } else if (l_killable_pending(current)) { - req->rq_flags |= PTL_RPC_FL_INTR; - rc = 1; - } else { - rc = 0; - } - GOTO(out, rc); - } - out: CDEBUG(D_NET, "req = %p, rc = %d\n", req, rc); return rc; @@ -477,6 +439,8 @@ static int expired_request(void *data) struct ptlrpc_request *req = data; ENTRY; + CERROR("req timeout on connid %d xid %Ld\n", req->rq_connid, + (unsigned long long)req->rq_xid); req->rq_timeout = 0; req->rq_connection->c_level = LUSTRE_CONN_RECOVD; req->rq_flags |= PTL_RPC_FL_TIMEOUT; diff --git a/lustre/ptlrpc/recovd.c b/lustre/ptlrpc/recovd.c index 6875531..1c8037c 100644 --- a/lustre/ptlrpc/recovd.c +++ 
b/lustre/ptlrpc/recovd.c @@ -194,6 +194,8 @@ static int recovd_main(void *arg) int recovd_setup(struct recovd_obd *recovd) { int rc; + extern void (*class_signal_client_failure)(struct ptlrpc_client *); + ENTRY; INIT_LIST_HEAD(&recovd->recovd_clients_lh); @@ -212,6 +214,9 @@ int recovd_setup(struct recovd_obd *recovd) } wait_event(recovd->recovd_ctl_waitq, recovd->recovd_flags & RECOVD_IDLE); + /* exported and called by obdclass timeout handlers */ + class_signal_client_failure = recovd_cli_fail; + RETURN(0); } -- 1.8.3.1