static int expired_completion_wait(void *data)
{
struct ldlm_lock *lock = data;
+ struct ptlrpc_connection *conn; /* connection handed to class_signal_connection_failure() */
+ struct obd_device *obd; /* device looked up from lock->l_connh */
+
if (!lock)
CERROR("NULL lock\n");
- else if (!lock->l_export)
- CERROR("lock %p has NULL export\n", lock);
- else
- class_signal_connection_failure(lock->l_export->exp_connection);
+ else if (!lock->l_connh) /* walk lock -> l_connh -> obd -> client import connection; log the first NULL hop */
+ CERROR("lock %p has NULL connh\n", lock);
+ else if (!(obd = class_conn2obd(lock->l_connh)))
+ CERROR("lock %p has NULL obd\n", lock);
+ else if (!(conn = obd->u.cli.cl_import.imp_connection))
+ CERROR("lock %p has NULL connection\n", lock);
+ else {
+ class_signal_connection_failure(conn); /* whole chain resolved: report the failure on the import's connection */
+ }
+ RETURN(0); /* every path, including the logged NULL cases, returns 0 */
+}
+
+#if 0 /* compiled-out alternative that resolves the connection via class_conn2cliimp() */
+static int expired_completion_wait(void *data)
+{
+ struct ldlm_lock *lock = data;
+ struct ptlrpc_connection *conn =
+ class_conn2cliimp(lock->l_connh)->imp_connection; /* NOTE(review): lock and the class_conn2cliimp() result are dereferenced without NULL checks */
+
+ if (!conn) {
+ CERROR("lock %p has NULL import connection\n", lock);
+ RETURN(1); /* nonzero here, whereas the enabled version returns 0 on every path */
+ }
+
+ class_signal_connection_failure(conn);
RETURN(0);
}
+#endif /* 0 */
int ldlm_completion_ast(struct ldlm_lock *lock, int flags)
{
rd->rd_recovd = recovd;
rd->rd_recover = recover;
+ rd->rd_phase = RD_IDLE;
+ rd->rd_next_phase = RD_TROUBLED;
spin_lock(&recovd->recovd_lock);
list_add(&rd->rd_managed_chain, &recovd->recovd_managed_items);
spin_lock(&recovd->recovd_lock);
- if (rd->rd_phase != RECOVD_IDLE || rd->rd_next_phase != RECOVD_IDLE) {
+ if (rd->rd_phase != RD_IDLE) {
CDEBUG(D_INFO, "connection %p to %s already in recovery\n",
conn, conn->c_remote_uuid);
+ /* XXX need to distinguish from failure-in-recovery */
spin_unlock(&recovd->recovd_lock);
EXIT;
return;
CERROR("connection %p to %s failed\n", conn, conn->c_remote_uuid);
list_del(&rd->rd_managed_chain);
list_add_tail(&rd->rd_managed_chain, &recovd->recovd_troubled_items);
- rd->rd_next_phase = RECOVD_PREPARING;
+ rd->rd_phase = RD_TROUBLED;
spin_unlock(&recovd->recovd_lock);
wake_up(&recovd->recovd_waitq);
EXIT;
}
-/* this function must be called with conn->c_lock held */
+/* Puts the connection's recovd_data back on recovd_managed_items and resets
+ * it to RD_IDLE with RD_TROUBLED as the next phase.
+ * NOTE(review): the body takes recovd->recovd_lock itself, so callers
+ * presumably must NOT hold it on entry -- confirm against callers. */
void recovd_conn_fixed(struct ptlrpc_connection *conn)
{
struct recovd_data *rd = &conn->c_recovd_data;
ENTRY;
+ spin_lock(&rd->rd_recovd->recovd_lock); /* covers the list move and the phase reset below */
list_del(&rd->rd_managed_chain);
+ rd->rd_phase = RD_IDLE;
+ rd->rd_next_phase = RD_TROUBLED;
list_add(&rd->rd_managed_chain, &rd->rd_recovd->recovd_managed_items);
+ spin_unlock(&rd->rd_recovd->recovd_lock);
EXIT;
}
rd_managed_chain);
if (rd->rd_phase == rd->rd_next_phase ||
- (rd->rd_phase == RECOVD_IDLE &&
- rd->rd_next_phase == RECOVD_PREPARING) ||
- rd->rd_phase == RECOVD_FAILED)
+ rd->rd_phase == RD_FAILED)
GOTO(out, rc = 1);
}
struct recovd_data *rd = list_entry(tmp, struct recovd_data,
rd_managed_chain);
- /* XXXshaver This is very ugly -- add a RECOVD_TROUBLED state! */
- if (rd->rd_phase != RECOVD_FAILED &&
- !(rd->rd_phase == RECOVD_IDLE &&
- rd->rd_next_phase == RECOVD_PREPARING) &&
+ if (rd->rd_phase != RD_FAILED &&
rd->rd_phase != rd->rd_next_phase)
continue;
switch (rd->rd_phase) {
- case RECOVD_FAILED:
+ case RD_FAILED:
cb_failed: /* must always reach here with recovd_lock held! */
CERROR("recovery FAILED for rd %p (conn %p): %d\n",
rd, class_rd2conn(rd), rc);
spin_lock(&recovd->recovd_lock);
break;
- case RECOVD_IDLE:
+ case RD_TROUBLED:
if (!rd->rd_recover) {
CERROR("no rd_recover for rd %p (conn %p)\n",
rd, class_rd2conn(rd));
}
CERROR("starting recovery for rd %p (conn %p)\n",
rd, class_rd2conn(rd));
- rd->rd_phase = RECOVD_PREPARING;
+ rd->rd_phase = RD_PREPARING;
spin_unlock(&recovd->recovd_lock);
rc = rd->rd_recover(rd, PTLRPC_RECOVD_PHASE_PREPARE);
if (rc)
goto cb_failed;
- rd->rd_next_phase = RECOVD_PREPARED;
+ rd->rd_next_phase = RD_PREPARED;
break;
- case RECOVD_PREPARED:
- rd->rd_phase = RECOVD_RECOVERING;
+ case RD_PREPARED:
+ rd->rd_phase = RD_RECOVERING;
CERROR("recovery prepared for rd %p (conn %p)\n",
rd, class_rd2conn(rd));
if (rc)
goto cb_failed;
- rd->rd_next_phase = RECOVD_RECOVERED;
+ rd->rd_next_phase = RD_RECOVERED;
break;
- case RECOVD_RECOVERED:
- rd->rd_phase = RECOVD_IDLE;
- rd->rd_next_phase = RECOVD_PREPARING;
+ case RD_RECOVERED:
+ rd->rd_phase = RD_IDLE;
+ rd->rd_next_phase = RD_TROUBLED;
CERROR("recovery complete for rd %p (conn %p)\n",
rd, class_rd2conn(rd));
{
char *argv[3];
char *envp[3];
+ int rc;
ENTRY;
conn->c_level = LUSTRE_CONN_RECOVD;
envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
envp[2] = NULL;
- RETURN(call_usermodehelper(argv[0], argv, envp));
+ rc = call_usermodehelper(argv[0], argv, envp);
+ if (rc < 0) {
+ CERROR("Error invoking recovery upcall (%s): %d\n",
+ obd_recovery_upcall, rc);
+ CERROR("Check /proc/sys/lustre/recovery_upcall?\n");
+ }
+ RETURN(rc);
}
static int ll_recover_reconnect(struct ptlrpc_connection *conn)
static int ll_retry_recovery(struct ptlrpc_connection *conn)
{
+#if 0
/* XXX use a timer, sideshow bob */
recovd_conn_fail(conn);
+ /* XXX The recovd_conn_fail() retry above is compiled out until it is
+ * XXX fixed so that it does not just keep retrying when the upcall is missing.
+ */
+#endif /* 0: while disabled, this function only returns 0 */
return 0;
}