*/
#define DEBUG_SUBSYSTEM S_RPC
-
+#ifndef __KERNEL__
+#include <liblustre.h>
+#include <linux/obd.h>
+#include <linux/obd_class.h>
+#else
#include <linux/lustre_lite.h>
+#endif
+
#include <linux/lustre_ha.h>
#include <linux/obd_support.h>
/* dump_connection_list, but shorter for nicer debugging logs.
 *
 * Walks the list rooted at 'head' and emits one D_HA debug line per entry:
 * the connection pointer, its remote UUID string, and the current and next
 * recovery phases stored in its c_recovd_data. */
static void d_c_l(struct list_head *head)
{
- int sanity = 0;
struct list_head *tmp;
list_for_each(tmp, head) {
/* Each node is the rd_managed_chain link embedded in a connection's
 * c_recovd_data; list_entry() recovers the enclosing connection. */
struct ptlrpc_connection *conn =
list_entry(tmp, struct ptlrpc_connection,
c_recovd_data.rd_managed_chain);
- CDEBUG(D_HA, " %p = %s (%d/%d)\n", conn, conn->c_remote_uuid,
+ CDEBUG(D_HA, " %p = %s (%d/%d)\n", conn,
+ conn->c_remote_uuid.uuid,
conn->c_recovd_data.rd_phase,
conn->c_recovd_data.rd_next_phase);
- if (sanity++ > 50)
- LBUG();
}
}
if (!list_empty(&rd->rd_managed_chain)) {
if (rd->rd_recovd == recovd && rd->rd_recover == recover) {
CDEBUG(D_HA, "conn %p/%s already setup for recovery\n",
- conn, conn->c_remote_uuid);
+ conn, conn->c_remote_uuid.uuid);
EXIT;
return;
}
CDEBUG(D_HA,
"conn %p/%s has recovery items %p/%p, making %p/%p\n",
- conn, conn->c_remote_uuid, rd->rd_recovd, rd->rd_recover,
+ conn, conn->c_remote_uuid.uuid, rd->rd_recovd, rd->rd_recover,
recovd, recover);
spin_lock(&rd->rd_recovd->recovd_lock);
- list_del(&rd->rd_managed_chain);
+ list_del_init(&rd->rd_managed_chain);
spin_unlock(&rd->rd_recovd->recovd_lock);
}
EXIT;
}
+/*
+ * Detach a connection from recovery management.
+ *
+ * If the connection's recovd_data currently points at a recovd, its
+ * rd_managed_chain link is removed from whatever list it is on and
+ * rd_recovd is cleared, both under that recovd's recovd_lock.  The
+ * recovery callback is then dropped and the phase state is reset to
+ * RD_IDLE with RD_TROUBLED as the next phase.
+ */
+void recovd_conn_unmanage(struct ptlrpc_connection *conn)
+{
+        struct recovd_data *rd = &conn->c_recovd_data;
+        struct recovd_obd *recovd = rd->rd_recovd;
+        ENTRY;
+
+        if (recovd) {
+                spin_lock(&recovd->recovd_lock);
+                /* _init variant so a later list_empty() check on the link
+                 * sees it as unlinked. */
+                list_del_init(&rd->rd_managed_chain);
+                rd->rd_recovd = NULL;
+                spin_unlock(&recovd->recovd_lock);
+        }
+        /* should be safe enough, right?
+         * NOTE(review): the fields below are written without recovd_lock
+         * held -- confirm no other thread can still reach this rd. */
+        rd->rd_recover = NULL;
+        /* Reset both phase fields to the idle state; previously
+         * rd_next_phase was assigned twice and rd_phase was left stale. */
+        rd->rd_phase = RD_IDLE;
+        rd->rd_next_phase = RD_TROUBLED;
+        EXIT;
+}
+
void recovd_conn_fail(struct ptlrpc_connection *conn)
{
struct recovd_data *rd = &conn->c_recovd_data;
}
spin_lock(&recovd->recovd_lock);
- if (rd->rd_phase != RD_IDLE) {
- CERROR("connection %p to %s already in recovery\n",
- conn, conn->c_remote_uuid);
- /* XXX need to distinguish from failure-in-recovery */
+ if (rd->rd_phase == RD_TROUBLED || rd->rd_phase == RD_PREPARING) {
+ CDEBUG(D_HA, "connection %p to %s already in recovery\n",
+ conn, conn->c_remote_uuid.uuid);
spin_unlock(&recovd->recovd_lock);
EXIT;
return;
}
-
- CERROR("connection %p to %s failed\n", conn, conn->c_remote_uuid);
+
+ CERROR("connection %p to %s nid "LPX64" on %s failed\n", conn,
+ conn->c_remote_uuid.uuid, conn->c_peer.peer_nid,
+ conn->c_peer.peer_ni->pni_name);
list_del(&rd->rd_managed_chain);
list_add_tail(&rd->rd_managed_chain, &recovd->recovd_troubled_items);
+ if (rd->rd_phase != RD_IDLE) {
+ CDEBUG(D_HA,
+ "connection %p to %s failed in recovery: restarting\n",
+ conn, conn->c_remote_uuid.uuid);
+ /* XXX call callback with PHASE_FAILED? */
+ rd->rd_next_phase = RD_TROUBLED;
+ }
rd->rd_phase = RD_TROUBLED;
dump_lists(recovd);
spin_unlock(&recovd->recovd_lock);
ENTRY;
CDEBUG(D_HA, "connection %p (now to %s) fixed\n",
- conn, conn->c_remote_uuid);
+ conn, conn->c_remote_uuid.uuid);
spin_lock(&rd->rd_recovd->recovd_lock);
list_del(&rd->rd_managed_chain);
rd->rd_phase = RD_IDLE;
EXIT;
}
-
static int recovd_check_event(struct recovd_obd *recovd)
{
int rc = 0;
cb_failed: /* must always reach here with recovd_lock held! */
CERROR("recovery FAILED for rd %p (conn %p): %d\n",
rd, class_rd2conn(rd), rc);
-
+
spin_unlock(&recovd->recovd_lock);
(void)rd->rd_recover(rd, PTLRPC_RECOVD_PHASE_FAILURE);
spin_lock(&recovd->recovd_lock);
break;
-
+
case RD_TROUBLED:
if (!rd->rd_recover) {
CERROR("no rd_recover for rd %p (conn %p)\n",
rd, class_rd2conn(rd));
rd->rd_phase = RD_PREPARING;
rd->rd_next_phase = RD_PREPARED;
-
+
spin_unlock(&recovd->recovd_lock);
rc = rd->rd_recover(rd, PTLRPC_RECOVD_PHASE_PREPARE);
spin_lock(&recovd->recovd_lock);
if (rc)
goto cb_failed;
-
+
break;
-
+
case RD_PREPARED:
-
+
CERROR("recovery prepared for rd %p (conn %p)\n",
rd, class_rd2conn(rd));
rd->rd_phase = RD_RECOVERING;
rd->rd_next_phase = RD_RECOVERED;
-
+
spin_unlock(&recovd->recovd_lock);
rc = rd->rd_recover(rd, PTLRPC_RECOVD_PHASE_RECOVER);
spin_lock(&recovd->recovd_lock);
if (rc)
goto cb_failed;
-
+
break;
-
+
case RD_RECOVERED:
rd->rd_phase = RD_IDLE;
rd->rd_next_phase = RD_TROUBLED;
-
+
CERROR("recovery complete for rd %p (conn %p)\n",
rd, class_rd2conn(rd));
break;
-
+
default:
break;
}
RETURN(0);
}
+#ifdef __KERNEL__
/* Thread entry for the recovery daemon; 'arg' is the struct recovd_obd it
 * serves.  The visible prologue daemonizes the thread under the kernel lock,
 * blocks every signal, and names the thread "lustre_recovd". */
static int recovd_main(void *arg)
{
struct recovd_obd *recovd = (struct recovd_obd *)arg;
-
+ unsigned long flags;
ENTRY;
lock_kernel();
daemonize();
- spin_lock_irq(&current->sigmask_lock);
+
/* Block all signals for this thread.  The >= 2.5.0 branch takes no
 * sigmask_lock and calls recalc_sigpending() without an argument.
 * NOTE(review): 'flags' is only used in the pre-2.5 branch. */
+#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)
+ sigfillset(&current->blocked);
+ recalc_sigpending();
+#else
+ spin_lock_irqsave(&current->sigmask_lock, flags);
sigfillset(&current->blocked);
recalc_sigpending(current);
- spin_unlock_irq(&current->sigmask_lock);
+ spin_unlock_irqrestore(&current->sigmask_lock, flags);
+#endif
sprintf(current->comm, "lustre_recovd");
unlock_kernel();
int recovd_setup(struct recovd_obd *recovd)
{
- int rc;
+ int rc = 0; /* initialize for Liblustre */
ENTRY;
RETURN(0);
}
+#else
+/* Variant built when __KERNEL__ is not defined: performs no setup and
+ * always reports success. */
+int recovd_setup(struct recovd_obd *recovd)
+{
+        (void)recovd;        /* not used by this variant */
+        return 0;
+}
+#endif
int recovd_cleanup(struct recovd_obd *recovd)
{