X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Fptlrpc%2Frpc.c;h=25d644953d56d2cb361703e27094a427389fbf39;hb=762e5b7708660eb86c614b6ac1c5aaa6bf7acc4c;hp=ec61517263b0e79847964bfb55c4ca3cff64d435;hpb=b437c99b6cd7eee79ad0a16a6f34f693709e5477;p=fs%2Flustre-release.git diff --git a/lustre/ptlrpc/rpc.c b/lustre/ptlrpc/rpc.c index ec61517..25d6449 100644 --- a/lustre/ptlrpc/rpc.c +++ b/lustre/ptlrpc/rpc.c @@ -24,7 +24,12 @@ #define DEBUG_SUBSYSTEM S_RPC #include +#include +#include +#include #include +#include +#include extern int ptlrpc_init_portals(void); extern void ptlrpc_exit_portals(void); @@ -38,46 +43,13 @@ int connmgr_setup(struct obd_device *obddev, obd_count len, void *buf) MOD_INC_USE_COUNT; memset(recovd, 0, sizeof(*recovd)); - OBD_ALLOC(recovd->recovd_client, sizeof(*recovd->recovd_client)); - if (!recovd) - GOTO(err_dec, err = -ENOMEM); - err = recovd_setup(recovd); - if (err) - GOTO(err_free, err); - - recovd->recovd_service = ptlrpc_init_svc(128 * 1024, - CONNMGR_REQUEST_PORTAL, - CONNMGR_REPLY_PORTAL, - "self", connmgr_handle); - if (!recovd->recovd_service) { - CERROR("failed to start service\n"); - GOTO(err_recovd, err = -ENOMEM); - } - - ptlrpc_init_client(NULL, NULL, CONNMGR_REQUEST_PORTAL, - CONNMGR_REPLY_PORTAL, recovd->recovd_client); - recovd->recovd_client->cli_name = "connmgr"; - - err = ptlrpc_start_thread(obddev, recovd->recovd_service, - "lustre_connmgr"); if (err) { - CERROR("cannot start thread\n"); - GOTO(err_svc, err); + MOD_DEC_USE_COUNT; + RETURN(err); } - ptlrpc_connmgr = recovd; RETURN(0); - -err_svc: - ptlrpc_unregister_service(recovd->recovd_service); -err_recovd: - recovd_cleanup(recovd); -err_free: - OBD_FREE(recovd->recovd_client, sizeof(*recovd->recovd_client)); -err_dec: - MOD_DEC_USE_COUNT; - RETURN(err); } int connmgr_cleanup(struct obd_device *dev) @@ -89,39 +61,113 @@ int connmgr_cleanup(struct obd_device *dev) if (err) LBUG(); - ptlrpc_stop_all_threads(recovd->recovd_service); - ptlrpc_unregister_service(recovd->recovd_service); - ptlrpc_cleanup_client(recovd->recovd_client); - OBD_FREE(recovd->recovd_client, sizeof(*recovd->recovd_client)); MOD_DEC_USE_COUNT; RETURN(0); } - -int connmgr_iocontrol(long cmd, struct obd_conn *conn, int len, void *karg, +int connmgr_iocontrol(long cmd, struct lustre_handle *hdl, int len, void *karg, void *uarg) { - struct obd_device *obd = gen_conn2obd(conn); + struct ptlrpc_connection *conn = NULL; + struct obd_device *obd = class_conn2obd(hdl); struct recovd_obd *recovd = &obd->u.recovd; + struct obd_ioctl_data *data = karg; + struct list_head *tmp; + int rc = 0; ENTRY; - if (cmd == OBD_IOC_RECOVD_NEWCONN) { - spin_lock(&recovd->recovd_lock); - recovd->recovd_flags |= RECOVD_UPCALL_ANSWER; - recovd->recovd_wakeup_flag = 1; - wake_up(&recovd->recovd_waitq); + + if (cmd != OBD_IOC_RECOVD_NEWCONN && cmd != OBD_IOC_RECOVD_FAILCONN) + RETURN(-EINVAL); /* XXX ENOSYS? */ + + /* Find the connection that's been rebuilt or has failed. */ + spin_lock(&recovd->recovd_lock); + list_for_each(tmp, &recovd->recovd_troubled_items) { + conn = list_entry(tmp, struct ptlrpc_connection, + c_recovd_data.rd_managed_chain); + + LASSERT(conn->c_recovd_data.rd_recovd == recovd); /* sanity */ + + if (!strcmp(conn->c_remote_uuid, data->ioc_inlbuf1)) + break; + conn = NULL; + } + + if (!conn) { + if (cmd == OBD_IOC_RECOVD_NEWCONN) + GOTO(out, rc = -EINVAL); + /* XXX macroize/inline and share with loop above */ + list_for_each(tmp, &recovd->recovd_managed_items) { + conn = list_entry(tmp, struct ptlrpc_connection, + c_recovd_data.rd_managed_chain); + + LASSERT(conn->c_recovd_data.rd_recovd == recovd); + + if (!strcmp(conn->c_remote_uuid, data->ioc_inlbuf1)) + break; + conn = NULL; + } + if (!conn) + GOTO(out, rc = -EINVAL); + } + + if (cmd == OBD_IOC_RECOVD_FAILCONN) { spin_unlock(&recovd->recovd_lock); - EXIT; + recovd_conn_fail(conn); + spin_lock(&recovd->recovd_lock); + + /* Jump straight to the "failed" phase of recovery. */ + conn->c_recovd_data.rd_phase = RD_FAILED; + goto out; } - return 0; + + + /* else (NEWCONN) */ + spin_lock(&conn->c_lock); + + /* whatever happens, reset the INVALID flag */ + conn->c_flags &= ~CONN_INVALID; + + /* XXX is this a good check? should we allow readdressing of + * XXX conns that aren't in recovery? + */ + if (conn->c_recovd_data.rd_phase != RD_PREPARING) { + spin_unlock(&conn->c_lock); + GOTO(out, rc = -EALREADY); + } + + if (data->ioc_inllen2) { + CERROR("conn %p UUID change %s -> %s\n", + conn, conn->c_remote_uuid, data->ioc_inlbuf2); + strcpy(conn->c_remote_uuid, data->ioc_inlbuf2); + } else { + CERROR("conn %p UUID %s reconnected\n", conn, + conn->c_remote_uuid); + } + ptlrpc_readdress_connection(conn, conn->c_remote_uuid); + spin_unlock(&conn->c_lock); + + conn->c_recovd_data.rd_phase = RD_PREPARED; + wake_up(&recovd->recovd_waitq); + out: + spin_unlock(&recovd->recovd_lock); + RETURN(rc); } +static int connmgr_connect(struct lustre_handle *conn, struct obd_device *src, + obd_uuid_t cluuid, struct recovd_obd *recovd, + ptlrpc_recovery_cb_t recover) +{ + return class_connect(conn, src, cluuid); +} /* use obd ops to offer management infrastructure */ static struct obd_ops recovd_obd_ops = { o_setup: connmgr_setup, o_cleanup: connmgr_cleanup, - o_iocontrol: connmgr_iocontrol, + o_iocontrol: connmgr_iocontrol, + o_connect: connmgr_connect, + o_disconnect: class_disconnect }; static int __init ptlrpc_init(void) @@ -131,27 +177,22 @@ static int __init ptlrpc_init(void) if (rc) RETURN(rc); ptlrpc_init_connection(); - obd_register_type(&recovd_obd_ops, LUSTRE_HA_NAME); + class_register_type(&recovd_obd_ops, LUSTRE_HA_NAME); return 0; } static void __exit ptlrpc_exit(void) { - obd_unregister_type(LUSTRE_HA_NAME); + class_unregister_type(LUSTRE_HA_NAME); ptlrpc_exit_portals(); ptlrpc_cleanup_connection(); } -/* events.c */ -EXPORT_SYMBOL(ptlrpc_check_bulk_sent); - -/* connmgr.c */ -EXPORT_SYMBOL(ptlrpc_connmgr); -EXPORT_SYMBOL(connmgr_connect); -EXPORT_SYMBOL(connmgr_handle); -EXPORT_SYMBOL(recovd_cli_fail); -EXPORT_SYMBOL(recovd_cli_manage); -EXPORT_SYMBOL(recovd_cli_fixed); +/* recovd.c */ +EXPORT_SYMBOL(ptlrpc_recovd); +EXPORT_SYMBOL(recovd_conn_fail); +EXPORT_SYMBOL(recovd_conn_manage); +EXPORT_SYMBOL(recovd_conn_fixed); EXPORT_SYMBOL(recovd_setup); EXPORT_SYMBOL(recovd_cleanup); @@ -184,7 +225,6 @@ EXPORT_SYMBOL(ptlrpc_replay_req); EXPORT_SYMBOL(ptlrpc_restart_req); EXPORT_SYMBOL(ptlrpc_prep_req); EXPORT_SYMBOL(ptlrpc_free_req); -EXPORT_SYMBOL(ptlrpc_prep_req2); EXPORT_SYMBOL(ptlrpc_req_finished); EXPORT_SYMBOL(ptlrpc_prep_bulk); EXPORT_SYMBOL(ptlrpc_free_bulk); @@ -204,9 +244,13 @@ EXPORT_SYMBOL(lustre_msg_size); EXPORT_SYMBOL(lustre_unpack_msg); EXPORT_SYMBOL(lustre_msg_buf); -MODULE_AUTHOR("Peter J. Braam "); +/* recover.c */ +EXPORT_SYMBOL(ptlrpc_run_recovery_upcall); +EXPORT_SYMBOL(ptlrpc_reconnect_and_replay); + +MODULE_AUTHOR("Cluster File Systems, Inc "); MODULE_DESCRIPTION("Lustre Request Processor v1.0"); -MODULE_LICENSE("GPL"); +MODULE_LICENSE("GPL"); module_init(ptlrpc_init); module_exit(ptlrpc_exit);