level script context and will exit with the associated exit
status, but doesn't ensure that this exit status is non-zero.
+Severity : minor
+Frequency : rare
+Bugzilla : 9493
+Description: failure of ptlrpc thread startup can cause oops
+Details : Starting a ptlrpc service thread can fail if there are a large
+ number of threads or the server memory is very fragmented.
+ Handle this without oopsing.
+
------------------------------------------------------------------------------
08-26-2005 Cluster File Systems, Inc. <info@clusterfs.com>
RETURN(0);
}
+/* this sends any unsent RPCs in @set and returns TRUE if all are sent */
int ptlrpc_check_set(struct ptlrpc_request_set *set)
{
unsigned long flags;
imp->imp_conn_cnt++;
imp->imp_resend_replay = 0;
- if (!lustre_handle_is_used(&imp->imp_remote_handle)) {
+ if (!lustre_handle_is_used(&imp->imp_remote_handle))
initial_connect = 1;
- } else {
+ else
committed_before_reconnect = imp->imp_peer_committed_transno;
- }
spin_unlock_irqrestore(&imp->imp_lock, flags);
#else
char *action = "recompiling this application";
#endif
-
+
CWARN("Server %s version (%d.%d.%d.%d) is much newer. "
"Consider %s (%s).\n",
imp->imp_target_uuid.uuid,
out:
if (rc != 0) {
-
IMPORT_SET_STATE(imp, LUSTRE_IMP_DISCON);
- if (aa->pcaa_initial_connect && !imp->imp_initial_recov) {
+ if (aa->pcaa_initial_connect && !imp->imp_initial_recov)
ptlrpc_deactivate_import(imp);
- }
if (rc == -EPROTO) {
struct obd_connect_data *ocd;
ocd = lustre_swab_repbuf(request, 0,
sizeof *ocd,
lustre_swab_connect);
- if (ocd &&
- (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
+ if (ocd &&
+ (ocd->ocd_connect_flags & OBD_CONNECT_VERSION) &&
(ocd->ocd_version != LUSTRE_VERSION_CODE)) {
/* Actually servers are only supposed to refuse
connection from liblustre clients, so we should
never see this from VFS context */
- CERROR("Server %s version (%d.%d.%d.%d) refused"
- " connection from this client as too old "
- "version (%s). Client must be "
- "recompiled\n",
+ CERROR("Server %s version (%d.%d.%d.%d) "
+ "refused connection from this client "
+ "as too old version (%s). Client must "
+ "be recompiled\n",
imp->imp_target_uuid.uuid,
OBD_OCD_VERSION_MAJOR(ocd->ocd_version),
OBD_OCD_VERSION_MINOR(ocd->ocd_version),
OBD_OCD_VERSION_FIX(ocd->ocd_version),
LUSTRE_VERSION_STRING);
IMPORT_SET_STATE(imp, LUSTRE_IMP_CLOSED);
- RETURN(-EPROTO);
}
+ RETURN(-EPROTO);
}
-
+
ptlrpc_maybe_ping_import_soon(imp);
CDEBUG(D_HA, "recovery of %s on %s failed (%d)\n",
/* And now, loop forever, pinging as needed. */
while (1) {
unsigned long this_ping = jiffies;
- long time_to_next_ping;
+ long time_to_next_ping = 0;
struct l_wait_info lwi;
struct list_head *iter;
/* The ping sent by ptlrpc_send_rpc may get sent out
say .01 second after this.
- ptlrpc_pinger_sending_on_import will then set the
+ ptlrpc_pinger_sending_on_import will then set the
next ping time to next_ping + .01 sec, which means
we will SKIP the next ping at next_ping, and the
ping will get sent 2 timeouts from now! Beware. */
if (rc < 0) {
CERROR("cannot start thread: %d\n", rc);
OBD_FREE(pinger_thread, sizeof(*pinger_thread));
+ pinger_thread = NULL;
RETURN(rc);
}
l_wait_event(pinger_thread->t_ctl_waitq,
pinger_thread->t_flags & SVC_RUNNING, &lwi);
- RETURN(rc);
+ RETURN(0);
}
int ptlrpc_stop_pinger(void)
__init int ptlrpc_init(void)
{
- int rc;
+ int rc, cleanup_phase = 0;
ENTRY;
lustre_assert_wire_constants();
rc = ptlrpc_init_portals();
if (rc)
RETURN(rc);
+ cleanup_phase = 1;
ptlrpc_init_connection();
- llog_init_commit_master();
+ rc = llog_init_commit_master();
+ if (rc)
+ GOTO(cleanup, rc);
+ cleanup_phase = 2;
ptlrpc_put_connection_superhack = ptlrpc_put_connection;
ptlrpc_abort_inflight_superhack = ptlrpc_abort_inflight;
- ptlrpc_start_pinger();
- ldlm_init();
+ rc = ptlrpc_start_pinger();
+ if (rc)
+ GOTO(cleanup, rc);
+ cleanup_phase = 3;
+
+ rc = ldlm_init();
+ if (rc)
+ GOTO(cleanup, rc);
RETURN(0);
+
+cleanup:
+ switch(cleanup_phase) {
+ case 3:
+ ptlrpc_stop_pinger();
+ case 2:
+ llog_cleanup_commit_master(1);
+ ptlrpc_cleanup_connection();
+ case 1:
+ ptlrpc_exit_portals();
+ default: ;
+ }
+
+ return rc;
}
#ifdef __KERNEL__
wake_up(&pc->pc_waitq);
}
+/* requests that are added to the ptlrpcd queue are sent via
+ * ptlrpcd_check->ptlrpc_check_set() */
void ptlrpcd_add_req(struct ptlrpc_request *req)
{
struct ptlrpcd_ctl *pc;
static int ptlrpcd_start(char *name, struct ptlrpcd_ctl *pc)
{
- int rc = 0;
+ int rc;
memset(pc, 0, sizeof(*pc));
init_completion(&pc->pc_starting);
pc->pc_set = ptlrpc_prep_set();
if (pc->pc_set == NULL)
- GOTO(out, rc = -ENOMEM);
+ RETURN(-ENOMEM);
#ifdef __KERNEL__
- if (kernel_thread(ptlrpcd, pc, 0) < 0) {
+ rc = kernel_thread(ptlrpcd, pc, 0);
+ if (rc < 0) {
ptlrpc_set_destroy(pc->pc_set);
- GOTO(out, rc = -ECHILD);
+ RETURN(rc);
}
wait_for_completion(&pc->pc_starting);
#else
pc->pc_callback =
liblustre_register_wait_callback(&ptlrpcd_check_async_rpcs, pc);
+ (void)rc;
#endif
-out:
- RETURN(rc);
+ RETURN(0);
}
static void ptlrpcd_stop(struct ptlrpcd_ctl *pc)