From c658f3d1300ceddfba3c2afb2a19898f07c3d2da Mon Sep 17 00:00:00 2001 From: adilger Date: Wed, 1 Jun 2005 08:59:53 +0000 Subject: [PATCH] Branch b1_4 Quiet spurious console error messages when an OST is deactivated on the MDS. Don't NULL-deref pd->pd_set if allocation failed. We will just ping later. b=6346 r=nathan --- lustre/ptlrpc/client.c | 85 +++++++++++++++++++++++--------------------------- lustre/ptlrpc/pinger.c | 16 +++++----- 2 files changed, 48 insertions(+), 53 deletions(-) diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 3e13378..16b6b04 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -97,7 +97,7 @@ static inline struct ptlrpc_bulk_desc *new_bulk(int npages, int type, int portal desc->bd_md_h = PTL_INVALID_HANDLE; desc->bd_portal = portal; desc->bd_type = type; - + return desc; } @@ -176,7 +176,7 @@ void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc) else class_import_put(desc->bd_import); - OBD_FREE(desc, offsetof(struct ptlrpc_bulk_desc, + OBD_FREE(desc, offsetof(struct ptlrpc_bulk_desc, bd_iov[desc->bd_max_iov])); EXIT; } @@ -214,13 +214,13 @@ struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode, request->rq_type = PTL_RPC_MSG_REQUEST; request->rq_import = class_import_get(imp); request->rq_export = NULL; - + request->rq_req_cbid.cbid_fn = request_out_callback; request->rq_req_cbid.cbid_arg = request; request->rq_reply_cbid.cbid_fn = reply_in_callback; request->rq_reply_cbid.cbid_arg = request; - + request->rq_phase = RQ_PHASE_NEW; /* XXX FIXME bug 249 */ @@ -345,7 +345,7 @@ void ptlrpc_set_add_new_req(struct ptlrpc_request_set *set, * * The imp->imp_lock must be held. */ -static int ptlrpc_import_delay_req(struct obd_import *imp, +static int ptlrpc_import_delay_req(struct obd_import *imp, struct ptlrpc_request *req, int *status) { int delay = 0; @@ -358,31 +358,24 @@ static int ptlrpc_import_delay_req(struct obd_import *imp, DEBUG_REQ(D_ERROR, req, "Uninitialized import."); *status = -EIO; LBUG(); - } - else if (imp->imp_state == LUSTRE_IMP_CLOSED) { + } else if (imp->imp_state == LUSTRE_IMP_CLOSED) { DEBUG_REQ(D_ERROR, req, "IMP_CLOSED "); *status = -EIO; - } - /* allow CONNECT even if import is invalid */ - else if (req->rq_send_state == LUSTRE_IMP_CONNECTING && + } else if (req->rq_send_state == LUSTRE_IMP_CONNECTING && imp->imp_state == LUSTRE_IMP_CONNECTING) { - ; - } - /* - * If the import has been invalidated (such as by an OST failure), the - * request must fail with -EIO. - */ - else if (imp->imp_invalid) { - DEBUG_REQ(D_ERROR, req, "IMP_INVALID"); + /* allow CONNECT even if import is invalid */ ; + } else if (imp->imp_invalid) { + /* If the import has been invalidated (such as by an OST + * failure), the request must fail with -EIO. */ + if (!imp->imp_deactive) + DEBUG_REQ(D_ERROR, req, "IMP_INVALID"); *status = -EIO; - } - else if (req->rq_import_generation != imp->imp_generation) { + } else if (req->rq_import_generation != imp->imp_generation) { DEBUG_REQ(D_ERROR, req, "req wrong generation:"); *status = -EIO; - } - else if (req->rq_send_state != imp->imp_state) { - if (imp->imp_obd->obd_no_recov || imp->imp_dlm_fake - || req->rq_no_delay) + } else if (req->rq_send_state != imp->imp_state) { + if (imp->imp_obd->obd_no_recov || imp->imp_dlm_fake || + req->rq_no_delay) *status = -EWOULDBLOCK; else delay = 1; @@ -404,10 +397,10 @@ static int ptlrpc_check_reply(struct ptlrpc_request *req) DEBUG_REQ(D_NET, req, "REPLIED:"); GOTO(out, rc = 1); } - + if (req->rq_net_err && !req->rq_timedout) { spin_unlock_irqrestore (&req->rq_lock, flags); - rc = ptlrpc_expire_one_request(req); + rc = ptlrpc_expire_one_request(req); spin_lock_irqsave (&req->rq_lock, flags); GOTO(out, rc); } @@ -440,7 +433,7 @@ static int ptlrpc_check_status(struct ptlrpc_request *req) err = req->rq_repmsg->status; if (req->rq_repmsg->type == PTL_RPC_MSG_ERR) { - DEBUG_REQ(D_ERROR, req, "type == PTL_RPC_MSG_ERR, err == %d", + DEBUG_REQ(D_ERROR, req, "type == PTL_RPC_MSG_ERR, err == %d", err); RETURN(err < 0 ? err : -EINVAL); } @@ -474,14 +467,14 @@ static int after_reply(struct ptlrpc_request *req) LASSERT (req->rq_nob_received <= req->rq_replen); rc = lustre_unpack_msg(req->rq_repmsg, req->rq_nob_received); if (rc) { - CERROR("unpack_rep failed: %d\n", rc); + DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d\n", rc); RETURN(-EPROTO); } if (req->rq_repmsg->type != PTL_RPC_MSG_REPLY && req->rq_repmsg->type != PTL_RPC_MSG_ERR) { - CERROR("invalid packet type received (type=%u)\n", - req->rq_repmsg->type); + DEBUG_REQ(D_ERROR, req, "invalid packet received (type=%u)\n", + req->rq_repmsg->type); RETURN(-EPROTO); } @@ -548,7 +541,7 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req) DEBUG_REQ(D_HA, req, "req from PID %d waiting for recovery: " "(%s != %s)", - req->rq_reqmsg->status, + req->rq_reqmsg->status, ptlrpc_import_state_name(req->rq_send_state), ptlrpc_import_state_name(imp->imp_state)); LASSERT(list_empty (&req->rq_list)); @@ -625,7 +618,7 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set) GOTO(interpret, req->rq_status); if (req->rq_net_err && !req->rq_timedout) - ptlrpc_expire_one_request(req); + ptlrpc_expire_one_request(req); if (req->rq_err) { ptlrpc_unregister_reply(req); @@ -643,7 +636,7 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set) /* ptlrpc_queue_wait->l_wait_event guarantees that rq_intr * will only be set after rq_timedout, but the oig waiting * path sets rq_intr irrespective of whether ptlrpcd has - * seen a timeout. our policy is to only interpret + * seen a timeout. our policy is to only interpret * interrupted rpcs after they have timed out */ if (req->rq_intr && (req->rq_timedout || req->rq_waiting)) { /* NB could be on delayed list */ @@ -739,7 +732,7 @@ int ptlrpc_check_set(struct ptlrpc_request_set *set) it can be errored if the import is evicted after recovery. */ spin_lock_irqsave (&req->rq_lock, flags); - list_add_tail(&req->rq_list, + list_add_tail(&req->rq_list, &imp->imp_delayed_list); spin_unlock_irqrestore(&req->rq_lock, flags); continue; @@ -869,8 +862,8 @@ int ptlrpc_expired_set(void *data) list_entry(tmp, struct ptlrpc_request, rq_set_chain); /* request in-flight? */ - if (!((req->rq_phase == RQ_PHASE_RPC && !req->rq_waiting - && !req->rq_resend) || + if (!((req->rq_phase == RQ_PHASE_RPC && !req->rq_waiting && + !req->rq_resend) || (req->rq_phase == RQ_PHASE_BULK))) continue; @@ -1354,8 +1347,8 @@ restart: spin_unlock_irqrestore(&imp->imp_lock, flags); DEBUG_REQ(D_HA, req, "\"%s\" waiting for recovery: (%s != %s)", - current->comm, - ptlrpc_import_state_name(req->rq_send_state), + current->comm, + ptlrpc_import_state_name(req->rq_send_state), ptlrpc_import_state_name(imp->imp_state)); lwi = LWI_INTR(interrupted_request, req); rc = l_wait_event(req->rq_reply_waitq, @@ -1363,8 +1356,8 @@ restart: req->rq_err), &lwi); DEBUG_REQ(D_HA, req, "\"%s\" awake: (%s == %s or %d == 1)", - current->comm, - ptlrpc_import_state_name(imp->imp_state), + current->comm, + ptlrpc_import_state_name(imp->imp_state), ptlrpc_import_state_name(req->rq_send_state), req->rq_err); @@ -1487,7 +1480,7 @@ restart: out: if (req->rq_bulk != NULL) { - if (rc >= 0) { + if (rc >= 0) { /* success so far. Note that anything going wrong * with bulk now, is EXTREMELY strange, since the * server must have believed that the bulk @@ -1532,7 +1525,7 @@ static int ptlrpc_replay_interpret(struct ptlrpc_request *req, unsigned long flags; atomic_dec(&imp->imp_replay_inflight); - + if (!req->rq_replied) { CERROR("request replay timed out, restarting recovery\n"); GOTO(out, rc = -ETIMEDOUT); @@ -1545,12 +1538,12 @@ static int ptlrpc_replay_interpret(struct ptlrpc_request *req, LASSERT (req->rq_nob_received <= req->rq_replen); rc = lustre_unpack_msg(req->rq_repmsg, req->rq_nob_received); if (rc) { - CERROR("unpack_rep failed: %d\n", rc); + DEBUG_REQ(D_ERROR, req, "unpack_rep failed: %d\n", rc); GOTO(out, rc = -EPROTO); } - if (req->rq_repmsg->type == PTL_RPC_MSG_ERR && - req->rq_repmsg->status == -ENOTCONN) + if (req->rq_repmsg->type == PTL_RPC_MSG_ERR && + req->rq_repmsg->status == -ENOTCONN) GOTO(out, rc = req->rq_repmsg->status); /* The transno had better not change over replay. */ @@ -1578,7 +1571,7 @@ static int ptlrpc_replay_interpret(struct ptlrpc_request *req, rc = ptlrpc_import_recovery_state_machine(imp); out: req->rq_send_state = aa->praa_old_state; - + if (rc != 0) /* this replay failed, so restart recovery */ ptlrpc_connect_import(imp, NULL); diff --git a/lustre/ptlrpc/pinger.c b/lustre/ptlrpc/pinger.c index ee65bd0..8b75dcf 100644 --- a/lustre/ptlrpc/pinger.c +++ b/lustre/ptlrpc/pinger.c @@ -38,7 +38,7 @@ static DECLARE_MUTEX(pinger_sem); static struct list_head pinger_imports = LIST_HEAD_INIT(pinger_imports); -int ptlrpc_ping(struct obd_import *imp) +int ptlrpc_ping(struct obd_import *imp) { struct ptlrpc_request *req; int rc = 0; @@ -51,13 +51,12 @@ int ptlrpc_ping(struct obd_import *imp) imp->imp_obd->obd_uuid.uuid, imp->imp_target_uuid.uuid); req->rq_no_resend = req->rq_no_delay = 1; - req->rq_replen = lustre_msg_size(0, - NULL); + req->rq_replen = lustre_msg_size(0, NULL); ptlrpcd_add_req(req); } else { CERROR("OOM trying to ping %s->%s\n", - imp->imp_obd->obd_uuid.uuid, - imp->imp_target_uuid.uuid); + imp->imp_obd->obd_uuid.uuid, + imp->imp_target_uuid.uuid); rc = -ENOMEM; } @@ -140,7 +139,8 @@ static int ptlrpc_pinger_main(void *arg) obd_timeout * HZ; ptlrpc_initiate_recovery(imp); } else if (level != LUSTRE_IMP_FULL || - imp->imp_obd->obd_no_recov) { + imp->imp_obd->obd_no_recov || + imp->imp_deactive) { CDEBUG(D_HA, "not pinging %s " "(in recovery: %s or recovery " "disabled: %u/%u)\n", @@ -151,7 +151,6 @@ static int ptlrpc_pinger_main(void *arg) } else if (imp->imp_pingable || force) { ptlrpc_ping(imp); } - } else { if (!imp->imp_pingable) continue; @@ -356,6 +355,8 @@ static int pinger_check_rpcs(void *arg) pd->pd_this_ping = curtime; pd->pd_set = ptlrpc_prep_set(); + if (pd->pd_set == NULL) + goto out; set = pd->pd_set; /* add rpcs into set */ @@ -450,6 +451,7 @@ do_check_set: ptlrpc_set_destroy(set); pd->pd_set = NULL; +out: pd->pd_next_ping = pd->pd_this_ping + PING_INTERVAL * HZ; pd->pd_this_ping = 0; /* XXX for debug */ -- 1.8.3.1