Whamcloud - gitweb
b=13813
authornathan <nathan>
Mon, 5 Nov 2007 19:30:38 +0000 (19:30 +0000)
committernathan <nathan>
Mon, 5 Nov 2007 19:30:38 +0000 (19:30 +0000)
i=adilger
i=johann
Need to set the AT timeout estimate after the req portal is set.

12 files changed:
lustre/include/lustre_net.h
lustre/ldlm/ldlm_request.c
lustre/mdc/mdc_reint.c
lustre/mdc/mdc_request.c
lustre/osc/osc_create.c
lustre/osc/osc_request.c
lustre/ost/ost_handler.c
lustre/ptlrpc/client.c
lustre/ptlrpc/pack_generic.c
lustre/ptlrpc/ptlrpc_module.c
lustre/ptlrpc/recov_thread.c
lustre/ptlrpc/service.c

index 5c8d859..6fbde89 100644 (file)
@@ -694,6 +694,7 @@ void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool);
 void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq);
 struct ptlrpc_request_pool *ptlrpc_init_rq_pool(int, int,
                                                 void (*populate_pool)(struct ptlrpc_request_pool *, int));
+void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req);
 struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, __u32 version,
                                        int opcode, int count, int *lengths,
                                        char **bufs);
index fa8a5cd..8b5c280 100644 (file)
@@ -918,9 +918,9 @@ int ldlm_cli_cancel_req(struct obd_export *exp,
                 req->rq_no_resend = 1;
                 req->rq_no_delay = 1;
 
-                /* XXX FIXME bug 249 */
                 req->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL;
                 req->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL;
+                ptlrpc_at_set_req_timeout(req);
 
                 body = lustre_msg_buf(req->rq_reqmsg, DLM_LOCKREQ_OFF,
                                       sizeof(*body));
index e983e85..37f3a29 100644 (file)
@@ -136,7 +136,8 @@ int mdc_setattr(struct obd_export *exp, struct mdc_op_data *op_data,
                 RETURN(-ENOMEM);
 
         if (iattr->ia_valid & ATTR_FROM_OPEN) {
-                req->rq_request_portal = MDS_SETATTR_PORTAL; //XXX FIXME bug 249
+                req->rq_request_portal = MDS_SETATTR_PORTAL;
+                ptlrpc_at_set_req_timeout(req);
                 rpc_lock = obd->u.cli.cl_setattr_lock;
         } else {
                 rpc_lock = obd->u.cli.cl_rpc_lock;
index fc3358e..7b6079d 100644 (file)
@@ -678,8 +678,8 @@ int mdc_close(struct obd_export *exp, struct obdo *oa,
         /* To avoid a livelock (bug 7034), we need to send CLOSE RPCs to a
          * portal whose threads are not taking any DLM locks and are therefore
          * always progressing */
-        /* XXX FIXME bug 249 */
         req->rq_request_portal = MDS_READPAGE_PORTAL;
+        ptlrpc_at_set_req_timeout(req);
 
         /* Ensure that this close's handle is fixed up during replay. */
         LASSERT(och != NULL);
@@ -787,8 +787,8 @@ int mdc_readpage(struct obd_export *exp, struct ll_fid *fid, __u64 offset,
         if (req == NULL)
                 GOTO(out, rc = -ENOMEM);
 
-        /* XXX FIXME bug 249 */
         req->rq_request_portal = MDS_READPAGE_PORTAL;
+        ptlrpc_at_set_req_timeout(req);
 
         desc = ptlrpc_prep_bulk_imp(req, 1, BULK_PUT_SINK, MDS_BULK_PORTAL);
         if (desc == NULL)
index 90e0019..71c8986 100644 (file)
@@ -160,7 +160,8 @@ static int oscc_internal_create(struct osc_creator *oscc)
                 RETURN(-ENOMEM);
         }
 
-        request->rq_request_portal = OST_CREATE_PORTAL; //XXX FIXME bug 249
+        request->rq_request_portal = OST_CREATE_PORTAL;
+        ptlrpc_at_set_req_timeout(request);
         body = lustre_msg_buf(request->rq_reqmsg, REQ_REC_OFF, sizeof(*body));
 
         spin_lock(&oscc->oscc_lock);
index 9e6fec3..42b877d 100644 (file)
@@ -478,6 +478,7 @@ static int osc_punch(struct obd_export *exp, struct obd_info *oinfo,
                 RETURN(-ENOMEM);
 
         req->rq_request_portal = OST_IO_PORTAL;         /* bug 7198 */
+        ptlrpc_at_set_req_timeout(req);
 
         body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body));
         memcpy(&body->oa, oinfo->oi_oa, sizeof(*oinfo->oi_oa));
@@ -610,6 +611,7 @@ static int osc_destroy(struct obd_export *exp, struct obdo *oa,
                 RETURN(-ENOMEM);
 
         req->rq_request_portal = OST_IO_PORTAL;         /* bug 7198 */
+        ptlrpc_at_set_req_timeout(req);
 
         body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body));
 
@@ -942,6 +944,7 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa,
                 RETURN (-ENOMEM);
 
         req->rq_request_portal = OST_IO_PORTAL;         /* bug 7198 */
+        ptlrpc_at_set_req_timeout(req);
 
         if (opc == OST_WRITE)
                 desc = ptlrpc_prep_bulk_imp (req, page_count,
@@ -3048,7 +3051,8 @@ static int osc_statfs_async(struct obd_device *obd, struct obd_info *oinfo,
                 RETURN(-ENOMEM);
 
         ptlrpc_req_set_repsize(req, 2, size);
-        req->rq_request_portal = OST_CREATE_PORTAL; //XXX FIXME bug 249
+        req->rq_request_portal = OST_CREATE_PORTAL;
+        ptlrpc_at_set_req_timeout(req);
 
         req->rq_interpret_reply = osc_statfs_interpret;
         CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args));
@@ -3079,7 +3083,8 @@ static int osc_statfs(struct obd_device *obd, struct obd_statfs *osfs,
                 RETURN(-ENOMEM);
 
         ptlrpc_req_set_repsize(req, 2, size);
-        req->rq_request_portal = OST_CREATE_PORTAL; //XXX FIXME bug 249
+        req->rq_request_portal = OST_CREATE_PORTAL;
+        ptlrpc_at_set_req_timeout(req);
 
         rc = ptlrpc_queue_wait(req);
         if (rc)
index 056ed0d..69e12f9 100644 (file)
@@ -812,6 +812,7 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
                 else
                         rc = ptlrpc_start_bulk_transfer(desc);
                 if (rc == 0) {
+                        time_t start = cfs_time_current_sec();
                         do {
                                 long timeoutl = req->rq_deadline - 
                                         cfs_time_current_sec();
@@ -830,7 +831,11 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti)
                                  (req->rq_deadline > cfs_time_current_sec()));
 
                         if (rc == -ETIMEDOUT) {
-                                DEBUG_REQ(D_ERROR, req, "timeout on bulk PUT");
+                                DEBUG_REQ(D_ERROR, req,
+                                          "timeout on bulk PUT after %ld%+lds",
+                                          req->rq_deadline - start,
+                                          cfs_time_current_sec() -
+                                          req->rq_deadline);
                                 ptlrpc_abort_bulk(desc);
                         } else if (exp->exp_failed) {
                                 DEBUG_REQ(D_ERROR, req, "Eviction on bulk PUT");
@@ -1063,6 +1068,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
         else
                 rc = ptlrpc_start_bulk_transfer (desc);
         if (rc == 0) {
+                time_t start = cfs_time_current_sec();
                 do {
                         long timeoutl = req->rq_deadline - 
                                 cfs_time_current_sec();
@@ -1079,7 +1085,11 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti)
                          (req->rq_deadline > cfs_time_current_sec()));
 
                 if (rc == -ETIMEDOUT) {
-                        DEBUG_REQ(D_ERROR, req, "timeout on bulk GET");
+                        DEBUG_REQ(D_ERROR, req,
+                                  "timeout on bulk GET after %ld%+lds",
+                                  req->rq_deadline - start,
+                                  cfs_time_current_sec() -
+                                  req->rq_deadline);
                         ptlrpc_abort_bulk(desc);
                 } else if (desc->bd_export->exp_failed) {
                         DEBUG_REQ(D_ERROR, req, "Eviction on bulk GET");
index 2d30542..6fdc64b 100644 (file)
@@ -190,7 +190,7 @@ void ptlrpc_free_bulk(struct ptlrpc_bulk_desc *desc)
 }
 
 /* Set server timelimit for this req */
-static void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req) 
+void ptlrpc_at_set_req_timeout(struct ptlrpc_request *req)
 {
         __u32 serv_est;
         int idx;
@@ -532,7 +532,6 @@ ptlrpc_prep_req_pool(struct obd_import *imp, __u32 version, int opcode,
 
         request->rq_phase = RQ_PHASE_NEW;
 
-        /* XXX FIXME bug 249 */
         request->rq_request_portal = imp->imp_client->cli_request_portal;
         request->rq_reply_portal = imp->imp_client->cli_reply_portal;
         
index b7ce82b..dc31d08 100644 (file)
@@ -2397,9 +2397,9 @@ void _debug_req(struct ptlrpc_request *req, __u32 mask,
         va_start(args, fmt);
         libcfs_debug_vmsg2(data->msg_cdls, data->msg_subsys, mask, data->msg_file,
                            data->msg_fn, data->msg_line, fmt, args,
-                           " req@%p x"LPD64"/t"LPD64" o%d->%s@%s:%d lens %d/%d "
-                           "e %d to %d dl %ld ref %d fl "REQ_FLAGS_FMT"/%x/%x "
-                           "rc %d/%d\n",
+                           " req@%p x"LPD64"/t"LPD64" o%d->%s@%s:%d/%d "
+                           "lens %d/%d e %d to %d dl %ld ref %d "
+                           "fl "REQ_FLAGS_FMT"/%x/%x rc %d/%d\n",
                            req, req->rq_xid, req->rq_transno,
                            req->rq_reqmsg ? lustre_msg_get_opc(req->rq_reqmsg) : -1,
                            req->rq_import ? obd2cli_tgt(req->rq_import->imp_obd) :
@@ -2409,8 +2409,7 @@ void _debug_req(struct ptlrpc_request *req, __u32 mask,
                                 (char *)req->rq_import->imp_connection->c_remote_uuid.uuid :
                            req->rq_export ?
                                 (char *)req->rq_export->exp_connection->c_remote_uuid.uuid : "<?>",
-                           (req->rq_import && req->rq_import->imp_client) ?
-                                req->rq_import->imp_client->cli_request_portal : -1,
+                           req->rq_request_portal,  req->rq_reply_portal,
                            req->rq_reqlen, req->rq_replen,
                            req->rq_early_count, req->rq_timeout, req->rq_deadline,
                            atomic_read(&req->rq_refcount), DEBUG_REQ_FLAGS(req),
index 4a46cb7..56e7228 100644 (file)
@@ -145,6 +145,7 @@ EXPORT_SYMBOL(ptlrpc_add_rqs_to_pool);
 EXPORT_SYMBOL(ptlrpc_init_rq_pool);
 EXPORT_SYMBOL(ptlrpc_free_rq_pool);
 EXPORT_SYMBOL(ptlrpc_prep_req_pool);
+EXPORT_SYMBOL(ptlrpc_at_set_req_timeout);
 EXPORT_SYMBOL(ptlrpc_prep_req);
 EXPORT_SYMBOL(ptlrpc_free_req);
 EXPORT_SYMBOL(ptlrpc_unregister_reply);
index ea915c6..9ec7074 100644 (file)
@@ -370,9 +370,10 @@ static int log_commit_thread(void *arg)
                                 break;
                         }
 
-                        /* XXX FIXME bug 249, 5515 */
+                        /* bug 5515 */
                         request->rq_request_portal = LDLM_CANCEL_REQUEST_PORTAL;
                         request->rq_reply_portal = LDLM_CANCEL_REPLY_PORTAL;
+                        ptlrpc_at_set_req_timeout(request);
 
                         ptlrpc_req_set_repsize(request, 1, NULL);
                         mutex_down(&llcd->llcd_ctxt->loc_sem);
index e05bdf6..287076e 100644 (file)
@@ -49,7 +49,7 @@ unsigned int at_history = 600;
 CFS_MODULE_PARM(at_history, "i", int, 0644,
                 "Adaptive timeouts remember the slowest event that took place "
                 "within this period (sec)");
-static int at_early_margin = 3;
+static int at_early_margin = 5;
 CFS_MODULE_PARM(at_early_margin, "i", int, 0644,
                 "How soon before an RPC deadline to send an early reply");
 static int at_extra = 30;
@@ -662,8 +662,9 @@ static int ptlrpc_at_send_early_reply(struct ptlrpc_request *req,
                 RETURN(0);
         
         if (olddl < 0) {
-                CDEBUG(D_ADAPTTO, "x"LPU64": Already past deadline (%+lds), not"
-                       " sending early reply\n", req->rq_xid, olddl);
+                CDEBUG(D_WARNING, "x"LPU64": Already past deadline (%+lds), not"
+                       " sending early reply. Increase at_early_margin (%d)?\n",
+                       req->rq_xid, olddl, at_early_margin);
                 /* Return an error so we're not re-added to the timed list. */
                 RETURN(-ETIMEDOUT);
         }
@@ -996,11 +997,12 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc,
         /* Discard requests queued for longer than the deadline.  
            The deadline is increased if we send an early reply. */
         if (cfs_time_current_sec() > request->rq_deadline) {
-                CERROR("Dropping timed-out opc %d request from %s"
-                       ": deadline %lds ago\n",
-                       lustre_msg_get_opc(request->rq_reqmsg),
-                       libcfs_id2str(request->rq_peer),
-                       cfs_time_current_sec() - request->rq_deadline);
+                DEBUG_REQ(D_ERROR, request, "Dropping timed-out request from %s"
+                          ": deadline %ld%+lds ago\n",
+                          libcfs_id2str(request->rq_peer),
+                          request->rq_deadline -
+                          request->rq_arrival_time.tv_sec,
+                          cfs_time_current_sec() - request->rq_deadline);
                 goto put_rpc_export;
         }
 
@@ -1042,8 +1044,9 @@ put_conn:
 
         if (cfs_time_current_sec() > request->rq_deadline) {
                 DEBUG_REQ(D_WARNING, request, "Request x"LPU64" took longer "
-                          "than estimated (%+lds); client may timeout.",
-                          request->rq_xid,
+                          "than estimated (%ld%+lds); client may timeout.",
+                          request->rq_xid, request->rq_deadline -
+                          request->rq_arrival_time.tv_sec,
                           cfs_time_current_sec() - request->rq_deadline);
         }