return NULL;
}
- c = ptlrpc_get_connection(peer, self, uuid);
+ c = ptlrpc_connection_get(peer, self, uuid);
if (c) {
memcpy(c->c_remote_uuid.uuid,
uuid->uuid, sizeof(c->c_remote_uuid.uuid));
return c;
}
-void ptlrpc_readdress_connection(struct ptlrpc_connection *conn,
- struct obd_uuid *uuid)
-{
- lnet_nid_t self;
- lnet_process_id_t peer;
- int err;
-
- err = ptlrpc_uuid_to_peer(uuid, &peer, &self);
- if (err != 0) {
- CERROR("cannot find peer %s!\n", uuid->uuid);
- return;
- }
-
- conn->c_peer = peer;
- conn->c_self = self;
- return;
-}
-
static inline struct ptlrpc_bulk_desc *new_bulk(int npages, int type, int portal)
{
struct ptlrpc_bulk_desc *desc;
cfs_waitq_init(&desc->bd_waitq);
desc->bd_max_iov = npages;
desc->bd_iov_count = 0;
- desc->bd_md_h = LNET_INVALID_HANDLE;
+ LNetInvalidateHandle(&desc->bd_md_h);
desc->bd_portal = portal;
desc->bd_type = type;
idx = import_at_get_index(req->rq_import,
req->rq_request_portal);
serv_est = at_get(&at->iat_service_estimate[idx]);
- /* add an arbitrary minimum: 125% +5 sec */
- req->rq_timeout = serv_est + (serv_est >> 2) + 5;
+ req->rq_timeout = at_est2timeout(serv_est);
/* We could get even fancier here, using history to predict increased
loading... */
}
/* Adjust max service estimate based on server value */
-static void ptlrpc_at_adj_service(struct ptlrpc_request *req)
+static void ptlrpc_at_adj_service(struct ptlrpc_request *req,
+ unsigned int serv_est)
{
int idx;
- unsigned int serv_est, oldse;
- struct imp_at *at = &req->rq_import->imp_at;
+ unsigned int oldse;
+ struct imp_at *at;
- LASSERT(req->rq_import);
+ /* only update the estimate if the import is not in recovery */
+ if (!(req->rq_send_state & (LUSTRE_IMP_FULL | LUSTRE_IMP_CONNECTING)))
+ return;
- /* service estimate is returned in the repmsg timeout field,
- may be 0 on err */
- serv_est = lustre_msg_get_timeout(req->rq_repmsg);
+ LASSERT(req->rq_import);
+ at = &req->rq_import->imp_at;
idx = import_at_get_index(req->rq_import, req->rq_request_portal);
/* max service estimates are tracked on the server side,
}
/* Adjust expected network latency */
-static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req)
+static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req,
+ unsigned int service_time)
{
- unsigned int st, nl, oldnl;
- struct imp_at *at = &req->rq_import->imp_at;
+ unsigned int nl, oldnl;
+ struct imp_at *at;
time_t now = cfs_time_current_sec();
LASSERT(req->rq_import);
-
- st = lustre_msg_get_service_time(req->rq_repmsg);
+ at = &req->rq_import->imp_at;
/* Network latency is total time less server processing time */
- nl = max_t(int, now - req->rq_sent - st, 0) + 1/*st rounding*/;
- if (st > now - req->rq_sent + 2 /* rounding */)
- CERROR("Reported service time %u > total measured time "
- CFS_DURATION_T"\n",
- st, cfs_time_sub(now, req->rq_sent));
+ nl = max_t(int, now - req->rq_sent - service_time, 0) +1/*st rounding*/;
+ if (service_time > now - req->rq_sent + 3 /* bz16408 */)
+ CWARN("Reported service time %u > total measured time "
+ CFS_DURATION_T"\n", service_time,
+ cfs_time_sub(now, req->rq_sent));
oldnl = at_add(&at->iat_net_latency, nl);
if (oldnl != 0)
* Handle an early reply message, called with the rq_lock held.
* If anything goes wrong just ignore it - same as if it never happened
*/
-static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) {
- time_t olddl;
- int rc;
+static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req)
+{
+ struct ptlrpc_request *early_req;
+ time_t olddl;
+ int rc;
ENTRY;
req->rq_early = 0;
spin_unlock(&req->rq_lock);
- rc = sptlrpc_cli_unwrap_early_reply(req);
- if (rc)
- GOTO(out, rc);
-
- rc = unpack_reply(req);
- if (rc)
- GOTO(out_cleanup, rc);
+ rc = sptlrpc_cli_unwrap_early_reply(req, &early_req);
+ if (rc) {
+ spin_lock(&req->rq_lock);
+ RETURN(rc);
+ }
- /* Expecting to increase the service time estimate here */
- ptlrpc_at_adj_service(req);
- ptlrpc_at_adj_net_latency(req);
+ rc = unpack_reply(early_req);
+ if (rc == 0) {
+ /* Expecting to increase the service time estimate here */
+ ptlrpc_at_adj_service(req,
+ lustre_msg_get_timeout(early_req->rq_repmsg));
+ ptlrpc_at_adj_net_latency(req,
+ lustre_msg_get_service_time(early_req->rq_repmsg));
+ }
- /* Adjust the local timeout for this req */
- ptlrpc_at_set_req_timeout(req);
+ sptlrpc_cli_finish_early_reply(early_req);
- olddl = req->rq_deadline;
- /* server assumes it now has rq_timeout from when it sent the
- early reply, so client should give it at least that long. */
- req->rq_deadline = cfs_time_current_sec() + req->rq_timeout +
- ptlrpc_at_get_net_latency(req);
-
- DEBUG_REQ(D_ADAPTTO, req,
- "Early reply #%d, new deadline in "CFS_DURATION_T"s ("
- CFS_DURATION_T"s)", req->rq_early_count,
- cfs_time_sub(req->rq_deadline, cfs_time_current_sec()),
- cfs_time_sub(req->rq_deadline, olddl));
-
-out_cleanup:
- sptlrpc_cli_finish_early_reply(req);
-out:
spin_lock(&req->rq_lock);
+
+ if (rc == 0) {
+ /* Adjust the local timeout for this req */
+ ptlrpc_at_set_req_timeout(req);
+
+ olddl = req->rq_deadline;
+ /* server assumes it now has rq_timeout from when it sent the
+ early reply, so client should give it at least that long. */
+ req->rq_deadline = cfs_time_current_sec() + req->rq_timeout +
+ ptlrpc_at_get_net_latency(req);
+
+ DEBUG_REQ(D_ADAPTTO, req,
+ "Early reply #%d, new deadline in "CFS_DURATION_T"s "
+ "("CFS_DURATION_T"s)", req->rq_early_count,
+ cfs_time_sub(req->rq_deadline,
+ cfs_time_current_sec()),
+ cfs_time_sub(req->rq_deadline, olddl));
+ }
+
RETURN(rc);
}
OBD_ALLOC_PTR(request);
if (request) {
- LASSERT((unsigned long)imp > 0x1000);
+ LASSERTF((unsigned long)imp > 0x1000, "%p", imp);
LASSERT(imp != LP_POISON);
- LASSERT((unsigned long)imp->imp_client > 0x1000);
+ LASSERTF((unsigned long)imp->imp_client > 0x1000, "%p",
+ imp->imp_client);
LASSERT(imp->imp_client != LP_POISON);
request->rq_import = class_import_get(imp);
if (req->rq_phase == RQ_PHASE_NEW) {
if (req->rq_interpret_reply != NULL) {
- int (*interpreter)(struct ptlrpc_request *,
- void *, int) =
+ ptlrpc_interpterer_t interpreter =
req->rq_interpret_reply;
/* higher level (i.e. LOV) failed;
* let the sub reqs clean up */
req->rq_status = -EBADR;
- interpreter(req, &req->rq_async_args,
+ interpreter(NULL, req, &req->rq_async_args,
req->rq_status);
}
set->set_remaining--;
}
OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, obd_fail_val);
- ptlrpc_at_adj_service(req);
- ptlrpc_at_adj_net_latency(req);
+ ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg));
+ ptlrpc_at_adj_net_latency(req,
+ lustre_msg_get_service_time(req->rq_repmsg));
rc = ptlrpc_check_status(req);
imp->imp_connect_error = rc;
req->rq_status = rc;
RETURN(1);
} else {
- /* here begins timeout counting */
- req->rq_sent = cfs_time_current_sec();
req->rq_wait_ctx = 1;
RETURN(0);
}
}
/* this sends any unsent RPCs in @set and returns TRUE if all are sent */
-int ptlrpc_check_set(struct ptlrpc_request_set *set)
+int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
{
struct list_head *tmp;
int force_timer_recalc = 0;
req->rq_waiting || req->rq_wait_ctx) {
int status;
- /* rq_wait_ctx is only touched in ptlrpcd,
- * no lock needed here.
- */
- if (req->rq_wait_ctx)
- goto check_ctx;
-
ptlrpc_unregister_reply(req);
spin_lock(&imp->imp_lock);
spin_unlock(&imp->imp_lock);
req->rq_waiting = 0;
- if (req->rq_resend) {
+
+ if (req->rq_timedout||req->rq_resend) {
+ /* This request is being re-sent anyway,
+ * so mark it as a resend. */
+ req->rq_resend = 1;
lustre_msg_add_flags(req->rq_reqmsg,
MSG_RESENT);
if (req->rq_bulk) {
old_xid, req->rq_xid);
}
}
-check_ctx:
+ /*
+ * rq_wait_ctx is only touched by ptlrpcd,
+ * so no lock is needed here.
+ */
status = sptlrpc_req_refresh_ctx(req, -1);
if (status) {
if (req->rq_err) {
req->rq_status = status;
force_timer_recalc = 1;
- }
- if (!req->rq_wait_ctx) {
- /* begins timeout counting */
- req->rq_sent = cfs_time_current_sec();
+ } else {
req->rq_wait_ctx = 1;
}
+
continue;
} else {
- req->rq_sent = 0;
req->rq_wait_ctx = 0;
}
ptlrpc_unregister_bulk (req);
if (req->rq_interpret_reply != NULL) {
- int (*interpreter)(struct ptlrpc_request *,void *,int) =
+ ptlrpc_interpterer_t interpreter =
req->rq_interpret_reply;
- req->rq_status = interpreter(req, &req->rq_async_args,
+ req->rq_status = interpreter(NULL, req,
+ &req->rq_async_args,
req->rq_status);
}
req->rq_phase = RQ_PHASE_COMPLETE;
spin_lock(&req->rq_lock);
req->rq_timedout = 1;
- req->rq_wait_ctx = 0;
spin_unlock(&req->rq_lock);
ptlrpc_unregister_reply (req);
if (req->rq_timedout) /* already timed out */
continue;
+ if (req->rq_wait_ctx) /* waiting for ctx */
+ continue;
+
if (req->rq_phase == RQ_PHASE_NEW)
deadline = req->rq_sent;
else
lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout ? timeout : 1),
ptlrpc_expired_set,
ptlrpc_interrupted_set, set);
- rc = l_wait_event(set->set_waitq, ptlrpc_check_set(set), &lwi);
+ rc = l_wait_event(set->set_waitq,
+ ptlrpc_check_set(NULL, set), &lwi);
LASSERT(rc == 0 || rc == -EINTR || rc == -ETIMEDOUT);
EXIT;
}
-void ptlrpc_free_req(struct ptlrpc_request *request)
-{
- __ptlrpc_free_req(request, 0);
-}
-
static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked);
void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request)
{
"-- sleeping for "CFS_DURATION_T" ticks", timeout);
lwi = LWI_TIMEOUT_INTR(timeout, expired_request, interrupted_request,
req);
- rc = l_wait_event(req->rq_reply_waitq, ptlrpc_check_reply(req), &lwi);
- if (rc == -ETIMEDOUT && ((req->rq_deadline > cfs_time_current_sec()) ||
+ brc = l_wait_event(req->rq_reply_waitq, ptlrpc_check_reply(req), &lwi);
+ if (brc == -ETIMEDOUT && ((req->rq_deadline > cfs_time_current_sec()) ||
ptlrpc_check_and_wait_suspend(req)))
goto repeat;
if (req->rq_err) {
DEBUG_REQ(D_RPCTRACE, req, "err rc=%d status=%d",
rc, req->rq_status);
- GOTO(out, rc = -EIO);
+ GOTO(out, rc = rc ? rc : -EIO);
}
if (req->rq_intr) {
int praa_old_status;
};
-static int ptlrpc_replay_interpret(struct ptlrpc_request *req,
+static int ptlrpc_replay_interpret(const struct lu_env *env,
+ struct ptlrpc_request *req,
void * data, int rc)
{
struct ptlrpc_replay_async_args *aa = data;
}
}
-static __u64 ptlrpc_last_xid = 0;
-spinlock_t ptlrpc_last_xid_lock;
+static __u64 ptlrpc_last_xid;
+static spinlock_t ptlrpc_last_xid_lock;
+
+/* Initialize the XID for the node. This is common among all requests on
+ * this node, and only requires the property that it is monotonically
+ * increasing. It does not need to be sequential. Since this is also used
+ * as the RDMA match bits, it is important that a single client NOT have
+ * the same match bits for two different in-flight requests, hence we do
+ * NOT want to have an XID per target or similar.
+ *
+ * To avoid an unlikely collision between match bits after a client reboot
+ * (which would cause old replies to be delivered into the wrong buffer) we initialize
+ * the XID based on the current time, assuming a maximum RPC rate of 1M RPC/s.
+ * If the time is clearly incorrect, we instead use a 62-bit random number.
+ * In the worst case the random number will overflow 1M RPCs per second in
+ * 9133 years, or permutations thereof.
+ */
+#define YEAR_2004 (1ULL << 30)
+void ptlrpc_init_xid(void)
+{
+ time_t now = cfs_time_current_sec();
+
+ spin_lock_init(&ptlrpc_last_xid_lock);
+ if (now < YEAR_2004) {
+ ll_get_random_bytes(&ptlrpc_last_xid, sizeof(ptlrpc_last_xid));
+ ptlrpc_last_xid >>= 2;
+ ptlrpc_last_xid |= (1ULL << 61);
+ } else {
+ ptlrpc_last_xid = (now << 20);
+ }
+}
__u64 ptlrpc_next_xid(void)
{
__u64 ptlrpc_sample_next_xid(void)
{
+#if BITS_PER_LONG == 32
+ /* need to avoid possible word tearing on 32-bit systems */
__u64 tmp;
spin_lock(&ptlrpc_last_xid_lock);
tmp = ptlrpc_last_xid + 1;
spin_unlock(&ptlrpc_last_xid_lock);
return tmp;
+#else
+ /* No need to lock, since returned value is racy anyways */
+ return ptlrpc_last_xid + 1;
+#endif
}
EXPORT_SYMBOL(ptlrpc_sample_next_xid);