* in the LICENSE file that accompanied this code).
*
* You should have received a copy of the GNU General Public License
- * version 2 along with this program; If not, see [sun.com URL with a
- * copy of GPLv2].
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
*
* Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
* CA 95054 USA or visit www.sun.com if you need additional information or
return NULL;
}
- c = ptlrpc_get_connection(peer, self, uuid);
+ c = ptlrpc_connection_get(peer, self, uuid);
if (c) {
memcpy(c->c_remote_uuid.uuid,
uuid->uuid, sizeof(c->c_remote_uuid.uuid));
return c;
}
-void ptlrpc_readdress_connection(struct ptlrpc_connection *conn,
- struct obd_uuid *uuid)
-{
- lnet_nid_t self;
- lnet_process_id_t peer;
- int err;
-
- err = ptlrpc_uuid_to_peer(uuid, &peer, &self);
- if (err != 0) {
- CERROR("cannot find peer %s!\n", uuid->uuid);
- return;
- }
-
- conn->c_peer = peer;
- conn->c_self = self;
- return;
-}
-
static inline struct ptlrpc_bulk_desc *new_bulk(int npages, int type, int portal)
{
struct ptlrpc_bulk_desc *desc;
cfs_waitq_init(&desc->bd_waitq);
desc->bd_max_iov = npages;
desc->bd_iov_count = 0;
- desc->bd_md_h = LNET_INVALID_HANDLE;
+ LNetInvalidateHandle(&desc->bd_md_h);
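+ /* The call above replaces direct LNET_INVALID_HANDLE assignment with the
+ * LNet API helper; presumably it stamps the handle cookie so that a later
+ * LNetHandleIsInvalid() test (assumed to be the matching accessor) sees
+ * that no MD is attached to this bulk descriptor yet. */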
desc->bd_portal = portal;
desc->bd_type = type;
idx = import_at_get_index(req->rq_import,
req->rq_request_portal);
serv_est = at_get(&at->iat_service_estimate[idx]);
- /* add an arbitrary minimum: 125% +5 sec */
- req->rq_timeout = serv_est + (serv_est >> 2) + 5;
+ req->rq_timeout = at_est2timeout(serv_est);
/* We could get even fancier here, using history to predict increased
loading... */
}
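+
+/* at_est2timeout() centralizes the margin the deleted inline code applied;
+ * a minimal sketch of what it is assumed to compute (the real helper lives
+ * in a shared header such as lustre_net.h):
+ *
+ *	static inline int at_est2timeout(unsigned int val)
+ *	{
+ *		return val + (val >> 2) + 5;	(125% of estimate, +5 sec)
+ *	}
+ *
+ * so e.g. a 20s service estimate yields a 20 + 5 + 5 = 30s RPC timeout. */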
/* Adjust max service estimate based on server value */
-static void ptlrpc_at_adj_service(struct ptlrpc_request *req)
+static void ptlrpc_at_adj_service(struct ptlrpc_request *req,
+ unsigned int serv_est)
{
int idx;
- unsigned int serv_est, oldse;
- struct imp_at *at = &req->rq_import->imp_at;
+ unsigned int oldse;
+ struct imp_at *at;
- LASSERT(req->rq_import);
+ /* do estimate only if the request is not in recovery */
+ if (req->rq_send_state != LUSTRE_IMP_FULL &&
+ req->rq_send_state != LUSTRE_IMP_CONNECTING)
+ return;
- /* service estimate is returned in the repmsg timeout field,
- may be 0 on err */
- serv_est = lustre_msg_get_timeout(req->rq_repmsg);
+ LASSERT(req->rq_import);
+ at = &req->rq_import->imp_at;
idx = import_at_get_index(req->rq_import, req->rq_request_portal);
/* max service estimates are tracked on the server side,
}
/* Adjust expected network latency */
-static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req)
+static void ptlrpc_at_adj_net_latency(struct ptlrpc_request *req,
+ unsigned int service_time)
{
- unsigned int st, nl, oldnl;
- struct imp_at *at = &req->rq_import->imp_at;
+ unsigned int nl, oldnl;
+ struct imp_at *at;
time_t now = cfs_time_current_sec();
LASSERT(req->rq_import);
-
- st = lustre_msg_get_service_time(req->rq_repmsg);
+ at = &req->rq_import->imp_at;
/* Network latency is total time less server processing time */
- nl = max_t(int, now - req->rq_sent - st, 0) + 1/*st rounding*/;
- if (st > now - req->rq_sent + 2 /* rounding */)
- CERROR("Reported service time %u > total measured time %ld\n",
- st, now - req->rq_sent);
+ /* +1 for st rounding */
+ nl = max_t(int, now - req->rq_sent - service_time, 0) + 1;
+ if (service_time > now - req->rq_sent + 3 /* bz16408 */)
+ CWARN("Reported service time %u > total measured time "
+ CFS_DURATION_T"\n", service_time,
+ cfs_time_sub(now, req->rq_sent));
oldnl = at_add(&at->iat_net_latency, nl);
if (oldnl != 0)
* Handle an early reply message, called with the rq_lock held.
* If anything goes wrong just ignore it - same as if it never happened
*/
-static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req) {
- time_t olddl;
- int rc;
+static int ptlrpc_at_recv_early_reply(struct ptlrpc_request *req)
+{
+ struct ptlrpc_request *early_req;
+ time_t olddl;
+ int rc;
ENTRY;
req->rq_early = 0;
spin_unlock(&req->rq_lock);
- rc = sptlrpc_cli_unwrap_early_reply(req);
- if (rc)
- GOTO(out, rc);
+ rc = sptlrpc_cli_unwrap_early_reply(req, &early_req);
+ if (rc) {
+ spin_lock(&req->rq_lock);
+ RETURN(rc);
+ }
- rc = unpack_reply(req);
- if (rc)
- GOTO(out_cleanup, rc);
+ rc = unpack_reply(early_req);
+ if (rc == 0) {
+ /* Expecting to increase the service time estimate here */
+ ptlrpc_at_adj_service(req,
+ lustre_msg_get_timeout(early_req->rq_repmsg));
+ ptlrpc_at_adj_net_latency(req,
+ lustre_msg_get_service_time(early_req->rq_repmsg));
+ }
- /* Expecting to increase the service time estimate here */
- ptlrpc_at_adj_service(req);
- ptlrpc_at_adj_net_latency(req);
+ sptlrpc_cli_finish_early_reply(early_req);
- /* Adjust the local timeout for this req */
- ptlrpc_at_set_req_timeout(req);
+ spin_lock(&req->rq_lock);
- olddl = req->rq_deadline;
- /* server assumes it now has rq_timeout from when it sent the
- early reply, so client should give it at least that long. */
- req->rq_deadline = cfs_time_current_sec() + req->rq_timeout +
- ptlrpc_at_get_net_latency(req);
+ if (rc == 0) {
+ /* Adjust the local timeout for this req */
+ ptlrpc_at_set_req_timeout(req);
- DEBUG_REQ(D_ADAPTTO, req,
- "Early reply #%d, new deadline in %lds (%+lds)",
- req->rq_early_count, req->rq_deadline -
- cfs_time_current_sec(), req->rq_deadline - olddl);
+ olddl = req->rq_deadline;
+ /* server assumes it now has rq_timeout from when it sent the
+ early reply, so client should give it at least that long. */
+ req->rq_deadline = cfs_time_current_sec() + req->rq_timeout +
+ ptlrpc_at_get_net_latency(req);
+
+ DEBUG_REQ(D_ADAPTTO, req,
+ "Early reply #%d, new deadline in "CFS_DURATION_T"s "
+ "("CFS_DURATION_T"s)", req->rq_early_count,
+ cfs_time_sub(req->rq_deadline,
+ cfs_time_current_sec()),
+ cfs_time_sub(req->rq_deadline, olddl));
+ }
-out_cleanup:
- sptlrpc_cli_finish_early_reply(req);
-out:
- spin_lock(&req->rq_lock);
RETURN(rc);
}
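+
+/* A worked example of the deadline math above (numbers hypothetical): with
+ * rq_timeout = 33s and a net latency estimate of 2s, an early reply arriving
+ * at t = 1000s moves rq_deadline to 1000 + 33 + 2 = 1035s. How much of the
+ * old deadline remained is irrelevant: the server restarted its rq_timeout
+ * clock when it sent the early reply, so the client must allow at least
+ * that long again. */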
request->rq_reqbuf = reqbuf;
request->rq_reqbuf_len = pool->prp_rq_size;
request->rq_pool = pool;
+
return request;
}
EXPORT_SYMBOL(ptlrpc_request_bufs_pack);
int ptlrpc_request_pack(struct ptlrpc_request *request,
- __u32 version, int opcode)
+ __u32 version, int opcode)
{
return ptlrpc_request_bufs_pack(request, version, opcode, NULL, NULL);
}
OBD_ALLOC_PTR(request);
if (request) {
- LASSERT((unsigned long)imp > 0x1000);
+ LASSERTF((unsigned long)imp > 0x1000, "%p", imp);
LASSERT(imp != LP_POISON);
- LASSERT((unsigned long)imp->imp_client > 0x1000);
+ LASSERTF((unsigned long)imp->imp_client > 0x1000, "%p",
+ imp->imp_client);
LASSERT(imp->imp_client != LP_POISON);
request->rq_import = class_import_get(imp);
spin_lock_init(&set->set_new_req_lock);
CFS_INIT_LIST_HEAD(&set->set_new_requests);
CFS_INIT_LIST_HEAD(&set->set_cblist);
-
+
RETURN(set);
}
if (req->rq_phase == RQ_PHASE_NEW) {
if (req->rq_interpret_reply != NULL) {
- int (*interpreter)(struct ptlrpc_request *,
- void *, int) =
+ ptlrpc_interpterer_t interpreter =
req->rq_interpret_reply;
/* higher level (i.e. LOV) failed;
* let the sub reqs clean up */
req->rq_status = -EBADR;
- interpreter(req, &req->rq_async_args,
+ interpreter(NULL, req, &req->rq_async_args,
req->rq_status);
}
set->set_remaining--;
atomic_inc(&req->rq_import->imp_inflight);
}
-/* lock so many callers can add things, the context that owns the set
- * is supposed to notice these and move them into the set proper. */
-void ptlrpc_set_add_new_req(struct ptlrpc_request_set *set,
- struct ptlrpc_request *req)
+/**
+ * Locked so that many callers can add things; the context that owns the
+ * set is supposed to notice these and move them into the set proper.
+ */
+int ptlrpc_set_add_new_req(struct ptlrpcd_ctl *pc,
+ struct ptlrpc_request *req)
{
+ struct ptlrpc_request_set *set = pc->pc_set;
+
+ /*
+ * Let the caller know that we stopped and will not handle this request;
+ * the caller must take care of the request itself.
+ */
+ if (test_bit(LIOD_STOP, &pc->pc_flags))
+ return -EALREADY;
+
spin_lock(&set->set_new_req_lock);
- /* The set takes over the caller's request reference */
+ /*
+ * The set takes over the caller's request reference.
+ */
list_add_tail(&req->rq_set_chain, &set->set_new_requests);
req->rq_set = set;
spin_unlock(&set->set_new_req_lock);
+
+ /*
+ * Let the thread know that we added something, and that it had better
+ * wake up and process it.
+ */
+ cfs_waitq_signal(&set->set_waitq);
+ return 0;
}
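+
+/* Since the function now returns int, callers must be prepared to take the
+ * request back; a minimal caller sketch (hypothetical, error handling
+ * elided), using ptlrpc_req_finished() to drop the reference the set never
+ * took:
+ *
+ *	rc = ptlrpc_set_add_new_req(pc, req);
+ *	if (rc != 0) {
+ *		LASSERT(rc == -EALREADY);
+ *		ptlrpc_req_finished(req);
+ *	}
+ */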
/*
/*
* NB Until this point, the whole of the incoming message,
- * including buflens, status etc is in the sender's byte order.
+ * including buflens, status etc is in the sender's byte order.
*/
rc = sptlrpc_cli_unwrap_reply(req);
}
/*
- * Security layer unwrap might ask resend this request.
+ * Security layer unwrap might ask to resend this request.
*/
if (req->rq_resend)
RETURN(0);
}
OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_PAUSE_REP, obd_fail_val);
- ptlrpc_at_adj_service(req);
- ptlrpc_at_adj_net_latency(req);
+ ptlrpc_at_adj_service(req, lustre_msg_get_timeout(req->rq_repmsg));
+ ptlrpc_at_adj_net_latency(req,
+ lustre_msg_get_service_time(req->rq_repmsg));
rc = ptlrpc_check_status(req);
imp->imp_connect_error = rc;
/*
* Either we've been evicted, or the server has failed for
* some reason. Try to reconnect, and if that fails, punt to
- * the upcall.
+ * the upcall.
*/
if (ll_rpc_recoverable_error(rc)) {
if (req->rq_send_state != LUSTRE_IMP_FULL ||
}
} else {
/*
- * Let's look if server sent slv. Do it only for RPC with
- * rc == 0.
+ * Check whether the server sent an slv. Do it only for RPCs with
+ * rc == 0.
*/
ldlm_cli_update_pool(req);
}
/*
- * Store transno in reqmsg for replay.
+ * Store transno in reqmsg for replay.
*/
req->rq_transno = lustre_msg_get_transno(req->rq_repmsg);
lustre_msg_set_transno(req->rq_reqmsg, req->rq_transno);
spin_lock(&imp->imp_lock);
/*
* No point in adding already-committed requests to the replay
- * list, we will just remove them immediately. b=9829
+ * list, we will just remove them immediately. b=9829
*/
- if (req->rq_transno != 0 &&
- (req->rq_transno >
+ if (req->rq_transno != 0 &&
+ (req->rq_transno >
lustre_msg_get_last_committed(req->rq_repmsg) ||
req->rq_replay))
ptlrpc_retain_replayable_request(req, imp);
}
/*
- * Replay-enabled imports return commit-status information.
+ * Replay-enabled imports return commit-status information.
*/
if (lustre_msg_get_last_committed(req->rq_repmsg)) {
imp->imp_peer_committed_transno =
LASSERT(req->rq_phase == RQ_PHASE_NEW);
if (req->rq_sent && (req->rq_sent > cfs_time_current_sec()))
RETURN (0);
-
+
req->rq_phase = RQ_PHASE_RPC;
imp = req->rq_import;
req->rq_status = rc;
RETURN(1);
} else {
- /* here begins timeout counting */
- req->rq_sent = cfs_time_current_sec();
req->rq_wait_ctx = 1;
RETURN(0);
}
}
/* this sends any unsent RPCs in @set and returns TRUE if all are sent */
-int ptlrpc_check_set(struct ptlrpc_request_set *set)
+int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set)
{
struct list_head *tmp;
int force_timer_recalc = 0;
req->rq_waiting || req->rq_wait_ctx) {
int status;
- /* rq_wait_ctx is only touched in ptlrpcd,
- * no lock needed here.
- */
- if (req->rq_wait_ctx)
- goto check_ctx;
-
ptlrpc_unregister_reply(req);
spin_lock(&imp->imp_lock);
spin_unlock(&imp->imp_lock);
req->rq_waiting = 0;
- if (req->rq_resend) {
+
+ if (req->rq_timedout || req->rq_resend) {
+ /* We are re-sending anyway,
+ * so mark the req as a resend. */
+ req->rq_resend = 1;
lustre_msg_add_flags(req->rq_reqmsg,
MSG_RESENT);
if (req->rq_bulk) {
old_xid, req->rq_xid);
}
}
-check_ctx:
+ /*
+ * rq_wait_ctx is only touched by ptlrpcd,
+ * so no lock is needed here.
+ */
status = sptlrpc_req_refresh_ctx(req, -1);
if (status) {
if (req->rq_err) {
req->rq_status = status;
force_timer_recalc = 1;
- }
- if (!req->rq_wait_ctx) {
- /* begins timeout counting */
- req->rq_sent = cfs_time_current_sec();
+ } else {
req->rq_wait_ctx = 1;
}
+
continue;
} else {
- req->rq_sent = 0;
req->rq_wait_ctx = 0;
}
ptlrpc_unregister_bulk (req);
if (req->rq_interpret_reply != NULL) {
- int (*interpreter)(struct ptlrpc_request *,void *,int) =
+ ptlrpc_interpterer_t interpreter =
req->rq_interpret_reply;
- req->rq_status = interpreter(req, &req->rq_async_args,
+ req->rq_status = interpreter(NULL, req,
+ &req->rq_async_args,
req->rq_status);
}
req->rq_phase = RQ_PHASE_COMPLETE;
ENTRY;
DEBUG_REQ(D_ERROR|D_NETERROR, req,
- "%s (sent at %lu, "CFS_DURATION_T"s ago)",
+ "%s (sent at "CFS_TIME_T", "CFS_DURATION_T"s ago)",
req->rq_net_err ? "network error" : "timeout",
- (long)req->rq_sent, cfs_time_current_sec() - req->rq_sent);
+ req->rq_sent, cfs_time_sub(cfs_time_current_sec(),
+ req->rq_sent));
if (imp) {
- LCONSOLE_WARN("Request x"LPU64" sent from %s to NID %s %lus ago"
- " has timed out (limit %lus).\n", req->rq_xid,
+ LCONSOLE_WARN("Request x"LPU64" sent from %s to NID %s "
+ CFS_DURATION_T"s ago has timed out "
+ "(limit "CFS_DURATION_T"s).\n", req->rq_xid,
req->rq_import->imp_obd->obd_name,
libcfs_nid2str(imp->imp_connection->c_peer.nid),
- cfs_time_current_sec() - req->rq_sent,
- req->rq_deadline - req->rq_sent);
+ cfs_time_sub(cfs_time_current_sec(), req->rq_sent),
+ cfs_time_sub(req->rq_deadline, req->rq_sent));
}
if (imp != NULL && obd_debug_peer_on_timeout)
spin_lock(&req->rq_lock);
req->rq_timedout = 1;
- req->rq_wait_ctx = 0;
spin_unlock(&req->rq_lock);
ptlrpc_unregister_reply (req);
spin_unlock(&req->rq_lock);
RETURN(1);
}
-
+
/* if a request can't be resent we can't wait for an answer after
the timeout */
if (req->rq_no_resend) {
if (req->rq_timedout) /* already timed out */
continue;
+ if (req->rq_wait_ctx) /* waiting for ctx */
+ continue;
+
if (req->rq_phase == RQ_PHASE_NEW)
deadline = req->rq_sent;
else
lwi = LWI_TIMEOUT_INTR(cfs_time_seconds(timeout ? timeout : 1),
ptlrpc_expired_set,
ptlrpc_interrupted_set, set);
- rc = l_wait_event(set->set_waitq, ptlrpc_check_set(set), &lwi);
+ rc = l_wait_event(set->set_waitq,
+ ptlrpc_check_set(NULL, set), &lwi);
LASSERT(rc == 0 || rc == -EINTR || rc == -ETIMEDOUT);
LASSERTF(request->rq_rqbd == NULL, "req %p\n",request);/* client-side */
LASSERTF(list_empty(&request->rq_list), "req %p\n", request);
LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request);
+ LASSERTF(!request->rq_replay, "req %p\n", request);
LASSERT(request->rq_cli_ctx);
req_capsule_fini(&request->rq_pill);
EXIT;
}
-void ptlrpc_free_req(struct ptlrpc_request *request)
-{
- __ptlrpc_free_req(request, 0);
-}
-
static int __ptlrpc_req_finished(struct ptlrpc_request *request, int locked);
void ptlrpc_req_finished_with_imp_lock(struct ptlrpc_request *request)
{
"-- sleeping for "CFS_DURATION_T" ticks", timeout);
lwi = LWI_TIMEOUT_INTR(timeout, expired_request, interrupted_request,
req);
- rc = l_wait_event(req->rq_reply_waitq, ptlrpc_check_reply(req), &lwi);
- if (rc == -ETIMEDOUT && ((req->rq_deadline > cfs_time_current_sec()) ||
+ brc = l_wait_event(req->rq_reply_waitq, ptlrpc_check_reply(req), &lwi);
+ if (brc == -ETIMEDOUT && ((req->rq_deadline > cfs_time_current_sec()) ||
ptlrpc_check_and_wait_suspend(req)))
goto repeat;
if (req->rq_err) {
DEBUG_REQ(D_RPCTRACE, req, "err rc=%d status=%d",
rc, req->rq_status);
- GOTO(out, rc = -EIO);
+ GOTO(out, rc = rc ? rc : -EIO);
}
if (req->rq_intr) {
int praa_old_status;
};
-static int ptlrpc_replay_interpret(struct ptlrpc_request *req,
+static int ptlrpc_replay_interpret(const struct lu_env *env,
+ struct ptlrpc_request *req,
void * data, int rc)
{
struct ptlrpc_replay_async_args *aa = data;
EXIT;
}
-static __u64 ptlrpc_last_xid = 0;
-spinlock_t ptlrpc_last_xid_lock;
+void ptlrpc_abort_set(struct ptlrpc_request_set *set)
+{
+ struct list_head *tmp, *n;
+
+ LASSERT(set != NULL);
+
+ list_for_each_safe(tmp, n, &set->set_requests) {
+ struct ptlrpc_request *req =
+ list_entry(tmp, struct ptlrpc_request, rq_set_chain);
+
+ spin_lock(&req->rq_lock);
+ if (req->rq_phase != RQ_PHASE_RPC) {
+ spin_unlock(&req->rq_lock);
+ continue;
+ }
+
+ req->rq_err = 1;
+ req->rq_status = -EINTR;
+ ptlrpc_wake_client_req(req);
+ spin_unlock(&req->rq_lock);
+ }
+}
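+
+/* Note the pattern above: ptlrpc_abort_set() frees nothing itself; it marks
+ * each in-flight RPC with rq_err / -EINTR status and wakes it, so the owner
+ * of the set reaps the request through the normal ptlrpc_check_set()
+ * completion path. */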
+
+static __u64 ptlrpc_last_xid;
+static spinlock_t ptlrpc_last_xid_lock;
+
+/* Initialize the XID for the node. This is common among all requests on
+ * this node, and only requires the property that it is monotonically
+ * increasing. It does not need to be sequential. Since this is also used
+ * as the RDMA match bits, it is important that a single client NOT have
+ * the same match bits for two different in-flight requests, hence we do
+ * NOT want to have an XID per target or similar.
+ *
+ * To avoid an unlikely collision between match bits after a client reboot
+ * (which would cause old bulk data to be delivered into the wrong buffer)
+ * we initialize the XID based on the current time, assuming a maximum RPC
+ * rate of 1M RPC/s.
+ * If the time is clearly incorrect, we instead use a 62-bit random number.
+ * In the worst case the random number will overflow 1M RPCs per second in
+ * 9133 years, or permutations thereof.
+ */
+#define YEAR_2004 (1ULL << 30)
+void ptlrpc_init_xid(void)
+{
+ time_t now = cfs_time_current_sec();
+
+ spin_lock_init(&ptlrpc_last_xid_lock);
+ if (now < YEAR_2004) {
+ ll_get_random_bytes(&ptlrpc_last_xid, sizeof(ptlrpc_last_xid));
+ ptlrpc_last_xid >>= 2;
+ ptlrpc_last_xid |= (1ULL << 61);
+ } else {
+ ptlrpc_last_xid = (now << 20);
+ }
+}
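+
+/* Worked numbers for the shift above: 2^20 = 1048576, roughly the assumed
+ * peak of 1M RPC/s, so (now << 20) advances by about one XID per possible
+ * RPC. Provided the clock is sane (now >= YEAR_2004 = 2^30 seconds) and the
+ * client never exceeded 1M RPC/s, the seed lands past any XID issued before
+ * the reboot, keeping RDMA match bits unique across it. */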
__u64 ptlrpc_next_xid(void)
{
__u64 ptlrpc_sample_next_xid(void)
{
+#if BITS_PER_LONG == 32
+ /* need to avoid possible word tearing on 32-bit systems */
__u64 tmp;
spin_lock(&ptlrpc_last_xid_lock);
tmp = ptlrpc_last_xid + 1;
spin_unlock(&ptlrpc_last_xid_lock);
return tmp;
+#else
+ /* No need to lock, since the returned value is racy anyway */
+ return ptlrpc_last_xid + 1;
+#endif
}
EXPORT_SYMBOL(ptlrpc_sample_next_xid);