summary |
shortlog |
log |
commit | commitdiff |
tree
raw |
patch |
inline | side by side (from parent 1:
d6fd48f)
In ptlrpc layer, if the ptlrpc_request::rq_allow_replay is set,
then such RPC can be sent to remote peer even if it is not the
replay RPC during the remote server recovery. Such flag is used
for sending RPC under the case of current server and the remote
server are both in recovery.
On the other hand, abusing such flag will cause some trouble.
For example: consider DNE mode, assume the MDT_m is in recover,
the MDT_n is healthy. At that time, one client can send a normal
reint unlink RPC to the MDT_n to remove the file_A (that resides
on the MDT_n) under the dir_B (that resides on the MDT_m). Under
such case, the MDT_n needs to lookup the dir_B with the file_A's
name, means the MDT_n needs to send lookup OUT RPC to the MDT_m,
but before that it needs to lock the dir_B with LDLM_ENQUEUE RPC
firstly. Because the MDT_m is recovering, since the LDLM_ENQUEUE
RPC is not for replay, it should be blocked until the recovery
done on the MDT_m. That is expected behavior. But if the MDT_n
(via OSP) sets ptlrpc_request::rq_allow_replay improperly, then
such LDLM_ENQUEUE RPC may be sent to the MDT_m during the MDT_m
recovery and granted without conflict. And then the subsequent
lookup OUT RPC may obtain some stale information from the MDT_m
if the dir_B has NOT been recovered yet.
So the ptlrpc_request::rq_allow_replay will be set during current
MDT recovery. On the other hand, there are multiple threads those
are related with the recovery, such as target_recovery_thread and
lod_sub_recovery_thread. Because the obd_device::obd_recovering
is controlled by the target_recovery_thread that is started later
than the lod_sub_recovery_thread. Only checking the obd_recovering
flag does not work under some cases. So it needs to check other
flags: obd_device::obd_replayable and obd_device::obd_no_conn to
distinguish recovery related RPC properly.
So for above case, the client sponsored unlink will be blocked on
the MDT_n for the LDLM_ENQUEUE RPC until the MDT_m recovery done.
Test-Parameters: mdtfilesystemtype=ldiskfs mdsfilesystemtype=ldiskfs ostfilesystemtype=ldiskfs mdscount=2 mdtcount=4 testlist=replay-single,replay-single,replay-single
Signed-off-by: Fan Yong <fan.yong@intel.com>
Change-Id: Id9ac542751cc0042fba0a94166dfc57ace52dc69
Reviewed-on: http://review.whamcloud.com/20940
Tested-by: Jenkins
Reviewed-by: Alex Zhuravlev <alexey.zhuravlev@intel.com>
Tested-by: Maloo <hpdd-maloo@intel.com>
Reviewed-by: wangdi <di.wang@intel.com>
Reviewed-by: Lai Siyao <lai.siyao@intel.com>
Reviewed-by: Oleg Drokin <oleg.drokin@intel.com>
extern struct obd_ops lwp_obd_device_ops;
extern struct lu_device_type lwp_device_type;
extern struct obd_ops lwp_obd_device_ops;
extern struct lu_device_type lwp_device_type;
+static inline struct lu_device *osp2top(const struct osp_device *osp)
+{
+ return osp->opd_dt_dev.dd_lu_dev.ld_site->ls_top_dev;
+}
+
+static inline void osp_set_req_replay(const struct osp_device *osp,
+ struct ptlrpc_request *req)
+{
+ struct obd_device *obd = osp2top(osp)->ld_obd;
+
+ /* The RPC must be recovery related for the cases:
+ *
+ * 1. sent during recovery, or
+ * 2. sent before the recovery thread target_recovery_thread() start,
+ * such as triggered by lod_sub_recovery_thread(). */
+ if (obd->obd_recovering || (obd->obd_replayable && obd->obd_no_conn))
+ req->rq_allow_replay = 1;
+}
+
union ldlm_policy_data *policy)
{
struct ldlm_res_id *res_id;
union ldlm_policy_data *policy)
{
struct ldlm_res_id *res_id;
- struct dt_device *dt_dev = lu2dt_dev(dt->do_lu.lo_dev);
- struct osp_device *osp = dt2osp_dev(dt_dev);
- struct lu_device *top_device;
+ struct osp_device *osp = dt2osp_dev(lu2dt_dev(dt->do_lu.lo_dev));
struct ptlrpc_request *req;
int rc = 0;
__u64 flags = LDLM_FL_NO_LRU;
struct ptlrpc_request *req;
int rc = 0;
__u64 flags = LDLM_FL_NO_LRU;
res_id = einfo->ei_res_id;
LASSERT(res_id != NULL);
res_id = einfo->ei_res_id;
LASSERT(res_id != NULL);
if (IS_ERR(req))
RETURN(PTR_ERR(req));
if (IS_ERR(req))
RETURN(PTR_ERR(req));
- /* During recovery, it needs to let OSP send enqueue
- * without checking recoverying status, in case the
- * other target is being recovered at the same time,
- * and if we wait here for the import to be recovered,
- * it might cause deadlock */
- top_device = dt_dev->dd_lu_dev.ld_site->ls_top_dev;
- if (top_device->ld_obd->obd_recovering)
- req->rq_allow_replay = 1;
-
+ osp_set_req_replay(osp, req);
rc = ldlm_cli_enqueue(osp->opd_exp, &req, einfo, res_id,
(const union ldlm_policy_data *)policy,
&flags, NULL, 0, LVB_T_NONE, lh, 0);
ptlrpc_req_finished(req);
rc = ldlm_cli_enqueue(osp->opd_exp, &req, einfo, res_id,
(const union ldlm_policy_data *)policy,
&flags, NULL, 0, LVB_T_NONE, lh, 0);
ptlrpc_req_finished(req);
- return rc == ELDLM_OK ? 0 : -EIO;
+ RETURN(rc == ELDLM_OK ? 0 : -EIO);
- /* This will only be called with read-only update, and these updates
- * might be used to retrieve update log during recovery process, so
- * it will be allowed to send during recovery process */
- req->rq_allow_replay = 1;
+ osp_set_req_replay(osp, req);
req->rq_bulk_read = 1;
/* send request to master and wait for RPC to complete */
rc = ptlrpc_queue_wait(req);
req->rq_bulk_read = 1;
/* send request to master and wait for RPC to complete */
rc = ptlrpc_queue_wait(req);
struct lu_device *dev = it->ooi_obj->do_lu.lo_dev;
struct osp_device *osp = lu2osp_dev(dev);
struct page **pages;
struct lu_device *dev = it->ooi_obj->do_lu.lo_dev;
struct osp_device *osp = lu2osp_dev(dev);
struct page **pages;
- struct lu_device *top_device;
struct ptlrpc_request *req = NULL;
struct ptlrpc_bulk_desc *desc;
struct idx_info *ii;
struct ptlrpc_request *req = NULL;
struct ptlrpc_bulk_desc *desc;
struct idx_info *ii;
- /* Let's allow this request during recovery, otherwise
- * if the remote target is also in recovery status,
- * it might cause deadlock */
- top_device = dev->ld_site->ls_top_dev;
- if (top_device->ld_obd->obd_recovering)
- req->rq_allow_replay = 1;
-
+ osp_set_req_replay(osp, req);
req->rq_request_portal = OUT_PORTAL;
ii = req_capsule_client_get(&req->rq_pill, &RMF_IDX_INFO);
memset(ii, 0, sizeof(*ii));
req->rq_request_portal = OUT_PORTAL;
ii = req_capsule_client_get(&req->rq_pill, &RMF_IDX_INFO);
memset(ii, 0, sizeof(*ii));
- /* This will only be called with read-only update, and these updates
- * might be used to retrieve update log during recovery process, so
- * it will be allowed to send during recovery process */
- req->rq_allow_replay = 1;
+ osp_set_req_replay(osp, req);
req->rq_allow_intr = 1;
/* Note: some dt index api might return non-zero result here, like
req->rq_allow_intr = 1;
/* Note: some dt index api might return non-zero result here, like
{
struct osp_update_args *args;
struct ptlrpc_request *req;
{
struct osp_update_args *args;
struct ptlrpc_request *req;
- struct lu_device *top_device;
struct osp_thandle *oth = our->our_th;
int rc = 0;
ENTRY;
struct osp_thandle *oth = our->our_th;
int rc = 0;
ENTRY;
* status, in case the other target is being recoveried
* at the same time, and if we wait here for the import
* to be recoveryed, it might cause deadlock */
* status, in case the other target is being recoveried
* at the same time, and if we wait here for the import
* to be recoveryed, it might cause deadlock */
- top_device = osp->opd_dt_dev.dd_lu_dev.ld_site->ls_top_dev;
- if (top_device->ld_obd->obd_recovering)
- req->rq_allow_replay = 1;
+ osp_set_req_replay(osp, req);
/* Because this req will be synchronus, i.e. it will be called
* in the same thread, so it will be safe to use current
/* Because this req will be synchronus, i.e. it will be called
* in the same thread, so it will be safe to use current