Whamcloud - gitweb
b=18881
author    jxiong <jxiong>    Fri, 28 Aug 2009 02:05:01 +0000 (02:05 +0000)
committer jxiong <jxiong>    Fri, 28 Aug 2009 02:05:01 +0000 (02:05 +0000)
r=wangdi,eric.mei

6th patch for 18881: fixes a race condition which may cause the system to enter a livelock state

lustre/include/cl_object.h
lustre/include/lclient.h
lustre/lclient/lcommon_cl.c
lustre/llite/vvp_page.c
lustre/lov/lov_page.c
lustre/obdclass/cl_lock.c
lustre/obdclass/cl_page.c
lustre/osc/osc_io.c
lustre/osc/osc_lock.c
lustre/osc/osc_request.c

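The common thread through the hunks below is that page ownership gains a nonblocking mode: cl_page_operations::cpo_own() now takes a nonblock flag and returns an int, so a caller can back off with -EAGAIN instead of sleeping on a contended page lock. The shape of that path, as a minimal self-contained userspace sketch using a C11 atomic flag in place of the kernel's page-lock bit (struct toy_page and try_own_page() are illustrative names, not the Lustre or kernel API):

/* Userspace sketch of the nonblocking ownership path this patch adds to
 * vvp_page_own(): test-and-set the "locked" bit and back off with -EAGAIN
 * if the page is already locked or still under writeback.  Build with:
 * cc -std=c11 sketch.c */
#include <errno.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_page {
        atomic_flag locked;     /* stands in for the page-lock bit */
        bool        writeback;  /* stands in for the writeback bit */
};

static int try_own_page(struct toy_page *pg)
{
        /* Analogue of the TestSetPageLocked() call in the vvp_page_own()
         * hunk: a nonzero result means somebody else holds the lock, so
         * do not wait. */
        if (atomic_flag_test_and_set(&pg->locked))
                return -EAGAIN;

        if (pg->writeback) {
                /* Still under writeback; drop the lock bit and let the
                 * caller retry in a later, blocking pass. */
                atomic_flag_clear(&pg->locked);
                return -EAGAIN;
        }

        return 0;       /* the caller now owns the page */
}

int main(void)
{
        struct toy_page pg = { ATOMIC_FLAG_INIT, false };

        printf("first try:  %d\n", try_own_page(&pg));  /* 0: owned */
        printf("second try: %d\n", try_own_page(&pg));  /* -EAGAIN: busy */
        return 0;
}
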
diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h
index 9f03968..734d0f9 100644
@@ -832,8 +832,9 @@ struct cl_page_operations {
          * \see cl_page_own()
          * \see vvp_page_own(), lov_page_own()
          */
-        void (*cpo_own)(const struct lu_env *env,
-                        const struct cl_page_slice *slice, struct cl_io *io);
+        int  (*cpo_own)(const struct lu_env *env,
+                        const struct cl_page_slice *slice,
+                        struct cl_io *io, int nonblock);
         /** Called when ownership it yielded. Optional.
          *
          * \see cl_page_disown()
@@ -2646,7 +2647,8 @@ void                  cl_page_gang_lookup(const struct lu_env *env,
                                           struct cl_object *obj,
                                           struct cl_io *io,
                                           pgoff_t start, pgoff_t end,
-                                          struct cl_page_list *plist);
+                                          struct cl_page_list *plist,
+                                          int nonblock);
 struct cl_page *cl_page_find        (const struct lu_env *env,
                                      struct cl_object *obj,
                                      pgoff_t idx, struct page *vmpage,
@@ -2678,6 +2680,8 @@ const struct cl_page_slice *cl_page_at(const struct cl_page *page,
 
 int  cl_page_own        (const struct lu_env *env,
                          struct cl_io *io, struct cl_page *page);
+int  cl_page_own_try    (const struct lu_env *env,
+                         struct cl_io *io, struct cl_page *page);
 void cl_page_assume     (const struct lu_env *env,
                          struct cl_io *io, struct cl_page *page);
 void cl_page_unassume   (const struct lu_env *env,
diff --git a/lustre/include/lclient.h b/lustre/include/lclient.h
index 8f6aef3..4df7c2b 100644
@@ -292,9 +292,9 @@ int ccc_page_is_under_lock(const struct lu_env *env,
                            const struct cl_page_slice *slice, struct cl_io *io);
 int ccc_fail(const struct lu_env *env, const struct cl_page_slice *slice);
 void ccc_transient_page_verify(const struct cl_page *page);
-void ccc_transient_page_own(const struct lu_env *env,
+int  ccc_transient_page_own(const struct lu_env *env,
                             const struct cl_page_slice *slice,
-                            struct cl_io *io);
+                            struct cl_io *io, int nonblock);
 void ccc_transient_page_assume(const struct lu_env *env,
                                const struct cl_page_slice *slice,
                                struct cl_io *io);
diff --git a/lustre/lclient/lcommon_cl.c b/lustre/lclient/lcommon_cl.c
index cce306a..d2a4d18 100644
@@ -503,11 +503,13 @@ void ccc_transient_page_verify(const struct cl_page *page)
 {
 }
 
-void ccc_transient_page_own(const struct lu_env *env,
+int ccc_transient_page_own(const struct lu_env *env,
                                    const struct cl_page_slice *slice,
-                                   struct cl_io *unused)
+                                   struct cl_io *unused,
+                                   int nonblock)
 {
         ccc_transient_page_verify(slice->cpl_page);
+        return 0;
 }
 
 void ccc_transient_page_assume(const struct lu_env *env,
diff --git a/lustre/llite/vvp_page.c b/lustre/llite/vvp_page.c
index c49bb67..18d7fa5 100644
@@ -78,8 +78,9 @@ static void vvp_page_fini(const struct lu_env *env,
         vvp_page_fini_common(cp);
 }
 
-static void vvp_page_own(const struct lu_env *env,
-                         const struct cl_page_slice *slice, struct cl_io *io)
+static int vvp_page_own(const struct lu_env *env,
+                        const struct cl_page_slice *slice, struct cl_io *io,
+                        int nonblock)
 {
         struct ccc_page *vpg    = cl2ccc_page(slice);
         cfs_page_t      *vmpage = vpg->cpg_page;
@@ -87,11 +88,24 @@ static void vvp_page_own(const struct lu_env *env,
 
         LASSERT(vmpage != NULL);
 
+        if (nonblock) {
+                if (TestSetPageLocked(vmpage))
+                        return -EAGAIN;
+
+                if (unlikely(PageWriteback(vmpage))) {
+                        /* Did something go wrong? */
+                        unlock_page(vmpage);
+                        return -EAGAIN;
+                }
+
+                return 0;
+        }
+
         /* DEBUG CODE FOR #18881 */
         while (TestSetPageLocked(vmpage)) {
                 cfs_schedule_timeout(CFS_TASK_INTERRUPTIBLE,
                                      cfs_time_seconds(1)/10);
-                if (++count > 600) {
+                if (++count > 1200) {
                         CL_PAGE_DEBUG(D_ERROR, env,
                                       cl_page_top(slice->cpl_page),
                                       "XXX page %p blocked on acquiring the"
@@ -116,6 +130,7 @@ static void vvp_page_own(const struct lu_env *env,
 
         /* lock_page(vmpage); */
         wait_on_page_writeback(vmpage);
+        return 0;
 }
 
 static void vvp_page_assume(const struct lu_env *env,
@@ -465,11 +480,12 @@ static void vvp_transient_page_verify(const struct cl_page *page)
         /* LASSERT_SEM_LOCKED(&inode->i_alloc_sem); */
 }
 
-static void vvp_transient_page_own(const struct lu_env *env,
-                                   const struct cl_page_slice *slice,
-                                   struct cl_io *unused)
+static int vvp_transient_page_own(const struct lu_env *env,
+                                  const struct cl_page_slice *slice,
+                                  struct cl_io *unused, int nonblock)
 {
         vvp_transient_page_verify(slice->cpl_page);
+        return 0;
 }
 
 static void vvp_transient_page_assume(const struct lu_env *env,
diff --git a/lustre/lov/lov_page.c b/lustre/lov/lov_page.c
index 5d50f8a..5c449a0 100644
@@ -81,8 +81,9 @@ static void lov_page_fini(const struct lu_env *env,
         EXIT;
 }
 
-static void lov_page_own(const struct lu_env *env,
-                         const struct cl_page_slice *slice, struct cl_io *io)
+static int lov_page_own(const struct lu_env *env,
+                        const struct cl_page_slice *slice, struct cl_io *io,
+                        int nonblock)
 {
         struct lov_io     *lio = lov_env_io(env);
         struct lov_io_sub *sub;
@@ -97,13 +98,13 @@ static void lov_page_own(const struct lu_env *env,
                 lov_sub_put(sub);
         } else
                 LBUG(); /* Arrgh */
-        EXIT;
+        RETURN(0);
 }
 
 static void lov_page_assume(const struct lu_env *env,
                             const struct cl_page_slice *slice, struct cl_io *io)
 {
-        return lov_page_own(env, slice, io);
+        lov_page_own(env, slice, io, 0);
 }
 
 static int lov_page_print(const struct lu_env *env,
diff --git a/lustre/obdclass/cl_lock.c b/lustre/obdclass/cl_lock.c
index 9685ac1..d049754 100644
@@ -1813,9 +1813,12 @@ int cl_lock_page_out(const struct lu_env *env, struct cl_lock *lock,
         io->ci_obj = cl_object_top(descr->cld_obj);
         result = cl_io_init(env, io, CIT_MISC, io->ci_obj);
         if (result == 0) {
+                int nonblock = 1;
+
+restart:
                 cl_2queue_init(queue);
                 cl_page_gang_lookup(env, descr->cld_obj, io, descr->cld_start,
-                                    descr->cld_end, &queue->c2_qin);
+                                    descr->cld_end, &queue->c2_qin, nonblock);
                 page_count = queue->c2_qin.pl_nr;
                 if (page_count > 0) {
                         result = cl_page_list_unmap(env, io, &queue->c2_qin);
@@ -1837,6 +1840,11 @@ int cl_lock_page_out(const struct lu_env *env, struct cl_lock *lock,
                         cl_2queue_disown(env, io, queue);
                 }
                 cl_2queue_fini(env, queue);
+
+                if (nonblock) {
+                        nonblock = 0;
+                        goto restart;
+                }
         }
         cl_io_fini(env, io);
         RETURN(result);
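
Taken together, the cl_lock_page_out() hunk above is the heart of the fix: a first pass gathers only the pages whose lock can be taken without sleeping (cl_page_gang_lookup() with nonblock set, which ends up calling cl_page_own_try()), flushes them, and a second, blocking pass then picks up whatever was skipped. The same two-pass shape as a minimal userspace sketch, with POSIX mutexes standing in for page locks (drain_pages(), page_try_own() and page_own() are hypothetical names, not the Lustre API):

/* Userspace sketch of the two-pass drain added to cl_lock_page_out():
 * pass 1 takes only uncontended locks (like cl_page_own_try()), pass 2
 * falls back to blocking (like cl_page_own()).  Build with:
 * cc -pthread sketch.c */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct fake_page {
        pthread_mutex_t lock;
        int             flushed;
};

static int page_try_own(struct fake_page *pg)
{
        /* Mirrors cl_page_own_try(): fail with -EAGAIN instead of sleeping. */
        return pthread_mutex_trylock(&pg->lock) == 0 ? 0 : -EAGAIN;
}

static int page_own(struct fake_page *pg)
{
        /* Mirrors cl_page_own(): wait until ownership can be taken. */
        pthread_mutex_lock(&pg->lock);
        return 0;
}

static void drain_pages(struct fake_page *pages, int nr)
{
        int nonblock = 1;
        int i;

restart:
        for (i = 0; i < nr; i++) {
                int rc = nonblock ? page_try_own(&pages[i])
                                  : page_own(&pages[i]);

                if (rc != 0)
                        continue;       /* busy page; revisit in pass 2 */
                if (!pages[i].flushed) {
                        pages[i].flushed = 1;
                        printf("flushed page %d in pass %d\n", i, 2 - nonblock);
                }
                pthread_mutex_unlock(&pages[i].lock);
        }

        if (nonblock) {
                nonblock = 0;   /* second pass: block on the stragglers */
                goto restart;
        }
}

int main(void)
{
        struct fake_page pages[4];
        int i;

        for (i = 0; i < 4; i++) {
                pthread_mutex_init(&pages[i].lock, NULL);
                pages[i].flushed = 0;
        }
        /* Single-threaded here, so everything flushes in pass 1; with real
         * contention the busy pages would be deferred to pass 2. */
        drain_pages(pages, 4);
        return 0;
}

The nonblocking first pass is what removes the livelock window, while the blocking second pass preserves the old guarantee that every page under the lock is eventually flushed.
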
diff --git a/lustre/obdclass/cl_page.c b/lustre/obdclass/cl_page.c
index 33aaf17..bbca002 100644
@@ -186,7 +186,7 @@ EXPORT_SYMBOL(cl_page_lookup);
  */
 void cl_page_gang_lookup(const struct lu_env *env, struct cl_object *obj,
                          struct cl_io *io, pgoff_t start, pgoff_t end,
-                         struct cl_page_list *queue)
+                         struct cl_page_list *queue, int nonblock)
 {
         struct cl_object_header *hdr;
         struct cl_page          *page;
@@ -197,8 +197,13 @@ void cl_page_gang_lookup(const struct lu_env *env, struct cl_object *obj,
         unsigned int             nr;
         unsigned int             i;
         unsigned int             j;
+        int                    (*page_own)(const struct lu_env *env,
+                                           struct cl_io *io,
+                                           struct cl_page *pg);
         ENTRY;
 
+        page_own = nonblock ? cl_page_own_try : cl_page_own;
+
         idx = start;
         hdr = cl_object_header(obj);
         pvec = cl_env_info(env)->clt_pvec;
@@ -251,7 +256,7 @@ void cl_page_gang_lookup(const struct lu_env *env, struct cl_object *obj,
                 spin_unlock(&hdr->coh_page_guard);
                 for (i = 0; i < j; ++i) {
                         page = pvec[i];
-                        if (cl_page_own(env, io, page) == 0)
+                        if (page_own(env, io, page) == 0)
                                 cl_page_list_add(queue, page);
                         lu_ref_del(&page->cp_reference,
                                    "page_list", cfs_current());
@@ -890,7 +895,7 @@ int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io)
 EXPORT_SYMBOL(cl_page_is_owned);
 
 /**
- * Owns a page by IO.
+ * Try to own a page by IO.
  *
  * Waits until page is in cl_page_state::CPS_CACHED state, and then switch it
  * into cl_page_state::CPS_OWNED state.
@@ -902,11 +907,15 @@ EXPORT_SYMBOL(cl_page_is_owned);
  *
  * \retval -ve failure, e.g., page was destroyed (and landed in
  *             cl_page_state::CPS_FREEING instead of cl_page_state::CPS_CACHED).
+ *             or the page was owned by another thread, or is in IO.
  *
  * \see cl_page_disown()
  * \see cl_page_operations::cpo_own()
+ * \see cl_page_own_try()
+ * \see cl_page_own()
  */
-int cl_page_own(const struct lu_env *env, struct cl_io *io, struct cl_page *pg)
+static int cl_page_own0(const struct lu_env *env, struct cl_io *io,
+                        struct cl_page *pg, int nonblock)
 {
         int result;
 
@@ -919,26 +928,54 @@ int cl_page_own(const struct lu_env *env, struct cl_io *io, struct cl_page *pg)
         if (pg->cp_state == CPS_FREEING) {
                 result = -EAGAIN;
         } else {
-                cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_own));
-                PASSERT(env, pg, pg->cp_owner == NULL);
-                PASSERT(env, pg, pg->cp_req == NULL);
-                pg->cp_owner = io;
-                pg->cp_task  = current;
-                cl_page_owner_set(pg);
-                if (pg->cp_state != CPS_FREEING) {
-                        cl_page_state_set(env, pg, CPS_OWNED);
-                        result = 0;
-                } else {
-                        cl_page_disown0(env, io, pg);
-                        result = -EAGAIN;
+                result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(cpo_own),
+                                        (const struct lu_env *,
+                                         const struct cl_page_slice *,
+                                         struct cl_io *, int),
+                                        io, nonblock);
+                if (result == 0) {
+                        PASSERT(env, pg, pg->cp_owner == NULL);
+                        PASSERT(env, pg, pg->cp_req == NULL);
+                        pg->cp_owner = io;
+                        pg->cp_task  = current;
+                        cl_page_owner_set(pg);
+                        if (pg->cp_state != CPS_FREEING) {
+                                cl_page_state_set(env, pg, CPS_OWNED);
+                        } else {
+                                cl_page_disown0(env, io, pg);
+                                result = -EAGAIN;
+                        }
                 }
         }
         PINVRNT(env, pg, ergo(result == 0, cl_page_invariant(pg)));
         RETURN(result);
 }
+
+/**
+ * Own a page, might be blocked.
+ *
+ * \see cl_page_own0()
+ */
+int cl_page_own(const struct lu_env *env, struct cl_io *io, struct cl_page *pg)
+{
+        return cl_page_own0(env, io, pg, 0);
+}
 EXPORT_SYMBOL(cl_page_own);
 
 /**
+ * Nonblock version of cl_page_own().
+ *
+ * \see cl_page_own0()
+ */
+int cl_page_own_try(const struct lu_env *env, struct cl_io *io,
+                    struct cl_page *pg)
+{
+        return cl_page_own0(env, io, pg, 1);
+}
+EXPORT_SYMBOL(cl_page_own_try);
+
+
+/**
  * Assume page ownership.
  *
  * Called when page is already locked by the hosting VM.
@@ -1408,7 +1445,7 @@ int cl_pages_prune(const struct lu_env *env, struct cl_object *clobj)
         }
 
         cl_page_list_init(plist);
-        cl_page_gang_lookup(env, obj, io, 0, CL_PAGE_EOF, plist);
+        cl_page_gang_lookup(env, obj, io, 0, CL_PAGE_EOF, plist, 0);
         /*
          * Since we're purging the pages of an object, we don't care
          * the possible outcomes of the following functions.
diff --git a/lustre/osc/osc_io.c b/lustre/osc/osc_io.c
index 5cb13ee..04b11db 100644
@@ -409,7 +409,7 @@ static void osc_trunc_check(const struct lu_env *env, struct cl_io *io,
          * XXX this is quite expensive check.
          */
         cl_page_list_init(list);
-        cl_page_gang_lookup(env, clob, io, start + partial, CL_PAGE_EOF, list);
+        cl_page_gang_lookup(env, clob, io, start + partial, CL_PAGE_EOF, list, 0);
 
         cl_page_list_for_each(page, list)
                 CL_PAGE_DEBUG(D_ERROR, env, page, "exists %lu\n", start);
diff --git a/lustre/osc/osc_lock.c b/lustre/osc/osc_lock.c
index d14acdc..5352983 100644
@@ -1480,7 +1480,7 @@ static int osc_lock_has_pages(struct osc_lock *olck)
                 io->ci_obj = cl_object_top(obj);
                 cl_io_init(env, io, CIT_MISC, io->ci_obj);
                 cl_page_gang_lookup(env, obj, io,
-                                    descr->cld_start, descr->cld_end, plist);
+                                    descr->cld_start, descr->cld_end, plist, 0);
                 cl_lock_page_list_fixup(env, io, lock, plist);
                 if (plist->pl_nr > 0) {
                         CL_LOCK_DEBUG(D_ERROR, env, lock, "still has pages\n");
diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c
index 71e8402..f8fd889 100644
@@ -2347,21 +2347,28 @@ osc_send_oap_rpc(const struct lu_env *env, struct client_obd *cli,
         struct osc_brw_async_args *aa;
         const struct obd_async_page_ops *ops;
         CFS_LIST_HEAD(rpc_list);
+        CFS_LIST_HEAD(tmp_list);
         unsigned int ending_offset;
         unsigned  starting_offset = 0;
         int srvlock = 0;
         struct cl_object *clob = NULL;
         ENTRY;
 
-        /* If there are HP OAPs we need to handle at least 1 of them,
-         * move it the beginning of the pending list for that. */
-        if (!list_empty(&lop->lop_urgent)) {
-                oap = list_entry(lop->lop_urgent.next,
-                                 struct osc_async_page, oap_urgent_item);
-                if (oap->oap_async_flags & ASYNC_HP)
-                        list_move(&oap->oap_pending_item, &lop->lop_pending);
+        /* ASYNC_HP pages first. At present, when the lock covering the pages
+         * is to be canceled, the pages under that lock are sent out with
+         * ASYNC_HP. We have to send them out as soon as possible. */
+        list_for_each_entry_safe(oap, tmp, &lop->lop_urgent, oap_urgent_item) {
+                if (oap->oap_async_flags & ASYNC_HP) 
+                        list_move(&oap->oap_pending_item, &tmp_list);
+                else
+                        list_move_tail(&oap->oap_pending_item, &tmp_list);
+                if (++page_count >= cli->cl_max_pages_per_rpc)
+                        break;
         }
 
+        list_splice(&tmp_list, &lop->lop_pending);
+        page_count = 0;
+
         /* first we find the pages we're allowed to work with */
         list_for_each_entry_safe(oap, tmp, &lop->lop_pending,
                                  oap_pending_item) {
@@ -2384,6 +2391,13 @@ osc_send_oap_rpc(const struct lu_env *env, struct client_obd *cli,
                                oap, oap->oap_brw_page.pg, (unsigned)!srvlock);
                         break;
                 }
+
+                /* If there is a gap at the start of this page, it can't merge
+                 * with any previous page, so we'll hand the network a
+                 * "fragmented" page array that it can't transfer in 1 RDMA */
+                if (page_count != 0 && oap->oap_page_off != 0)
+                        break;
+
                 /* in llite being 'ready' equates to the page being locked
                  * until completion unlocks it.  commit_write submits a page
                  * as not ready because its unlock will happen unconditionally
@@ -2453,11 +2467,6 @@ osc_send_oap_rpc(const struct lu_env *env, struct client_obd *cli,
                         }
                 }
 #endif
-                /* If there is a gap at the start of this page, it can't merge
-                 * with any previous page, so we'll hand the network a
-                 * "fragmented" page array that it can't transfer in 1 RDMA */
-                if (page_count != 0 && oap->oap_page_off != 0)
-                        break;
 
                 /* take the page out of our book-keeping */
                 list_del_init(&oap->oap_pending_item);
@@ -2523,7 +2532,7 @@ osc_send_oap_rpc(const struct lu_env *env, struct client_obd *cli,
         req = osc_build_req(env, cli, &rpc_list, page_count, cmd);
         if (IS_ERR(req)) {
                 LASSERT(list_empty(&rpc_list));
-                /* loi_list_maint(cli, loi); */
+                loi_list_maint(cli, loi);
                 RETURN(PTR_ERR(req));
         }
 
@@ -2664,8 +2673,28 @@ void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli)
                 if (lop_makes_rpc(cli, &loi->loi_write_lop, OBD_BRW_WRITE)) {
                         rc = osc_send_oap_rpc(env, cli, loi, OBD_BRW_WRITE,
                                               &loi->loi_write_lop);
-                        if (rc < 0)
-                                break;
+                        if (rc < 0) {
+                                CERROR("Write request failed with %d\n", rc);
+
+                                /* osc_send_oap_rpc failed, mostly because of
+                                 * memory pressure.
+                                 *
+                                 * We can't break out here, because if:
+                                 *  - a page was submitted by osc_io_submit,
+                                 *    so the page is locked;
+                                 *  - there is no request in flight; and
+                                 *  - there is no subsequent request;
+                                 * then the system ends up in a livelock,
+                                 * because there is no chance to call
+                                 * osc_io_unplug() and osc_check_rpcs() any
+                                 * more. pdflush can't help in this case,
+                                 * because it might be blocked grabbing
+                                 * the page lock, as mentioned above.
+                                 *
+                                 * Anyway, continue to drain pages. */
+                                /* break; */
+                        }
+
                         if (rc > 0)
                                 race_counter = 0;
                         else
@@ -2675,7 +2704,8 @@ void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli)
                         rc = osc_send_oap_rpc(env, cli, loi, OBD_BRW_READ,
                                               &loi->loi_read_lop);
                         if (rc < 0)
-                                break;
+                                CERROR("Read request failed with %d\n", rc);
+
                         if (rc > 0)
                                 race_counter = 0;
                         else