Fix panic caused by a double free of the request on network error.

author shadow <shadow>

Wed, 26 Mar 2008 06:22:07 +0000 (06:22 +0000)

committer shadow <shadow>

Wed, 26 Mar 2008 06:22:07 +0000 (06:22 +0000)
author shadow <shadow>
Wed, 26 Mar 2008 06:22:07 +0000 (06:22 +0000)
committer shadow <shadow>
Wed, 26 Mar 2008 06:22:07 +0000 (06:22 +0000)
diff --git a/lustre/ChangeLog b/lustre/ChangeLog

index 6f06c32..7e0f3b8 100644 (file)
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -18,6 +18,15 @@ tbd  Sun Microsystems, Inc.
         * RHEL 4 and RHEL 5/SLES 10 clients behaves differently on 'cd' to a
          removed cwd "./" (refer to Bugzilla 14399).
  
+Severity   : major
+Bugzilla   : 15027
+Frequency  : on network error
+Description: panic caused by a double free of the request on network error
+Details    : mdc_finish_enqueue finishes the request if any network error
+             occurs, but this is correct only for synchronous enqueue; for
+             async enqueue (via ptlrpcd) it is incorrect, because ptlrpcd
+             finishes the request itself.
+
  Severity   : normal
  Bugzilla   : 14533
  Frequency  : rare, on recovery
diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h

index b42a71e..6dcb0d9 100644 (file)
--- a/lustre/include/obd_support.h
+++ b/lustre/include/obd_support.h
@@ -257,6 +257,7 @@ extern unsigned int obd_alloc_fail_rate;
  #define OBD_FAIL_MDC_REVALIDATE_PAUSE    0x800
  #define OBD_FAIL_MDC_ENQUEUE_PAUSE       0x801
  #define OBD_FAIL_MDC_OLD_EXT_FLAGS       0x802
+#define OBD_FAIL_MDC_GETATTR_ENQUEUE     0x803
  
  #define OBD_FAIL_MGS                     0x900
  #define OBD_FAIL_MGS_ALL_REQUEST_NET     0x901
diff --git a/lustre/mdc/mdc_lib.c b/lustre/mdc/mdc_lib.c

index c66de79..bb02005 100644 (file)
--- a/lustre/mdc/mdc_lib.c
+++ b/lustre/mdc/mdc_lib.c
@@ -443,7 +443,7 @@ void mdc_exit_request(struct client_obd *cli)
  
          spin_lock(&cli->cl_loi_list_lock);
          cli->cl_r_in_flight--;
-                
+
          list_for_each_safe(l, tmp, &cli->cl_cache_waiters) {
                  if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) {
                          /* No free request slots anymore */
@@ -456,6 +456,6 @@ void mdc_exit_request(struct client_obd *cli)
                  wake_up(&mcw->mcw_waitq);
          }
          /* Empty waiting list? Decrease reqs in-flight number */
-        
+
          spin_unlock(&cli->cl_loi_list_lock);
  }
diff --git a/lustre/mdc/mdc_locks.c b/lustre/mdc/mdc_locks.c

index 44af012..85c9580 100644 (file)
--- a/lustre/mdc/mdc_locks.c
+++ b/lustre/mdc/mdc_locks.c
@@ -406,6 +406,7 @@ static int mdc_finish_enqueue(struct obd_export *exp,
          struct ldlm_reply *lockrep;
          ENTRY;
  
+        LASSERT(rc >= 0);
          /* Similarly, if we're going to replay this request, we don't want to
           * actually get a lock, just perform the intent. */
          if (req->rq_transno || req->rq_replay) {
@@ -418,12 +419,6 @@ static int mdc_finish_enqueue(struct obd_export *exp,
                  einfo->ei_mode = 0;
                  memset(lockh, 0, sizeof(*lockh));
                  rc = 0;
-        } else if (rc != 0) {
-                CERROR("ldlm_cli_enqueue: %d\n", rc);
-                LASSERTF(rc < 0, "rc %d\n", rc);
-                mdc_clear_replay_flag(req, rc);
-                ptlrpc_req_finished(req);
-                RETURN(rc);
          } else { /* rc = 0 */
                  struct ldlm_lock *lock = ldlm_handle2lock(lockh);
                  LASSERT(lock);
@@ -587,7 +582,12 @@ int mdc_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo,
                                0, NULL, lockh, 0);
          mdc_exit_request(&obddev->u.cli);
          mdc_put_rpc_lock(obddev->u.cli.cl_rpc_lock, it);
-
+        if (rc < 0) {
+                CERROR("ldlm_cli_enqueue: %d\n", rc);
+                mdc_clear_replay_flag(req, rc);
+                ptlrpc_req_finished(req);
+                RETURN(rc);
+        }
          rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc);
  
          RETURN(rc);
@@ -832,9 +832,16 @@ static int mdc_intent_getattr_async_interpret(struct ptlrpc_request *req,
          obddev = class_exp2obd(exp);
  
          mdc_exit_request(&obddev->u.cli);
+        if (OBD_FAIL_CHECK(OBD_FAIL_MDC_GETATTR_ENQUEUE))
+                rc = -ETIMEDOUT;
  
          rc = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, 1, einfo->ei_mode,
                                     &flags, NULL, 0, NULL, lockh, rc);
+        if (rc < 0) {
+                CERROR("ldlm_cli_enqueue: %d\n", rc);
+                mdc_clear_replay_flag(req, rc);
+                GOTO(out, rc);
+        }
  
          rc = mdc_finish_enqueue(exp, req, einfo, it, lockh, rc);
          if (rc)
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh

index 7517d15..838c19c 100644 (file)
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -4635,8 +4635,7 @@ test_122() { #bug 11544
  }
  run_test 122 "fail client bulk callback (shouldn't LBUG) ======="
  
-test_123() # statahead(bug 11401)
-{
+test_123a() { # was test 123, statahead(bug 11401)
          if [ -z "$(grep "processor.*: 1" /proc/cpuinfo)" ]; then
                  log "testing on UP system. Performance may be not as good as expected."
          fi
@@ -4657,6 +4656,7 @@ test_123() # statahead(bug 11401)
                  etime=`date +%s`
                  delta_sa=$((etime - stime))
                  log "ls $i files with statahead:    $delta_sa sec"
+               lctl get_param -n llite.*.statahead_stats
  
                  max=`lctl get_param -n llite.*.statahead_max | head -n 1`
                  lctl set_param -n llite.*.statahead_max 0
@@ -4692,7 +4692,26 @@ test_123() # statahead(bug 11401)
          [ $error -ne 0 ] && error "statahead is slow!"
          return 0
  }
-run_test 123 "verify statahead work"
+run_test 123a "verify statahead work" # registers test_123a (formerly test_123)
+
+test_123b () { # statahead(bug 15027): list a directory while the getattr-enqueue fail_loc is armed
+       mkdir -p $DIR/$tdir
+       createmany -o $DIR/$tdir/$tfile-%d 1000
+       # NOTE(review): presumably drops cached locks so the ls below must enqueue again -- confirm
+        cancel_lru_locks mdc
+        cancel_lru_locks osc
+
+#define OBD_FAIL_MDC_GETATTR_ENQUEUE     0x803
+        sysctl -w lustre.fail_loc=0x80000803
+        ls -lR $DIR/$tdir > /dev/null
+        log "ls done"
+        sysctl -w lustre.fail_loc=0x0
+        lctl get_param -n llite.*.statahead_stats
+        rm -r $DIR/$tdir
+        sync
+       # no explicit check here: per the description below, passing means the client did not panic
+}
+run_test 123b "not panic with network error in statahead enqueue (bug 15027)"
  
  test_124a() {
         [ -z "`lctl get_param -n mdc.*.connect_flags | grep lru_resize`" ] && \
author	shadow <shadow>
	Wed, 26 Mar 2008 06:22:07 +0000 (06:22 +0000)
committer	shadow <shadow>
	Wed, 26 Mar 2008 06:22:07 +0000 (06:22 +0000)
lustre/ChangeLog		patch \| blob \| history
lustre/include/obd_support.h		patch \| blob \| history
lustre/mdc/mdc_lib.c		patch \| blob \| history
lustre/mdc/mdc_locks.c		patch \| blob \| history
lustre/tests/sanity.sh		patch \| blob \| history