Whamcloud - gitweb
Landing b_recovery.
author shaver <shaver>
Thu, 5 Dec 2002 00:31:32 +0000 (00:31 +0000)
committer shaver <shaver>
Thu, 5 Dec 2002 00:31:32 +0000 (00:31 +0000)
b=441: garbage on read from stripes with failed OSTs.
b=438: mark OSCs as active before reconnecting during recovery.
b=403: lov_enqueue and lov_cancel need to handle inactive OSTs.

lustre/llite/rw.c
lustre/lov/lov_obd.c
lustre/osc/osc_request.c
lustre/tests/lov.xml [new file with mode: 0644]

index d0db956..095c145 100644 (file)
@@ -111,9 +111,10 @@ static int ll_brw(int cmd, struct inode *inode, struct page *page, int create)
 
         set->brw_callback = ll_brw_sync_wait;
         rc = obd_brw(cmd, ll_i2obdconn(inode), lsm, 1, &pg, set);
-        if (rc)
-                CERROR("error from obd_brw: rc = %d\n", rc);
-        else {
+        if (rc) {
+                if (rc != -EIO)
+                        CERROR("error from obd_brw: rc = %d\n", rc);
+        } else {
                 rc = ll_brw_sync_wait(set, CB_PHASE_START);
                 if (rc)
                         CERROR("error from callback: rc = %d\n", rc);
index 5ab02b1..1b92409 100644 (file)
@@ -1112,6 +1112,8 @@ static inline int lov_brw(int cmd, struct lustre_handle *conn,
 
         for (i = 0, loi = lsm->lsm_oinfo, si_last = si = stripeinfo;
              i < stripe_count; i++, loi++, si_last = si, si++) {
+                if (lov->tgts[loi->loi_ost_idx].active == 0)
+                        GOTO(out_ioarr, rc = -EIO);
                 if (i > 0)
                         si->index = si_last->index + si_last->bufct;
                 si->lsm.lsm_object_id = loi->loi_id;
@@ -1134,12 +1136,14 @@ static inline int lov_brw(int cmd, struct lustre_handle *conn,
 
                 if (si->bufct) {
                         LASSERT(shift < oa_bufs);
-                        /* XXX handle error returns here */
-                        obd_brw(cmd, &lov->tgts[si->ost_idx].conn,
-                                &si->lsm, si->bufct, &ioarr[shift], set);
+                        rc = obd_brw(cmd, &lov->tgts[si->ost_idx].conn,
+                                     &si->lsm, si->bufct, &ioarr[shift], set);
+                        if (rc)
+                                GOTO(out_ioarr, rc);
                 }
         }
 
+ out_ioarr:
         OBD_FREE(ioarr, sizeof(*ioarr) * oa_bufs);
  out_where:
         OBD_FREE(where, sizeof(*where) * oa_bufs);
@@ -1158,6 +1162,7 @@ static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *lsm,
         struct obd_export *export = class_conn2export(conn);
         struct lov_obd *lov;
         struct lov_oinfo *loi;
+        struct lov_stripe_md submd;
         int rc = 0, i;
         ENTRY;
 
@@ -1172,16 +1177,22 @@ static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *lsm,
                 RETURN(-EINVAL);
         }
 
-        /* XXX assert that we're not in recovery */
+        /* we should never be asked to replay a lock. */
+
+        LASSERT((*flags & LDLM_FL_REPLAY) == 0);
 
         if (!export || !export->exp_obd)
                 RETURN(-ENODEV);
 
+        memset(lockhs, 0, sizeof(*lockhs) * lsm->lsm_stripe_count);
+
         lov = &export->exp_obd->u.lov;
         for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
                 struct ldlm_extent *extent = (struct ldlm_extent *)cookie;
                 struct ldlm_extent sub_ext;
-                struct lov_stripe_md submd;
+
+                if (lov->tgts[loi->loi_ost_idx].active == 0)
+                        continue;
 
                 *flags = 0;
                 sub_ext.start = lov_stripe_offset(lsm, extent->start, i);
@@ -1200,11 +1211,31 @@ static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *lsm,
                                  parent_lock, type, &sub_ext, sizeof(sub_ext),
                                  mode, flags, cb, data, datalen, &(lockhs[i]));
                 // XXX add a lock debug statement here
-                if (rc) {
+                if (rc && lov->tgts[loi->loi_ost_idx].active) {
                         CERROR("Error enqueue objid "LPX64" subobj "LPX64
                                " on OST idx %d: rc = %d\n", lsm->lsm_object_id,
                                loi->loi_id, loi->loi_ost_idx, rc);
-                        memset(&(lockhs[i]), 0, sizeof(lockhs[i]));
+                        goto out_locks;
+                }
+        }
+
+        RETURN(0);
+
+ out_locks:
+        for (i--, loi = &lsm->lsm_oinfo[i]; i >= 0; i--, loi--) {
+                int err;
+                
+                if (lov->tgts[loi->loi_ost_idx].active == 0)
+                        continue;
+
+                submd.lsm_object_id = loi->loi_id;
+                submd.lsm_stripe_count = 0;
+                err = obd_cancel(&lov->tgts[loi->loi_ost_idx].conn, &submd,
+                                 mode, &lockhs[i]);
+                if (err) {
+                        CERROR("Error cancelling objid "LPX64" subobj "LPX64
+                               " on OST idx %d after enqueue error: rc = %d\n",
+                               loi->loi_id, loi->loi_ost_idx, err);
                 }
         }
         RETURN(rc);
@@ -1236,18 +1267,25 @@ static int lov_cancel(struct lustre_handle *conn, struct lov_stripe_md *lsm,
         lov = &export->exp_obd->u.lov;
         for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) {
                 struct lov_stripe_md submd;
+                int err;
+
+                if (lov->tgts[loi->loi_ost_idx].active == 0)
+                        continue;
 
                 if (lockhs[i].addr == 0)
                         continue;
 
                 submd.lsm_object_id = loi->loi_id;
                 submd.lsm_stripe_count = 0;
-                rc = obd_cancel(&lov->tgts[loi->loi_ost_idx].conn, &submd,
+                err = obd_cancel(&lov->tgts[loi->loi_ost_idx].conn, &submd,
                                 mode, &lockhs[i]);
-                if (rc)
+                if (err && lov->tgts[loi->loi_ost_idx].active) {
                         CERROR("Error cancel objid "LPX64" subobj "LPX64
                                " on OST idx %d: rc = %d\n", lsm->lsm_object_id,
-                               loi->loi_id, loi->loi_ost_idx, rc);
+                               loi->loi_id, loi->loi_ost_idx, err);
+                        if (!rc)
+                                rc = err;
+                }
         }
         RETURN(rc);
 }
@@ -1258,7 +1296,7 @@ static int lov_cancel_unused(struct lustre_handle *conn,
         struct obd_export *export = class_conn2export(conn);
         struct lov_obd *lov;
         struct lov_oinfo *loi;
-        int rc = 0, i;
+        int rc = 0, i, err;
         ENTRY;
 
         if (!lsm) {
@@ -1275,13 +1313,17 @@ static int lov_cancel_unused(struct lustre_handle *conn,
 
                 submd.lsm_object_id = loi->loi_id;
                 submd.lsm_stripe_count = 0;
-                rc = obd_cancel_unused(&lov->tgts[loi->loi_ost_idx].conn,
+                err = obd_cancel_unused(&lov->tgts[loi->loi_ost_idx].conn,
                                        &submd, flags);
-                if (rc)
+                if (err && lov->tgts[loi->loi_ost_idx].active) {
                         CERROR("Error cancel unused objid "LPX64" subobj "LPX64
                                " on OST idx %d: rc = %d\n", lsm->lsm_object_id,
-                               loi->loi_id, loi->loi_ost_idx, rc);
+                               loi->loi_id, loi->loi_ost_idx, err);
+                        if (!rc)
+                                rc = err;
+                }
         }
+
         RETURN(rc);
 }
 
index 52703c8..a5302ef 100644 (file)
@@ -963,9 +963,12 @@ static int osc_recover(struct obd_import *imp, int phase)
                 RETURN(0);
             }
             case PTLRPC_RECOVD_PHASE_RECOVER:
+                imp->imp_flags &= ~IMP_INVALID;
                 rc = ptlrpc_reconnect_import(imp, OST_CONNECT);
-                if (rc)
+                if (rc) {
+                        imp->imp_flags |= IMP_INVALID;
                         RETURN(rc);
+                }
                 set_osc_active(imp, 1 /* active */);
                 RETURN(0);
             default:
diff --git a/lustre/tests/lov.xml b/lustre/tests/lov.xml
new file mode 100644 (file)
index 0000000..532c1ec
--- /dev/null
@@ -0,0 +1,70 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<!DOCTYPE lustre>
+<lustre>
+  <ldlm name='ldlm' uuid='ldlm_UUID'/>
+  <node name='localhost' uuid='localhost_UUID'>
+    <profile>
+      <ldlm_ref uuidref='ldlm_UUID'/>
+      <network_ref uuidref='NET_localhost_tcp_UUID'/>
+      <mds_ref uuidref='mds1_UUID'/>
+      <lovconfig_ref uuidref='LVCFG_lov1_UUID'/>
+      <obd_ref uuidref='OBD_localhost_UUID'/>
+      <ost_ref uuidref='OST_localhost_UUID'/>
+      <obd_ref uuidref='OBD_localhost_2_UUID'/>
+      <ost_ref uuidref='OST_localhost_2_UUID'/>
+      <mountpoint_ref uuidref='MNT_localhost_UUID'/>
+    </profile>
+    <network name='NET_localhost_tcp' uuid='NET_localhost_tcp_UUID' type='tcp'>
+      <server>localhost</server>
+      <port>988</port>
+    </network>
+  </node>
+  <mds name='mds1' uuid='mds1_UUID'>
+    <fstype>extN</fstype>
+    <device size='50000'>/tmp/mds1</device>
+    <autoformat>yes</autoformat>
+    <network_ref uuidref='NET_localhost_tcp_UUID'/>
+    <node_ref uuidref='localhost_UUID'/>
+  </mds>
+  <lov name='lov1' uuid='lov1_UUID'>
+    <mds_ref uuidref='mds1_UUID'/>
+    <devices stripecount='0' stripesize='65536' pattern='0'>
+      <osc_ref uuidref='OSC_localhost_UUID'/>
+      <osc_ref uuidref='OSC_localhost_2_UUID'/>
+    </devices>
+  </lov>
+  <lovconfig name='LVCFG_lov1' uuid='LVCFG_lov1_UUID'>
+    <lov_ref uuidref='lov1_UUID'/>
+  </lovconfig>
+  <obd uuid='OBD_localhost_UUID' name='OBD_localhost' type='obdfilter'>
+    <fstype>extN</fstype>
+    <device size='100000'>/tmp/ost1</device>
+    <autoformat>no</autoformat>
+  </obd>
+  <osc name='OSC_localhost' uuid='OSC_localhost_UUID'>
+    <ost_ref uuidref='OST_localhost_UUID'/>
+    <obd_ref uuidref='OBD_localhost_UUID'/>
+  </osc>
+  <ost name='OST_localhost' uuid='OST_localhost_UUID'>
+    <network_ref uuidref='NET_localhost_tcp_UUID'/>
+    <obd_ref uuidref='OBD_localhost_UUID'/>
+  </ost>
+  <obd name='OBD_localhost_2' uuid='OBD_localhost_2_UUID' type='obdfilter'>
+    <fstype>extN</fstype>
+    <device size='100000'>/tmp/ost2</device>
+    <autoformat>no</autoformat>
+  </obd>
+  <osc name='OSC_localhost_2' uuid='OSC_localhost_2_UUID'>
+    <ost_ref uuidref='OST_localhost_2_UUID'/>
+    <obd_ref uuidref='OBD_localhost_2_UUID'/>
+  </osc>
+  <ost name='OST_localhost_2' uuid='OST_localhost_2_UUID'>
+    <network_ref uuidref='NET_localhost_tcp_UUID'/>
+    <obd_ref uuidref='OBD_localhost_2_UUID'/>
+  </ost>
+  <mountpoint name='MNT_localhost' uuid='MNT_localhost_UUID'>
+    <mds_ref uuidref='mds1_UUID'/>
+    <osc_ref uuidref='lov1_UUID'/>
+    <path>/mnt/lustre</path>
+  </mountpoint>
+</lustre>