From a36ce42408b44a1e62f91213c28450b853347f93 Mon Sep 17 00:00:00 2001 From: shaver Date: Thu, 5 Dec 2002 00:31:32 +0000 Subject: [PATCH] Landing b_recovery. b=441: garbage on read from stripes with failed OSTs. b=438: mark OSCs as active before reconnecting during recovery. b=403: lov_enqueue and lov_cancel need to handle inactive OSTs --- lustre/llite/rw.c | 7 ++--- lustre/lov/lov_obd.c | 70 ++++++++++++++++++++++++++++++++++++++---------- lustre/osc/osc_request.c | 5 +++- lustre/tests/lov.xml | 70 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 134 insertions(+), 18 deletions(-) create mode 100644 lustre/tests/lov.xml diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index d0db956..095c145 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -111,9 +111,10 @@ static int ll_brw(int cmd, struct inode *inode, struct page *page, int create) set->brw_callback = ll_brw_sync_wait; rc = obd_brw(cmd, ll_i2obdconn(inode), lsm, 1, &pg, set); - if (rc) - CERROR("error from obd_brw: rc = %d\n", rc); - else { + if (rc) { + if (rc != -EIO) + CERROR("error from obd_brw: rc = %d\n", rc); + } else { rc = ll_brw_sync_wait(set, CB_PHASE_START); if (rc) CERROR("error from callback: rc = %d\n", rc); diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index 5ab02b1..1b92409 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -1112,6 +1112,8 @@ static inline int lov_brw(int cmd, struct lustre_handle *conn, for (i = 0, loi = lsm->lsm_oinfo, si_last = si = stripeinfo; i < stripe_count; i++, loi++, si_last = si, si++) { + if (lov->tgts[loi->loi_ost_idx].active == 0) + GOTO(out_ioarr, rc = -EIO); if (i > 0) si->index = si_last->index + si_last->bufct; si->lsm.lsm_object_id = loi->loi_id; @@ -1134,12 +1136,14 @@ static inline int lov_brw(int cmd, struct lustre_handle *conn, if (si->bufct) { LASSERT(shift < oa_bufs); - /* XXX handle error returns here */ - obd_brw(cmd, &lov->tgts[si->ost_idx].conn, - &si->lsm, si->bufct, &ioarr[shift], set); + rc = obd_brw(cmd, &lov->tgts[si->ost_idx].conn, + &si->lsm, si->bufct, &ioarr[shift], set); + if (rc) + GOTO(out_ioarr, rc); } } + out_ioarr: OBD_FREE(ioarr, sizeof(*ioarr) * oa_bufs); out_where: OBD_FREE(where, sizeof(*where) * oa_bufs); @@ -1158,6 +1162,7 @@ static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *lsm, struct obd_export *export = class_conn2export(conn); struct lov_obd *lov; struct lov_oinfo *loi; + struct lov_stripe_md submd; int rc = 0, i; ENTRY; @@ -1172,16 +1177,22 @@ static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *lsm, RETURN(-EINVAL); } - /* XXX assert that we're not in recovery */ + /* we should never be asked to replay a lock. */ + + LASSERT((*flags & LDLM_FL_REPLAY) == 0); if (!export || !export->exp_obd) RETURN(-ENODEV); + memset(lockhs, 0, sizeof(*lockhs) * lsm->lsm_stripe_count); + lov = &export->exp_obd->u.lov; for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) { struct ldlm_extent *extent = (struct ldlm_extent *)cookie; struct ldlm_extent sub_ext; - struct lov_stripe_md submd; + + if (lov->tgts[loi->loi_ost_idx].active == 0) + continue; *flags = 0; sub_ext.start = lov_stripe_offset(lsm, extent->start, i); @@ -1200,11 +1211,31 @@ static int lov_enqueue(struct lustre_handle *conn, struct lov_stripe_md *lsm, parent_lock, type, &sub_ext, sizeof(sub_ext), mode, flags, cb, data, datalen, &(lockhs[i])); // XXX add a lock debug statement here - if (rc) { + if (rc && lov->tgts[loi->loi_ost_idx].active) { CERROR("Error enqueue objid "LPX64" subobj "LPX64 " on OST idx %d: rc = %d\n", lsm->lsm_object_id, loi->loi_id, loi->loi_ost_idx, rc); - memset(&(lockhs[i]), 0, sizeof(lockhs[i])); + goto out_locks; + } + } + + RETURN(0); + + out_locks: + for (i--, loi = &lsm->lsm_oinfo[i]; i >= 0; i--, loi--) { + int err; + + if (lov->tgts[loi->loi_ost_idx].active == 0) + continue; + + submd.lsm_object_id = loi->loi_id; + submd.lsm_stripe_count = 0; + err = obd_cancel(&lov->tgts[loi->loi_ost_idx].conn, &submd, + mode, &lockhs[i]); + if (err) { + CERROR("Error cancelling objid "LPX64" subobj "LPX64 + " on OST idx %d after enqueue error: rc = %d\n", + loi->loi_id, loi->loi_ost_idx, err); } } RETURN(rc); @@ -1236,18 +1267,25 @@ static int lov_cancel(struct lustre_handle *conn, struct lov_stripe_md *lsm, lov = &export->exp_obd->u.lov; for (i = 0,loi = lsm->lsm_oinfo; i < lsm->lsm_stripe_count; i++,loi++) { struct lov_stripe_md submd; + int err; + + if (lov->tgts[loi->loi_ost_idx].active == 0) + continue; if (lockhs[i].addr == 0) continue; submd.lsm_object_id = loi->loi_id; submd.lsm_stripe_count = 0; - rc = obd_cancel(&lov->tgts[loi->loi_ost_idx].conn, &submd, + err = obd_cancel(&lov->tgts[loi->loi_ost_idx].conn, &submd, mode, &lockhs[i]); - if (rc) + if (err && lov->tgts[loi->loi_ost_idx].active) { CERROR("Error cancel objid "LPX64" subobj "LPX64 " on OST idx %d: rc = %d\n", lsm->lsm_object_id, - loi->loi_id, loi->loi_ost_idx, rc); + loi->loi_id, loi->loi_ost_idx, err); + if (!rc) + rc = err; + } } RETURN(rc); } @@ -1258,7 +1296,7 @@ static int lov_cancel_unused(struct lustre_handle *conn, struct obd_export *export = class_conn2export(conn); struct lov_obd *lov; struct lov_oinfo *loi; - int rc = 0, i; + int rc = 0, i, err; ENTRY; if (!lsm) { @@ -1275,13 +1313,17 @@ static int lov_cancel_unused(struct lustre_handle *conn, submd.lsm_object_id = loi->loi_id; submd.lsm_stripe_count = 0; - rc = obd_cancel_unused(&lov->tgts[loi->loi_ost_idx].conn, + err = obd_cancel_unused(&lov->tgts[loi->loi_ost_idx].conn, &submd, flags); - if (rc) + if (err && lov->tgts[loi->loi_ost_idx].active) { CERROR("Error cancel unused objid "LPX64" subobj "LPX64 " on OST idx %d: rc = %d\n", lsm->lsm_object_id, - loi->loi_id, loi->loi_ost_idx, rc); + loi->loi_id, loi->loi_ost_idx, err); + if (!rc) + rc = err; + } } + RETURN(rc); } diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 52703c8..a5302ef 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -963,9 +963,12 @@ static int osc_recover(struct obd_import *imp, int phase) RETURN(0); } case PTLRPC_RECOVD_PHASE_RECOVER: + imp->imp_flags &= ~IMP_INVALID; rc = ptlrpc_reconnect_import(imp, OST_CONNECT); - if (rc) + if (rc) { + imp->imp_flags |= IMP_INVALID; RETURN(rc); + } set_osc_active(imp, 1 /* active */); RETURN(0); default: diff --git a/lustre/tests/lov.xml b/lustre/tests/lov.xml new file mode 100644 index 0000000..532c1ec2 --- /dev/null +++ b/lustre/tests/lov.xml @@ -0,0 +1,70 @@ + + + + + + + + + + + + + + + + + + localhost + 988 + + + + extN + /tmp/mds1 + yes + + + + + + + + + + + + + + + extN + /tmp/ost1 + no + + + + + + + + + + + extN + /tmp/ost2 + no + + + + + + + + + + + + + /mnt/lustre + + -- 1.8.3.1