From 91cda45a19f16ccfba3da37fa4a7e03f4baf6c8b Mon Sep 17 00:00:00 2001 From: shaver Date: Mon, 16 Dec 2002 02:04:25 +0000 Subject: [PATCH] Land b_recovery (other than WIP lock-replay work). - b=512,others: fully reactivate OST imports after reconnection. - b=513(partial): make sure client sees our -ENOTCONN from mds_handle - b=515: more graceful error handling for truncating on dead OST - b=474: don't error out just because a file is striped across a dead OST; only if we actually try to access it - b=410: fix garbage sizes when stat(2)ing a file that includes a stripe on a dead OST - print console diagnostic for completion-timeout client recovery - MSG_RESENT for requests that were retransmitted --- lustre/ChangeLog | 5 +++++ lustre/include/linux/lustre_idl.h | 8 +++++++- lustre/ldlm/ldlm_request.c | 6 +++++- lustre/llite/file.c | 4 +--- lustre/llite/rw.c | 11 ++++------- lustre/lov/lov_obd.c | 34 ++++++++++++++++++++-------------- lustre/mds/handler.c | 4 +++- lustre/osc/osc_request.c | 7 +++++++ lustre/ptlrpc/client.c | 3 ++- 9 files changed, 54 insertions(+), 28 deletions(-) diff --git a/lustre/ChangeLog b/lustre/ChangeLog index 8495e29..8da1889 100644 --- a/lustre/ChangeLog +++ b/lustre/ChangeLog @@ -1,5 +1,10 @@ TBA * bug fixes + - Fully reactivate OST imports after reconnection (512, others) + - Make sure client sees our -ENOTCONN from mds_handle (513 - partial) + - More graceful error handling for truncating on dead OST (515) + - Don't error out unless we're actually accessing dead stripes (474) + - Fix garbage sizes when stripes are missing (410) - LRU counters were broken, causing constant lock purge (433, 432) - garbage on read from stripes with failed OSTs (441) - mark OSCs as active before reconnecting during recovery (438) diff --git a/lustre/include/linux/lustre_idl.h b/lustre/include/linux/lustre_idl.h index 8d6536f..0febd11 100644 --- a/lustre/include/linux/lustre_idl.h +++ b/lustre/include/linux/lustre_idl.h @@ -128,16 +128,22 @@ struct lustre_msg 
{ /* Flags that apply to all requests are in the bottom 16 bits */ #define MSG_GEN_FLAG_MASK 0x0000ffff #define MSG_LAST_REPLAY 1 +#define MSG_RESENT 2 static inline int lustre_msg_get_flags(struct lustre_msg *msg) { return (msg->flags & MSG_GEN_FLAG_MASK); } +static inline void lustre_msg_add_flags(struct lustre_msg *msg, int flags) +{ + msg->flags |= MSG_GEN_FLAG_MASK & flags; +} + static inline void lustre_msg_set_flags(struct lustre_msg *msg, int flags) { msg->flags &= ~MSG_GEN_FLAG_MASK; - msg->flags |= MSG_GEN_FLAG_MASK & flags; + lustre_msg_add_flags(msg, flags); } static inline int lustre_msg_get_op_flags(struct lustre_msg *msg) diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index 7d449ef..7a972b9 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -44,8 +44,12 @@ static int expired_completion_wait(void *data) CERROR("lock %p has NULL obd\n", lock); else if (!(conn = obd->u.cli.cl_import.imp_connection)) CERROR("lock %p has NULL connection\n", lock); - else + else { + LDLM_DEBUG(lock, "timed out waiting for completion"); + CERROR("lock %p timed out from %s\n", lock, + conn->c_remote_uuid); class_signal_connection_failure(conn); + } RETURN(0); } diff --git a/lustre/llite/file.c b/lustre/llite/file.c index b265ffc..87c9012 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -158,7 +158,6 @@ out_mdc: mdc_close(&sbi->ll_mdc_conn, inode->i_ino, S_IFREG, &fd->fd_mdshandle, &req); out_req: - ptlrpc_req_finished(req); /* once for reply */ ptlrpc_req_finished(req); /* once for an early "commit" */ //out_fd: fd->fd_mdshandle.cookie = DEAD_HANDLE_MAGIC; @@ -249,6 +248,7 @@ int ll_file_size(struct inode *inode, struct lov_stripe_md *lsm) RETURN(rc); } + memset(&oa, 0, sizeof oa); oa.o_id = lsm->lsm_object_id; oa.o_mode = S_IFREG; oa.o_valid = OBD_MD_FLID|OBD_MD_FLTYPE|OBD_MD_FLSIZE|OBD_MD_FLBLOCKS; @@ -796,8 +796,6 @@ int ll_fsync(struct file *file, struct dentry *dentry, int data) return 0; } - - static int 
ll_inode_revalidate(struct dentry *dentry) { struct inode *inode = dentry->d_inode; diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index c572590..7f486fb 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -180,18 +180,15 @@ void ll_truncate(struct inode *inode) err = ll_size_lock(inode, lsm, inode->i_size, LCK_PW, &lockhs); if (err) { CERROR("ll_size_lock failed: %d\n", err); - /* FIXME: What to do here? It's too late to back out... */ - LBUG(); + return; } /* truncate == punch from new size to absolute end of file */ err = obd_punch(ll_i2obdconn(inode), &oa, lsm, inode->i_size, OBD_OBJECT_EOF); - if (err) { - LBUG(); - CERROR("obd_truncate fails (%d) ino %lu\n", err, - inode->i_ino); - } else + if (err) + CERROR("obd_truncate fails (%d) ino %lu\n", err, inode->i_ino); + else obdo_to_inode(inode, &oa, oa.o_valid); err = ll_size_unlock(inode, lsm, LCK_PW, lockhs); diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index e7cf3cb..d2dc23c 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -6,6 +6,7 @@ * Copyright (C) 2002 Cluster File Systems, Inc. * Author: Phil Schwan * Peter Braam + * Mike Shaver * * This code is issued under the GNU General Public License. 
* See the file COPYING in this distribution @@ -700,13 +701,17 @@ static int lov_getattr(struct lustre_handle *conn, struct obdo *oa, tmp.o_valid &= ~OBD_MD_FLHANDLE; err = obd_getattr(&lov->tgts[loi->loi_ost_idx].conn, &tmp,NULL); - if (err && lov->tgts[loi->loi_ost_idx].active) { - CERROR("Error getattr objid "LPX64" subobj "LPX64 - " on OST idx %d: rc = %d\n", - oa->o_id, loi->loi_id, loi->loi_ost_idx, err); - RETURN(err); + if (err) { + if (lov->tgts[loi->loi_ost_idx].active) { + CERROR("Error getattr objid "LPX64" subobj " + LPX64" on OST idx %d: rc = %d\n", + oa->o_id, loi->loi_id, loi->loi_ost_idx, + err); + RETURN(err); + } + } else { + lov_merge_attrs(oa, &tmp, tmp.o_valid, lsm, i, &new); } - lov_merge_attrs(oa, &tmp, tmp.o_valid, lsm, i, &new); } RETURN(0); @@ -832,12 +837,15 @@ static int lov_open(struct lustre_handle *conn, struct obdo *oa, tmp->o_id = loi->loi_id; rc = obd_open(&lov->tgts[loi->loi_ost_idx].conn, tmp, NULL); - if (rc && lov->tgts[loi->loi_ost_idx].active) { - CERROR("Error open objid "LPX64" subobj "LPX64 - " on OST idx %d: rc = %d\n", - oa->o_id, lsm->lsm_oinfo[i].loi_id, - loi->loi_ost_idx, rc); - goto out_handles; + if (rc) { + if (lov->tgts[loi->loi_ost_idx].active) { + CERROR("Error open objid "LPX64" subobj "LPX64 + " on OST idx %d: rc = %d\n", + oa->o_id, lsm->lsm_oinfo[i].loi_id, + loi->loi_ost_idx, rc); + goto out_handles; + } + continue; } lov_merge_attrs(oa, tmp, tmp->o_valid, lsm, i, &new); @@ -1112,8 +1120,6 @@ static inline int lov_brw(int cmd, struct lustre_handle *conn, for (i = 0, loi = lsm->lsm_oinfo, si_last = si = stripeinfo; i < stripe_count; i++, loi++, si_last = si, si++) { - if (lov->tgts[loi->loi_ost_idx].active == 0) - GOTO(out_ioarr, rc = -EIO); if (i > 0) si->index = si_last->index + si_last->bufct; si->lsm.lsm_object_id = loi->loi_id; diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index 8eab05f..dea3f57 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -1262,8 +1262,10 @@ int 
mds_handle(struct ptlrpc_request *req) LASSERT(!strcmp(req->rq_obd->obd_type->typ_name, LUSTRE_MDT_NAME)); if (req->rq_reqmsg->opc != MDS_CONNECT) { - if (req->rq_export == NULL) + if (req->rq_export == NULL) { + req->rq_status = -ENOTCONN; GOTO(out, rc = -ENOTCONN); + } mds = mds_req2mds(req); if (mds->mds_recoverable_clients != 0) { diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index 2412435..30aa36d 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -965,6 +965,7 @@ static int osc_recover(struct obd_import *imp, int phase) set_osc_active(imp, 0 /* inactive */); RETURN(0); } + case PTLRPC_RECOVD_PHASE_RECOVER: imp->imp_flags &= ~IMP_INVALID; rc = ptlrpc_reconnect_import(imp, OST_CONNECT); @@ -972,8 +973,14 @@ static int osc_recover(struct obd_import *imp, int phase) imp->imp_flags |= IMP_INVALID; RETURN(rc); } + + spin_lock(&imp->imp_lock); + imp->imp_level = LUSTRE_CONN_FULL; + spin_unlock(&imp->imp_lock); + set_osc_active(imp, 1 /* active */); RETURN(0); + default: RETURN(-EINVAL); } diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index 069fd2a..09b0662 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -412,7 +412,7 @@ static int ptlrpc_check_status(struct ptlrpc_request *req) err = req->rq_repmsg->status; if (req->rq_repmsg->type == NTOH__u32(PTL_RPC_MSG_ERR)) { - DEBUG_REQ(D_ERROR, req, "type == PTL_RPC_MSG_ERR"); + DEBUG_REQ(D_ERROR, req, "type == PTL_RPC_MSG_ERR (%d)\n", err); RETURN(err ? err : -EINVAL); } @@ -682,6 +682,7 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req) if ((req->rq_flags & (PTL_RPC_FL_RESEND | PTL_RPC_FL_INTR)) == PTL_RPC_FL_RESEND) { req->rq_flags &= ~PTL_RPC_FL_RESEND; + lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT); DEBUG_REQ(D_HA, req, "resending: "); goto resend; } -- 1.8.3.1