and marks the connection(s) as invalid, so that future operations fail.
- Make the NEWCONN ioctl clear the CONN_INVALID flag.
- Remove a handful of unused members from ptlrpc_request.
- More informative and uniform req-failure (timeout, recovery, umount -f)
reporting.
- Update runfailure-net a little; more work needed here.
- Add --force / -f to lconf for "umount -f" umounting.
#define OST_MAXREQSIZE (8 * 1024)
#endif
+#define CONN_INVALID 1
+
struct ptlrpc_connection {
struct list_head c_link;
struct lustre_peer c_peer;
struct list_head c_imports;
struct list_head c_exports;
struct list_head c_sb_chain;
+ __u32 c_flags; /* can we indicate INVALID elsewhere? */
};
struct ptlrpc_client {
struct ptlrpc_request {
int rq_type; /* one of PTL_RPC_MSG_* */
struct list_head rq_list;
- struct list_head rq_multi;
struct obd_device *rq_obd;
int rq_status;
int rq_flags;
- __u32 rq_connid;
atomic_t rq_refcount;
int rq_reqlen;
__u64 rq_transno;
__u64 rq_xid;
- char *rq_bulkbuf;
- int rq_bulklen;
-
int rq_level;
- time_t rq_time;
time_t rq_timeout;
// void * rq_reply_handle;
wait_queue_head_t rq_wait_for_rep;
/* incoming reply */
ptl_md_t rq_reply_md;
- ptl_handle_md_t rq_reply_md_h;
+ ptl_handle_md_t rq_reply_md_h; /* we can lose this: set, never read */
ptl_handle_me_t rq_reply_me_h;
/* outgoing req/rep */
ptl_md_t rq_req_md;
- ptl_handle_md_t rq_req_md_h;
struct lustre_peer rq_peer; /* XXX see service.c can this be factored away? */
struct obd_export *rq_export;
}
}
+/* Fail out every request queued on req_list: mark each with
+ * PTL_RPC_FL_ERR and wake its sleeping sender so it can abort.
+ * Used by "umount -f" teardown; caller holds the connection's c_lock. */
+static inline void invalidate_request_list(struct list_head *req_list)
+{
+ struct list_head *tmp, *n;
+ list_for_each_safe(tmp, n, req_list) {
+ struct ptlrpc_request *req =
+ list_entry(tmp, struct ptlrpc_request, rq_list);
+ /* rq_xid is __u64: use LPD64 (as elsewhere), not %d */
+ CERROR("invalidating req xid "LPD64" op %d to %s:%d\n",
+ (unsigned long long)req->rq_xid, req->rq_reqmsg->opc,
+ req->rq_connection->c_remote_uuid,
+ req->rq_import->imp_client->cli_request_portal);
+ req->rq_flags |= PTL_RPC_FL_ERR;
+ wake_up(&req->rq_wait_for_rep);
+ }
+}
+
+/* "umount -f" entry point: mark every connection hanging off this
+ * superblock CONN_INVALID and fail all of its sending/delayed requests,
+ * so the forced unmount does not block waiting for replies. */
+void ll_umount_begin(struct super_block *sb)
+{
+ struct ll_sb_info *sbi = ll_s2sbi(sb);
+ struct list_head *ctmp;
+
+ ENTRY;
+
+ list_for_each(ctmp, &sbi->ll_conn_chain) {
+ struct ptlrpc_connection *conn;
+ conn = list_entry(ctmp, struct ptlrpc_connection, c_sb_chain);
+
+ spin_lock(&conn->c_lock);
+ conn->c_flags |= CONN_INVALID;
+ invalidate_request_list(&conn->c_sending_head);
+ invalidate_request_list(&conn->c_delayed_head);
+ /* fix: release the lock we took above (was &conn->c_unlock,
+ * which is not a member of ptlrpc_connection) */
+ spin_unlock(&conn->c_lock);
+ }
+
+ EXIT;
+}
+
/* exported operations */
struct super_operations ll_super_operations =
{
clear_inode: ll_clear_inode,
delete_inode: ll_delete_inode,
put_super: ll_put_super,
- statfs: ll_statfs
+ statfs: ll_statfs,
+ umount_begin: ll_umount_begin
};
struct file_system_type lustre_lite_fs_type = {
request->rq_connection = ptlrpc_connection_addref(conn);
INIT_LIST_HEAD(&request->rq_list);
- INIT_LIST_HEAD(&request->rq_multi);
/*
* This will be reduced once when the sender is finished (waiting for
* reply, f.e.), once when the request has been committed and is
}
ptlrpc_put_connection(request->rq_connection);
- list_del(&request->rq_multi);
OBD_FREE(request, sizeof(*request));
EXIT;
}
struct ptlrpc_request *req = data;
ENTRY;
- CERROR("req timeout on connid %d xid %Ld portal %d op %d\n",
- req->rq_connid, (unsigned long long)req->rq_xid,
- req->rq_import->imp_client->cli_request_portal,
- req->rq_reqmsg->opc);
+ CERROR("req xid "LPD64" op %d: timeout on conn to %s:%d\n",
+ (unsigned long long)req->rq_xid, req->rq_reqmsg->opc,
+ req->rq_connection->c_remote_uuid,
+ req->rq_import->imp_client->cli_request_portal);
req->rq_flags |= PTL_RPC_FL_TIMEOUT;
if (!req->rq_import->imp_connection->c_recovd_data.rd_recovd)
RETURN(1);
/* XXX probably both an import and connection level are needed */
if (req->rq_level > conn->c_level) {
- CERROR("pid %d waiting for recovery (%d > %d) on conn %p(%s)\n",
- current->pid, req->rq_level, conn->c_level, conn,
- conn->c_remote_uuid);
-
spin_lock(&conn->c_lock);
+ if (conn->c_flags & CONN_INVALID) {
+ /* being torn down by "umount -f" */
+ CERROR("req xid "LPD64" op %d to %s:%d: CONN_INVALID\n",
+ (unsigned long long)req->rq_xid,
+ req->rq_reqmsg->opc,
+ req->rq_connection->c_remote_uuid,
+ req->rq_import->imp_client->cli_request_portal);
+ spin_unlock(&conn->c_lock);
+ RETURN(-EIO);
+ }
list_del(&req->rq_list);
list_add_tail(&req->rq_list, &conn->c_delayed_head);
spin_unlock(&conn->c_lock);
+ CERROR("req xid "LPD64" op %d to %s:%d: waiting for recovery "
+ "(%d < %d)\n",
+ (unsigned long long)req->rq_xid, req->rq_reqmsg->opc,
+ req->rq_connection->c_remote_uuid,
+ req->rq_import->imp_client->cli_request_portal,
+ req->rq_level, conn->c_level);
+
lwi = LWI_INTR(NULL, NULL);
rc = l_wait_event(req->rq_wait_for_rep,
- req->rq_level <= conn->c_level, &lwi);
+ (req->rq_level <= conn->c_level) ||
+ (req->rq_flags & PTL_RPC_FL_ERR), &lwi);
spin_lock(&conn->c_lock);
list_del_init(&req->rq_list);
spin_unlock(&conn->c_lock);
+ if (req->rq_flags & PTL_RPC_FL_ERR)
+ RETURN(-EIO);
+
if (rc)
RETURN(rc);
-
+
CERROR("process %d resumed\n", current->pid);
}
resend:
- req->rq_time = CURRENT_TIME;
req->rq_timeout = obd_timeout;
spin_lock(&conn->c_lock);
+ if (conn->c_flags & CONN_INVALID) {
+ CERROR("req xid "LPD64" op %d to %s:%d: CONN_INVALID\n",
+ (unsigned long long)req->rq_xid, req->rq_reqmsg->opc,
+ req->rq_connection->c_remote_uuid,
+ req->rq_import->imp_client->cli_request_portal);
+ spin_unlock(&conn->c_lock); /* being torn down by "umount -f" */
+ RETURN(-EIO);
+ }
+
list_del(&req->rq_list);
list_add_tail(&req->rq_list, &conn->c_sending_head);
spin_unlock(&conn->c_lock);
req->rq_xid, req->rq_reqmsg->opc, req->rq_level,
req->rq_connection->c_level);
- req->rq_time = CURRENT_TIME;
req->rq_timeout = obd_timeout;
req->rq_reqmsg->addr = req->rq_import->imp_handle.addr;
req->rq_reqmsg->cookie = req->rq_import->imp_handle.cookie;
c->c_generation = 1;
c->c_epoch = 1;
c->c_bootcount = 0;
+ c->c_flags = 0;
if (uuid)
strcpy(c->c_remote_uuid, uuid);
INIT_LIST_HEAD(&c->c_delayed_head);
goto out;
}
+
/* else (NEWCONN) */
- if (conn->c_recovd_data.rd_phase != RD_PREPARING)
+ spin_lock(&conn->c_lock);
+
+ /* whatever happens, reset the INVALID flag */
+ conn->c_flags &= ~CONN_INVALID;
+
+ /* XXX is this a good check? should we allow readdressing of
+ * XXX conns that aren't in recovery?
+ */
+ if (conn->c_recovd_data.rd_phase != RD_PREPARING) {
+ spin_unlock(&conn->c_lock);
GOTO(out, rc = -EALREADY);
+ }
- spin_lock(&conn->c_lock);
if (data->ioc_inllen2) {
CERROR("conn %p UUID change %s -> %s\n",
conn, conn->c_remote_uuid, data->ioc_inlbuf2);
#!/bin/sh
-set -vx
-
-SRCDIR="`dirname $0`"
-. $SRCDIR/common.sh
+fail() {
+ echo "ERROR: $1" 1>&2
+ [ $2 ] && RC=$2 || RC=1
+ exit $RC
+}
+# test_fail <fail_loc> <cmd...>: arm the given lustre fail_loc so <cmd>
+# hangs, wait out the (shortened) lustre timeout, kill the hung command,
+# then force-unmount and remount so the next test starts clean.
test_fail() {
+ oldtimeout=`cat /proc/sys/lustre/timeout`
+ echo $TIMEOUT > /proc/sys/lustre/timeout
echo $1 > /proc/sys/lustre/fail_loc
shift
$* &
- sleep 1
+ sleep $TIMEOUT
+ sleep 2 # fudge
kill -9 $!
+ echo $oldtimeout > /proc/sys/lustre/timeout
echo 0 > /proc/sys/lustre/fail_loc
- umount /mnt/lustre || fail "cannot unmount /mnt/lustre"
- mount -t lustre_lite -o device=`$OBDCTL name2dev OSCDEV` none /mnt/lustre || fail "cannot remount device '`$OBDCTL name2dev OSCDEV`' on /mnt/lustre"
+ umount -f /mnt/lustre || fail "cannot unmount /mnt/lustre"
+ mount -t lustre_lite -o "osc=$OSC,mdc=$MDC" none /mnt/lustre || \
+ fail "cannot remount $OSC/$MDC on /mnt/lustre"
}
-[ -c /dev/request ] || mknod /dev/request c 10 244
+set -vx
+
+LCTL=../utils/lctl
+OSC=OSC_localhost_UUID
+MDC=MDC_client1_UUID
+TIMEOUT=5 # complete in finite time
[ "`mount | grep /mnt/lustre`" ] || echo | sh llmount.sh || exit -1
# GETATTR_NET - ls will hang on the getattr
-test_fail 0x102 ls -l /mnt/lustre
+# test_fail 0x102 ls -l /mnt/lustre
# READPAGE_NET - ls will hang reading in new pages (lost+found is not in cache)
test_fail 0x104 ls /mnt/lustre
--get <url> URL to fetch a config file
--node <nodename> Load config for <nodename>
-d | --cleanup Cleans up config. (Shutdown)
+-f | --force Unmount with \"umount -f\" during shutdown
-v | --verbose Print system commands as they are run
-h | --help Print this help
--gdb Prints message after creating gdb module script
self._gdb = 0
self._nomod = 0
self._nosetup = 0
+ self._force = 0
# parameters
self._modules = None
self._node = None
if flag: self._nosetup = flag
return self._nosetup
+ def force(self, flag = None):
+ if flag: self._force = flag
+ return self._flag
+
def node(self, val = None):
if val: self._node = val
return self._node
def cleanup(self):
self.info(self.path, self.mds_uuid,self.lov_uuid)
- (rc, out) = run("umount", self.path)
+ if config.force():
+ (rc, out) = run("umount -f", self.path)
+ else:
+ (rc, out) = run("umount", self.path)
if rc:
log("umount failed, cleanup will most likely not work.")
l = lookup(self.dom_node.parentNode, self.lov_uuid)
config.portals = a
if o == "--lustre":
config.lustre = a
- if o == "--reformat":
+ if o == "--reformat":
config.reformat(1)
- if o == "--node":
+ if o == "--node":
config.node(a)
- if o == "--gdb":
+ if o == "--gdb":
config.gdb(1)
- if o == "--nomod":
+ if o == "--nomod":
config.nomod(1)
- if o == "--nosetup":
+ if o == "--nosetup":
config.nosetup(1)
- if o == "--dump":
+ if o == "--dump":
config.dump_file(a)
+ if o in ("--force", "-f"):
+ config.force(1)
return args
def fetch(url):