Whamcloud - gitweb
* merge b_recovery into HEAD
author rread <rread>
Fri, 20 Dec 2002 08:22:42 +0000 (08:22 +0000)
committer rread <rread>
Fri, 20 Dec 2002 08:22:42 +0000 (08:22 +0000)
(the midnight merge troll rides again)

13 files changed:
lustre/include/linux/lustre_ha.h
lustre/include/linux/lustre_lib.h
lustre/include/linux/lustre_net.h
lustre/lib/client.c
lustre/lov/lov_obd.c
lustre/mdc/mdc_request.c
lustre/osc/osc_request.c
lustre/ptlrpc/client.c
lustre/ptlrpc/rpc.c
lustre/tests/local.sh
lustre/tests/lov.xml [new file with mode: 0644]
lustre/utils/lconf.in
lustre/utils/lmc

index 1e6596b..bfac4c3 100644 (file)
@@ -29,6 +29,7 @@ struct ptlrpc_connection;
 #define PTLRPC_RECOVD_PHASE_PREPARE  1
 #define PTLRPC_RECOVD_PHASE_RECOVER  2
 #define PTLRPC_RECOVD_PHASE_FAILURE  3
+#define PTLRPC_RECOVD_PHASE_NOTCONN  4
 
 typedef int (*ptlrpc_recovery_cb_t)(struct recovd_data *, int);
 
index 0372504..aa58c49 100644 (file)
@@ -64,6 +64,7 @@ int client_obd_disconnect(struct lustre_handle *conn);
 int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf);
 int client_obd_cleanup(struct obd_device * obddev);
 struct client_obd *client_conn2cli(struct lustre_handle *conn); 
+struct obd_device *client_tgtuuid2obd(char *tgtuuid);
 
 int target_revoke_connection(struct recovd_data *rd, int phase);
 
index fb060d0..142db3b 100644 (file)
@@ -308,6 +308,7 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req);
 void ptlrpc_continue_req(struct ptlrpc_request *req);
 int ptlrpc_replay_req(struct ptlrpc_request *req);
 void ptlrpc_restart_req(struct ptlrpc_request *req);
+void ptlrpc_abort_inflight(struct obd_import *imp);
 
 struct ptlrpc_request *ptlrpc_prep_req(struct obd_import *imp, int opcode,
                                        int count, int *lengths, char **bufs);
index c75b399..03fa4e2 100644 (file)
@@ -40,6 +40,24 @@ struct client_obd *client_conn2cli(struct lustre_handle *conn)
         return &export->exp_obd->u.cli;
 }
 
+/* Find the client device (OSC or MDC) that talks to target @tgtuuid.
+ *
+ * Walks all MAX_OBD_DEVICES slots of obd_dev[] and, for each device whose
+ * type name is LUSTRE_OSC_NAME or LUSTRE_MDC_NAME, compares its
+ * cl_target_uuid against @tgtuuid.
+ *
+ * Returns the first matching device, or NULL when @tgtuuid is NULL or no
+ * client device has that target uuid.
+ */
+struct obd_device *client_tgtuuid2obd(char *tgtuuid)
+{
+        int i;
+
+        if (tgtuuid == NULL)
+                return NULL;
+
+        for (i = 0; i < MAX_OBD_DEVICES; i++) {
+                struct obd_device *obd = &obd_dev[i];
+                struct client_obd *cli;
+
+                /* Every slot is visited, not only configured ones, so do
+                 * not dereference a missing type.
+                 * NOTE(review): assumes unattached obd_dev[] slots leave
+                 * obd_type NULL -- confirm against the attach code. */
+                if (obd->obd_type == NULL)
+                        continue;
+
+                /* Only OSC and MDC devices carry a client_obd in u.cli. */
+                if (strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) != 0 &&
+                    strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) != 0)
+                        continue;
+
+                cli = &obd->u.cli;
+                if (strncmp(tgtuuid, cli->cl_target_uuid,
+                            sizeof(cli->cl_target_uuid)) == 0)
+                        return obd;
+        }
+
+        return NULL;
+}
+
 int client_obd_setup(struct obd_device *obddev, obd_count len, void *buf)
 {
         struct obd_ioctl_data* data = buf;
index d2dc23c..fe5aad4 100644 (file)
@@ -158,7 +158,7 @@ static int lov_connect(struct lustre_handle *conn, struct obd_device *obd,
                 memcpy(lov->tgts[i].uuid, uuidarray[i], sizeof(*uuidarray));
 
         for (i = 0; i < desc->ld_tgt_count; i++) {
-                struct obd_device *tgt = class_uuid2obd(uuidarray[i]);
+                struct obd_device *tgt = client_tgtuuid2obd(uuidarray[i]);
                 int rc2;
 
                 if (!tgt) {
index daeccf1..c856d10 100644 (file)
@@ -646,6 +646,7 @@ static int mdc_recover(struct obd_import *imp, int phase)
                                        NULL, LDLM_FL_LOCAL_ONLY);
                 RETURN(0);
             case PTLRPC_RECOVD_PHASE_RECOVER:
+        reconnect:
                 rc = ptlrpc_reconnect_import(imp, MDS_CONNECT);
                 if (rc == EALREADY)
                         RETURN(ptlrpc_replay(imp, 0));
@@ -671,6 +672,12 @@ static int mdc_recover(struct obd_import *imp, int phase)
                         RETURN(rc);
 
                 RETURN(0);
+
+            case PTLRPC_RECOVD_PHASE_NOTCONN:
+                ldlm_namespace_cleanup(imp->imp_obd->obd_namespace, 1);
+                ptlrpc_abort_inflight(imp);
+                goto reconnect;
+
             default:
                 RETURN(-EINVAL);
         }
index 30aa36d..1e2f72e 100644 (file)
@@ -958,10 +958,11 @@ static int osc_recover(struct obd_import *imp, int phase)
         ENTRY;
 
         switch(phase) {
+
             case PTLRPC_RECOVD_PHASE_PREPARE: {
                 struct ldlm_namespace *ns = imp->imp_obd->obd_namespace;
                 ldlm_namespace_cleanup(ns, 1 /* no network ops */);
-                abort_inflight_for_import(imp);
+                ptlrpc_abort_inflight(imp);
                 set_osc_active(imp, 0 /* inactive */);
                 RETURN(0);
             }
@@ -981,6 +982,10 @@ static int osc_recover(struct obd_import *imp, int phase)
                 set_osc_active(imp, 1 /* active */);
                 RETURN(0);
 
+            case PTLRPC_RECOVD_PHASE_NOTCONN:
+                osc_recover(imp, PTLRPC_RECOVD_PHASE_PREPARE);
+                RETURN(osc_recover(imp, PTLRPC_RECOVD_PHASE_RECOVER));
+
             default:
                 RETURN(-EINVAL);
         }
index b909b75..ccaa108 100644 (file)
@@ -620,9 +620,9 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
                NTOH__u32(req->rq_reqmsg->status), req->rq_xid,
                conn->c_peer.peer_nid, NTOH__u32(req->rq_reqmsg->opc));
 
+        spin_lock(&imp->imp_lock);
+        EIO_IF_INVALID(req);
         if (req->rq_level > imp->imp_level) {
-                spin_lock(&imp->imp_lock);
-                EIO_IF_INVALID(req);
                 list_del(&req->rq_list);
                 list_add_tail(&req->rq_list, &imp->imp_delayed_list);
                 spin_unlock(&imp->imp_lock);
@@ -647,9 +647,6 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
                 CERROR("process %d resumed\n", current->pid);
         }
  resend:
-        req->rq_timeout = obd_timeout;
-        spin_lock(&imp->imp_lock);
-        EIO_IF_INVALID(req);
 
         LASSERT(list_empty(&req->rq_list));
         list_add_tail(&req->rq_list, &imp->imp_sending_list);
@@ -716,11 +713,28 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req)
                 GOTO(out, rc = -EINVAL);
         }
 #endif
-        CDEBUG(D_NET, "got rep "LPU64"\n", req->rq_xid);
-        if (req->rq_repmsg->status == 0)
-                CDEBUG(D_NET, "--> buf %p len %d status %d\n", req->rq_repmsg,
-                       req->rq_replen, req->rq_repmsg->status);
+        DEBUG_REQ(D_NET, req, "status %d\n", req->rq_repmsg->status);
 
+        /* We're a rejected connection, need to invalidate and rebuild. */
+        if (req->rq_repmsg->status == -ENOTCONN) {
+                spin_lock(&imp->imp_lock);
+                /* If someone else is reconnecting us (CONN_RECOVD) or has
+                 * already completed it (handle mismatch), then we just need
+                 * to get out.
+                 */
+                if (imp->imp_level == LUSTRE_CONN_RECOVD ||
+                    imp->imp_handle.addr != req->rq_reqmsg->addr ||
+                    imp->imp_handle.cookie != req->rq_reqmsg->cookie) {
+                        spin_unlock(&imp->imp_lock);
+                        GOTO(out, rc = -EIO);
+                }
+                imp->imp_level = LUSTRE_CONN_RECOVD;
+                spin_unlock(&imp->imp_lock);
+                rc = imp->imp_recover(imp, PTLRPC_RECOVD_PHASE_NOTCONN);
+                if (rc)
+                        LBUG();
+                GOTO(out, rc = -EIO);
+        }
 
         if (req->rq_import->imp_flags & IMP_REPLAYABLE) {
                 spin_lock(&imp->imp_lock);
@@ -819,3 +833,35 @@ int ptlrpc_replay_req(struct ptlrpc_request *req)
         req->rq_level = old_level;
         RETURN(rc);
 }
+
+/* XXX looks a lot like super.c:invalidate_request_list, don't it? */
+/* Invalidate @imp and abort every request currently queued on it.
+ *
+ * Sets IMP_INVALID in imp_flags, then flags each request found on
+ * imp_sending_list and imp_delayed_list with PTL_RPC_FL_ERR and wakes
+ * whoever waits on its rq_wait_for_rep.  Requests are not unlinked here.
+ */
+void ptlrpc_abort_inflight(struct obd_import *imp)
+{
+        struct list_head *tmp;
+
+        /* Make sure that no new requests get processed for this import.
+         * ptlrpc_queue_wait must (and does) hold imp_lock while testing this
+         * flag and then putting requests on sending_list or delayed_list.
+         *
+         * The lock stays held for both walks below: requests are moved
+         * between these lists under imp_lock, so walking them unlocked
+         * could follow an rq_list link that is being changed under us.
+         */
+        spin_lock(&imp->imp_lock);
+        imp->imp_flags |= IMP_INVALID;
+
+        /* Nothing is unlinked inside the loops, so the plain iterator is
+         * enough. */
+        list_for_each(tmp, &imp->imp_sending_list) {
+                struct ptlrpc_request *req =
+                        list_entry(tmp, struct ptlrpc_request, rq_list);
+
+                DEBUG_REQ(D_HA, req, "inflight");
+                req->rq_flags |= PTL_RPC_FL_ERR;
+                wake_up(&req->rq_wait_for_rep);
+        }
+
+        list_for_each(tmp, &imp->imp_delayed_list) {
+                struct ptlrpc_request *req =
+                        list_entry(tmp, struct ptlrpc_request, rq_list);
+
+                DEBUG_REQ(D_HA, req, "aborting waiting req");
+                req->rq_flags |= PTL_RPC_FL_ERR;
+                wake_up(&req->rq_wait_for_rep);
+        }
+
+        spin_unlock(&imp->imp_lock);
+}
index eb6acb1..1384b5d 100644 (file)
@@ -255,6 +255,7 @@ EXPORT_SYMBOL(ptlrpc_free_bulk);
 EXPORT_SYMBOL(ptlrpc_prep_bulk_page);
 EXPORT_SYMBOL(ptlrpc_free_bulk_page);
 EXPORT_SYMBOL(ll_brw_sync_wait);
+EXPORT_SYMBOL(ptlrpc_abort_inflight);
 
 /* service.c */
 EXPORT_SYMBOL(ptlrpc_init_svc);
index 7d369f4..f680f4b 100755 (executable)
@@ -2,7 +2,7 @@
 
 config=${1:-local.xml}
 
-LMC=${LMC:-../utils/lmc}
+LMC="${LMC:-../utils/lmc} -m $config"
 TMP=${TMP:-/tmp}
 
 MDSDEV=$TMP/mds1
@@ -22,15 +22,17 @@ case $kver in
 esac
 
 
+rm -f $config
+
 # create nodes
-${LMC} -o $config --add node --node localhost || exit 10
-${LMC} -o $config --add net --node  localhost --nid localhost --nettype tcp || exit 11
+${LMC} --add node --node localhost || exit 10
+${LMC} --add net --node  localhost --nid localhost --nettype tcp || exit 11
 
 # configure mds server
-${LMC} -m $config --add mds  --node localhost --mds mds1 --dev $MDSDEV --size $MDSSIZE || exit 20
+${LMC} --add mds  --node localhost --mds mds1 --dev $MDSDEV --size $MDSSIZE || exit 20
 
 # configure ost
-${LMC} -m $config --add ost --node localhost --obd obd1 --dev $OSTDEV --size  $OSTSIZE || exit 30
+${LMC} --add ost --node localhost --obd obd1 --dev $OSTDEV --size  $OSTSIZE || exit 30
 
 # create client config
-${LMC} -m $config --add mtpt --node localhost --path /mnt/lustre --mds mds1 --obd obd1 || exit 40
+${LMC} --add mtpt --node localhost --path /mnt/lustre --mds mds1 --obd obd1 || exit 40
diff --git a/lustre/tests/lov.xml b/lustre/tests/lov.xml
new file mode 100644 (file)
index 0000000..532c1ec
--- /dev/null
@@ -0,0 +1,70 @@
+<?xml version='1.0' encoding='UTF-8'?>
+<!DOCTYPE lustre>
+<lustre>
+  <ldlm name='ldlm' uuid='ldlm_UUID'/>
+  <node name='localhost' uuid='localhost_UUID'>
+    <profile>
+      <ldlm_ref uuidref='ldlm_UUID'/>
+      <network_ref uuidref='NET_localhost_tcp_UUID'/>
+      <mds_ref uuidref='mds1_UUID'/>
+      <lovconfig_ref uuidref='LVCFG_lov1_UUID'/>
+      <obd_ref uuidref='OBD_localhost_UUID'/>
+      <ost_ref uuidref='OST_localhost_UUID'/>
+      <obd_ref uuidref='OBD_localhost_2_UUID'/>
+      <ost_ref uuidref='OST_localhost_2_UUID'/>
+      <mountpoint_ref uuidref='MNT_localhost_UUID'/>
+    </profile>
+    <network name='NET_localhost_tcp' uuid='NET_localhost_tcp_UUID' type='tcp'>
+      <server>localhost</server>
+      <port>988</port>
+    </network>
+  </node>
+  <mds name='mds1' uuid='mds1_UUID'>
+    <fstype>extN</fstype>
+    <device size='50000'>/tmp/mds1</device>
+    <autoformat>yes</autoformat>
+    <network_ref uuidref='NET_localhost_tcp_UUID'/>
+    <node_ref uuidref='localhost_UUID'/>
+  </mds>
+  <lov name='lov1' uuid='lov1_UUID'>
+    <mds_ref uuidref='mds1_UUID'/>
+    <devices stripecount='0' stripesize='65536' pattern='0'>
+      <osc_ref uuidref='OSC_localhost_UUID'/>
+      <osc_ref uuidref='OSC_localhost_2_UUID'/>
+    </devices>
+  </lov>
+  <lovconfig name='LVCFG_lov1' uuid='LVCFG_lov1_UUID'>
+    <lov_ref uuidref='lov1_UUID'/>
+  </lovconfig>
+  <obd uuid='OBD_localhost_UUID' name='OBD_localhost' type='obdfilter'>
+    <fstype>extN</fstype>
+    <device size='100000'>/tmp/ost1</device>
+    <autoformat>no</autoformat>
+  </obd>
+  <osc name='OSC_localhost' uuid='OSC_localhost_UUID'>
+    <ost_ref uuidref='OST_localhost_UUID'/>
+    <obd_ref uuidref='OBD_localhost_UUID'/>
+  </osc>
+  <ost name='OST_localhost' uuid='OST_localhost_UUID'>
+    <network_ref uuidref='NET_localhost_tcp_UUID'/>
+    <obd_ref uuidref='OBD_localhost_UUID'/>
+  </ost>
+  <obd name='OBD_localhost_2' uuid='OBD_localhost_2_UUID' type='obdfilter'>
+    <fstype>extN</fstype>
+    <device size='100000'>/tmp/ost2</device>
+    <autoformat>no</autoformat>
+  </obd>
+  <osc name='OSC_localhost_2' uuid='OSC_localhost_2_UUID'>
+    <ost_ref uuidref='OST_localhost_2_UUID'/>
+    <obd_ref uuidref='OBD_localhost_2_UUID'/>
+  </osc>
+  <ost name='OST_localhost_2' uuid='OST_localhost_2_UUID'>
+    <network_ref uuidref='NET_localhost_tcp_UUID'/>
+    <obd_ref uuidref='OBD_localhost_2_UUID'/>
+  </ost>
+  <mountpoint name='MNT_localhost' uuid='MNT_localhost_UUID'>
+    <mds_ref uuidref='mds1_UUID'/>
+    <osc_ref uuidref='lov1_UUID'/>
+    <path>/mnt/lustre</path>
+  </mountpoint>
+</lustre>
index 0f39037..170c5d0 100755 (executable)
@@ -112,6 +112,8 @@ class Config:
         self._portals_dir = ''
        self._minlevel = 0
        self._maxlevel = 100
+        self._timeout = -1
+        self._recovery_upcall = ''
 
     def verbose(self, flag = None):
         if flag: self._verbose = flag
@@ -185,6 +187,13 @@ class Config:
         if val: self._lustre_dir = val
         return self._lustre_dir
 
+    def timeout(self, val = None):
+        if val: self._timeout = val
+        return self._timeout
+
+    def recovery_upcall(self, val = None):
+        if val: self._recovery_upcall = val
+        return self._recovery_upcall
 
 config = Config()
 
@@ -481,7 +490,6 @@ def find_prog(cmd):
         syspath.insert(0, os.path.join(cmdpath, config.portals_dir()+'/linux/utils/'))
     for d in syspath:
         prog = os.path.join(d,cmd)
-        debug(prog)
         if os.access(prog, os.X_OK):
             return prog
     return ''
@@ -917,7 +925,7 @@ class LOV(Module):
             self.stripe_sz = get_attr_int(dev_node, 'stripesize', 65536)
             self.stripe_off = get_attr_int(dev_node, 'stripeoffset', 0)
             self.pattern = get_attr_int(dev_node, 'pattern', 0)
-            self.devlist = get_all_refs(dev_node, 'osc')
+            self.devlist = get_all_refs(dev_node, 'obd')
             self.stripe_cnt = get_attr_int(dev_node, 'stripecount', len(self.devlist))
         self.add_lustre_module('mdc', 'mdc')
         self.add_lustre_module('lov', 'lov')
@@ -925,14 +933,14 @@ class LOV(Module):
     def prepare(self):
         if is_prepared(self.uuid):
             return
-        for osc_uuid in self.devlist:
-            osc = lookup(self.dom_node.parentNode, osc_uuid)
+        for obd_uuid in self.devlist:
+            obd = lookup(self.dom_node.parentNode, obd_uuid)
+            osc = get_osc(obd)
             if osc:
-                n = OSC(osc)
                 try:
                     # Ignore connection failures, because the LOV will DTRT with
                     # an unconnected OSC.
-                    n.prepare(ignore_connect_failure=1)
+                    osc.prepare(ignore_connect_failure=1)
                 except CommandError:
                     print "Error preparing OSC %s (inactive)\n" % osc_uuid
             else:
@@ -946,11 +954,11 @@ class LOV(Module):
     def cleanup(self):
         if not is_prepared(self.uuid):
             return
-        for osc_uuid in self.devlist:
-            osc = lookup(self.dom_node.parentNode, osc_uuid)
+        for obd_uuid in self.devlist:
+            obd = lookup(self.dom_node.parentNode, obd_uuid)
+            osc = get_osc(obd)
             if osc:
-                n = OSC(osc)
-                n.cleanup()
+                osc.cleanup()
             else:
                 panic('osc not found:', osc_uuid)
         Module.cleanup(self)
@@ -958,11 +966,11 @@ class LOV(Module):
 
 
     def load_module(self):
-        for osc_uuid in self.devlist:
-            osc = lookup(self.dom_node.parentNode, osc_uuid)
+        for obd_uuid in self.devlist:
+            obd = lookup(self.dom_node.parentNode, obd_uuid)
+            osc = get_osc(obd)
             if osc:
-                n = OSC(osc)
-                n.load_module()
+                osc.load_module()
                 break
             else:
                 panic('osc not found:', osc_uuid)
@@ -971,11 +979,11 @@ class LOV(Module):
 
     def cleanup_module(self):
         Module.cleanup_module(self)
-        for osc_uuid in self.devlist:
-            osc = lookup(self.dom_node.parentNode, osc_uuid)
+        for obd_uuid in self.devlist:
+            obd = lookup(self.dom_node.parentNode, obd_uuid)
+            osc = get_osc(obd)
             if osc:
-                n = OSC(osc)
-                n.cleanup_module()
+                osc.cleanup_module()
                 break
             else:
                 panic('osc not found:', osc_uuid)
@@ -1069,6 +1077,7 @@ class OBD(Module):
         self.obdtype = get_attr(dom_node, 'type')
         self.devname, self.size = get_device(dom_node)
         self.fstype = get_text(dom_node, 'fstype')
+        self.active_target = get_text(dom_node, 'active_target')
         # FIXME: if fstype not set, then determine based on kernel version
         self.format = get_text(dom_node, 'autoformat', 'yes')
         if self.fstype == 'extN':
@@ -1135,7 +1144,9 @@ class VOSC(Module):
         if dom_node.nodeName == 'lov':
             self.osc = LOV(dom_node)
         else:
-            self.osc = OSC(dom_node)
+            self.osc = get_osc(dom_node)
+    def get_uuid(self):
+        return self.osc.uuid
     def prepare(self):
         self.osc.prepare()
     def cleanup(self):
@@ -1147,10 +1158,17 @@ class VOSC(Module):
         
 
 class OSC(Module):
-    def __init__(self,dom_node):
-        Module.__init__(self, 'OSC', dom_node)
-        self.obd_uuid = get_first_ref(dom_node, 'obd')
-        self.ost_uuid = get_first_ref(dom_node, 'ost')
+    def __init__(self, dom_node, obd_name, obd_uuid, ost_uuid):
+        self.dom_node = dom_node
+        self.module_name = 'OSC'
+        self.name = 'OSC_%s' % (obd_name)
+        self.uuid = '%s_%05x' % (self.name, int(random.random() * 1048576))
+        self.kmodule_list = []
+        self._server = None
+        self._connected = 0
+
+        self.obd_uuid = obd_uuid
+        self.ost_uuid = ost_uuid
         self.lookup_server(self.ost_uuid)
         self.add_lustre_module('osc', 'osc')
 
@@ -1176,8 +1194,6 @@ class OSC(Module):
                     setup ="%s %s" %(self.obd_uuid, srv.uuid))
 
     def cleanup(self):
-        if not is_prepared(self.uuid):
-            return
         srv = self.get_server()
         if local_net(srv):
             Module.cleanup(self)
@@ -1198,18 +1214,18 @@ class ECHO_CLIENT(Module):
     def __init__(self,dom_node):
         Module.__init__(self, 'ECHO_CLIENT', dom_node)
         self.add_lustre_module('obdecho', 'obdecho')
-        self.lov_uuid = get_first_ref(dom_node, 'osc')
-        l = lookup(self.dom_node.parentNode, self.lov_uuid)
-        self.osc = VOSC(l)
+        self.obd_uuid = get_first_ref(dom_node, 'obd')
+        obd = lookup(self.dom_node.parentNode, self.obd_uuid)
+        self.osc = VOSC(obd)
 
     def prepare(self):
         if is_prepared(self.uuid):
             return
         self.osc.prepare() # XXX This is so cheating. -p
-        self.info(self.lov_uuid)
+        self.info(self.obd_uuid)
             
         lctl.newdev(attach="echo_client %s %s" % (self.name, self.uuid),
-                    setup = self.lov_uuid)
+                    setup = self.obd_uuid)
 
     def cleanup(self):
         if not is_prepared(self.uuid):
@@ -1229,25 +1245,26 @@ class Mountpoint(Module):
         Module.__init__(self, 'MTPT', dom_node)
         self.path = get_text(dom_node, 'path')
         self.mds_uuid = get_first_ref(dom_node, 'mds')
-        self.lov_uuid = get_first_ref(dom_node, 'osc')
+        self.obd_uuid = get_first_ref(dom_node, 'obd')
         self.add_lustre_module('mdc', 'mdc')
         self.add_lustre_module('llite', 'llite')
-        l = lookup(self.dom_node.parentNode, self.lov_uuid)
-        self.osc = VOSC(l)
+        obd = lookup(self.dom_node.parentNode, self.obd_uuid)
+        self.osc = VOSC(obd)
+
 
     def prepare(self):
         self.osc.prepare()
         mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
-        self.info(self.path, self.mds_uuid, self.lov_uuid)
+        self.info(self.path, self.mds_uuid, self.obd_uuid)
         cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \
-              (self.lov_uuid, mdc_uuid, self.path)
+              (self.osc.get_uuid(), mdc_uuid, self.path)
         run("mkdir", self.path)
         ret, val = run(cmd)
         if ret:
             panic("mount failed:", self.path)
 
     def cleanup(self):
-        self.info(self.path, self.mds_uuid,self.lov_uuid)
+        self.info(self.path, self.mds_uuid,self.obd_uuid)
         if  fs_is_mounted(self.path):
             if config.force():
                 (rc, out) = run("umount", "-f", self.path)
@@ -1259,7 +1276,6 @@ class Mountpoint(Module):
         if fs_is_mounted(self.path):
             panic("fs is still mounted:", self.path)
 
-        l = lookup(self.dom_node.parentNode, self.lov_uuid)
         self.osc.cleanup()
         cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
 
@@ -1273,9 +1289,14 @@ class Mountpoint(Module):
 
 # ============================================================
 # XML processing and query
-# TODO: Change query funcs to use XPath, which is muc cleaner
-#   Or not. Originally both lconf and lmc used XPath, but it was many
-#   orders of magnitute slower, and lmc was unusable. - robert 
+
+# OSC is no longer in the xml, so we have to fake it.
+# this is getting ugly and begging for another refactoring
+def get_osc(obd_dom):
+    obd = OBD(obd_dom)
+    osc = OSC(obd_dom, obd.name, obd.uuid, obd.active_target)
+    return osc
+
 
 def get_device(obd):
     list = obd.getElementsByTagName('device')
@@ -1618,6 +1639,7 @@ def startProfile(lustreNode, profileNode, module_flag):
 # Load profile for 
 def doHost(lustreNode, hosts):
     global routes
+    global router_flag 
     dom_node = None
     for h in hosts:
         dom_node = getByName(lustreNode, h, 'node')
@@ -1627,12 +1649,16 @@ def doHost(lustreNode, hosts):
         print 'No host entry found.'
         return
 
-    if not get_attr(dom_node, 'router'):
+    if get_attr(dom_node, 'router'):
+        router_flag = 1
+    else:
+        router_flag = 0
+    recovery_upcall = get_attr(dom_node, 'recovery_upcall')
+    timeout = get_attr_int(dom_node, 'timeout')
+
+    if not router_flag:
         init_node(dom_node)
         init_route_config(lustreNode)
-    else:
-        global router_flag 
-        router_flag = 1
 
     # Two step process: (1) load modules, (2) setup lustre
     # if not cleaning, load modules first.
@@ -1649,6 +1675,9 @@ def doHost(lustreNode, hosts):
             # dump /tmp/ogdb and sleep/pause here
             log ("The GDB module script is in", script)
             time.sleep(5)
+        sys_set_timeout(timeout)
+        sys_set_recovery_upcall(recovery_upcall)
+            
             
     module_flag = not module_flag
     for profile in reflist:
@@ -1662,7 +1691,8 @@ def parse_cmdline(argv):
     long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb",
                  "portals=", "makeldiff", "cleanup", "noexec",
                  "help", "node=", "nomod", "nosetup",
-                 "dump=", "force", "minlevel=", "maxlevel="]
+                 "dump=", "force", "minlevel=", "maxlevel=",
+                 "timeout=", "recovery_upcall="]
     opts = []
     args = []
 
@@ -1704,6 +1734,10 @@ def parse_cmdline(argv):
                config.minlevel(a)
         if o in ("--maxlevel",):
                 config.maxlevel(a)
+        if o in ("--timeout",):
+                config.timeout(a)
+        if o in ("--recovery_upcall",):
+                config.recovery_upcall(a)
     return args
 
 def fetch(url):
@@ -1734,19 +1768,43 @@ def setupModulePath(cmd, portals_dir = PORTALS_DIR):
         dir = os.path.join(config.lustre_dir(), dir)
         config.portals_dir(dir)
 
-def sys_set_debug_path():
-    debug("debug path: ", config.debug_path())
+def sysctl(path, val):
     if config.noexec():
         return
     try:
-        fp = open('/proc/sys/portals/debug_path', 'w')
-        fp.write(config.debug_path())
+        fp = open(os.path.join('/proc/sys', path), 'w')
+        fp.write(str(val))
         fp.close()
     except IOError, e:
         print e
-             
-#/proc/sys/net/core/rmem_max
-#/proc/sys/net/core/wmem_max
+
+
+def sys_set_debug_path():
+    debug("debug path: ", config.debug_path())
+    sysctl('portals/debug_path', config.debug_path())
+
+def sys_set_recovery_upcall(upcall):
+    # the command overrides the value in the node config
+    if config.recovery_upcall():
+        upcall = config.recovery_upcall()
+    if upcall:
+        debug("setting recovery_upcall:", upcall)
+        sysctl('lustre/recovery_upcall', upcall)
+
+def sys_set_timeout(timeout):
+    # the command overrides the value in the node config
+    if config.timeout() >= 0:
+        timeout = config.timeout()
+    if timeout >= 0:
+        debug("setting timeout:", timeout)
+        sysctl('lustre/timeout', timeout)
+
+def sys_set_ptldebug(ptldebug):
+    # the command overrides the value in the node config
+    if config.ptldebug():
+        ptldebug = config.ptldebug()
+    sysctl('portals/debug', ptldebug)
+
 def sys_set_netmem_max(path, max):
     debug("setting", path, "to at least", max)
     if config.noexec():
index b4f92ea..3ea5265 100755 (executable)
@@ -32,7 +32,50 @@ from xml.dom.ext import PrettyPrint
 DEFAULT_PORT = 988 
 
 def usage():
-    print """usage: lmc --add object [object parameters]"""
+    print """usage: lmc --add object [object parameters]
+
+Object creation command summary:
+
+--add node
+  --node node_name
+  --timeout num
+  --recovery_upcall path
+
+--add net
+  --node node_name
+  --nid addr
+  --nettype tcp|elan|toe|gm
+  --port port
+  --tcpbuf size
+  --router
+
+--add mds
+  --node node_name
+  --mds mds_name
+  --dev path
+  --size size
+
+--add lov
+  --lov lov_name
+  --mds mds_name
+  --stripe_sz num
+  --stripe_cnt num
+  --stripe_pattern num
+
+-add ost
+  --node node_name
+  --obd obd_name 
+  --lov lov_name 
+  --dev path
+  --size size
+  --obduuid uuid
+  
+--add mtpt  - Mountpoint
+  --node node_name
+  --path /mnt/point
+  --mds mds_name
+  --obd obd_name OR --lov lovname
+"""
     sys.exit(1)
 
 def error(*args):
@@ -167,9 +210,10 @@ class GenConfig:
         ldlm = self.newService("ldlm", name, uuid)
         return ldlm
 
-    def obd(self, name, uuid, fs, obdtype, devname, format, dev_size=0):
+    def obd(self, name, uuid, fs, obdtype, devname, format, ost_uuid, dev_size=0):
         obd = self.newService("obd", name, uuid)
         obd.setAttribute('type', obdtype)
+        self.addElement(obd, 'active_target', ost_uuid)
         if fs:
             self.addElement(obd, "fstype", fs)
         if devname:
@@ -179,18 +223,18 @@ class GenConfig:
             self.addElement(obd, "autoformat", format)
         return obd
 
+#    def osc(self, name, uuid, obd_uuid, net_uuid):
+#        osc = self.newService("osc", name, uuid)
+#        osc.appendChild(self.ref("ost", net_uuid))
+#        osc.appendChild(self.ref("obd", obd_uuid))
+#        return osc
+
     def cobd(self, name, uuid, real_uuid, cache_uuid):
         cobd = self.newService("cobd", name, uuid)
         cobd.appendChild(self.ref("real_obd",real_uuid))
         cobd.appendChild(self.ref("cache_obd",cache_uuid))
         return cobd
 
-    def osc(self, name, uuid, obd_uuid, net_uuid):
-        osc = self.newService("osc", name, uuid)
-        osc.appendChild(self.ref("ost", net_uuid))
-        osc.appendChild(self.ref("obd", obd_uuid))
-        return osc
-
     def ost(self, name, uuid, obd_uuid, net_uuid):
         ost = self.newService("ost", name, uuid)
         ost.appendChild(self.ref("network", net_uuid))
@@ -228,13 +272,13 @@ class GenConfig:
     def mountpoint(self, name, uuid, mds_uuid, osc_uuid, path):
         mtpt = self.newService("mountpoint", name, uuid)
         mtpt.appendChild(self.ref("mds", mds_uuid))
-        mtpt.appendChild(self.ref("osc", osc_uuid))
+        mtpt.appendChild(self.ref("obd", osc_uuid))
         self.addElement(mtpt, "path", path)
         return mtpt
 
     def echo_client(self, name, uuid, osc_uuid):
         ec = self.newService("echo_client", name, uuid)
-        ec.appendChild(self.ref("osc", osc_uuid))
+        ec.appendChild(self.ref("obd", osc_uuid))
         return ec
 
 ############################################################
@@ -308,10 +352,10 @@ def get_net_uuid(lustre, node_name):
     return None
 
 
-def lov_add_osc(gen, lov, osc_uuid):
+def lov_add_obd(gen, lov, osc_uuid):
     devs = lov.getElementsByTagName('devices')
     if len(devs) == 1:
-        devs[0].appendChild(gen.ref("osc", osc_uuid))
+        devs[0].appendChild(gen.ref("obd", osc_uuid))
     else:
         error("No devices element found for LOV:", lov)
 
@@ -335,8 +379,12 @@ def do_add_node(gen, lustre,  options, node_name):
     uuid = new_uuid(node_name)
     node = gen.node(node_name, uuid)
     node_add_profile(gen, node, 'ldlm', ldlm_uuid)
-    if options.has_key('router'):
+    if has_option(options, 'router'):
         node.setAttribute('router', '1')
+    if has_option(options, 'timeout'):
+        node.setAttribute('timeout', get_option(options, 'timeout'))
+    if has_option(options, 'recovery_upcall'):
+        node.setAttribute('recovery_upcall', get_option(options, 'recovery_upcall'))
     lustre.appendChild(node)
     return node
 
@@ -446,7 +494,6 @@ def add_ost(gen, lustre, options):
         
     obdname = get_option(options, 'obd', 'OBD_'+ node_name)
     obdname = new_name(obdname)
-    oscname = new_name('OSC_'+ obdname)
     ostname = new_name('OST_'+ obdname)
     if options.has_key('obduuid'):
         obd_uuid = options['obduuid']
@@ -456,28 +503,26 @@ def add_ost(gen, lustre, options):
     else:
         obd_uuid = new_uuid(obdname)
     ost_uuid = new_uuid(ostname)
-    osc_uuid = new_uuid(oscname)
 
     net_uuid = get_net_uuid(lustre, node_name)
     if not net_uuid:
         error("NODE: ", node_name, "not found")
     
-    obd = gen.obd(obdname, obd_uuid, fstype, obdtype, devname, get_format_flag(options), size)
+    obd = gen.obd(obdname, obd_uuid, fstype, obdtype, devname, get_format_flag(options), ost_uuid,
+                  size)
     ost = gen.ost(ostname, ost_uuid, obd_uuid, net_uuid)
-    osc = gen.osc(oscname, osc_uuid, obd_uuid, ost_uuid)
     
     if lovname:
         lov = findByName(lustre, lovname, "lov")
         if not lov:
             error('add_ost:', '"'+lovname+'"', "lov element not found.")
-        lov_add_osc(gen, lov, osc_uuid)
+        lov_add_obd(gen, lov, obd_uuid)
 
     node = findByName(lustre, node_name, "node")
     node_add_profile(gen, node, 'obd', obd_uuid)
     node_add_profile(gen, node, 'ost', ost_uuid)
 
     lustre.appendChild(obd)
-    lustre.appendChild(osc)
     lustre.appendChild(ost)
 
                    
@@ -488,12 +533,9 @@ def add_cobd(gen, lustre, options):
 
     real_name = get_option(options, 'real_obd')
     cache_name = get_option(options, 'cache_obd')
-    # temp hack until merged with b_recover and OSC is removed
-    real_name = 'OSC_' + real_name
-    cache_name = 'OSC_' + cache_name
     
-    real_uuid = name2uuid(lustre, real_name, tag='osc')
-    cache_uuid = name2uuid(lustre, cache_name, tag='osc')
+    real_uuid = name2uuid(lustre, real_name, tag='obd')
+    cache_uuid = name2uuid(lustre, cache_name, tag='obd')
 
     node = findByName(lustre, node_name, "node")
     node_add_profile(gen, node, "cobd", uuid)
@@ -514,9 +556,7 @@ def add_echo_client(gen, lustre, options):
 
     lov_uuid = name2uuid(lustre, lov_name, tag='lov', fatal=0)
     if not lov_uuid:
-        # remove this hack when the osc uuids are removed
-        lov_name = 'OSC_' + lov_name
-        lov_uuid = name2uuid(lustre, lov_name, tag='osc', fatal=1)
+        lov_uuid = name2uuid(lustre, lov_name, tag='obd', fatal=1)
 
     echo = gen.echo_client(echoname, echo_uuid, lov_uuid)
     lustre.appendChild(echo)
@@ -574,9 +614,7 @@ def add_mtpt(gen, lustre, options):
     mds_uuid = name2uuid(lustre, mds_name, tag='mds')
     lov_uuid = name2uuid(lustre, lov_name, tag='lov', fatal=0)
     if not lov_uuid:
-        # remove this hack when OSC is removed
-        lov_name = 'OSC_' + lov_name
-        lov_uuid = name2uuid(lustre, lov_name, tag='osc', fatal=1)
+        lov_uuid = name2uuid(lustre, lov_name, tag='obd', fatal=1)
 
     uuid = new_uuid(name)
     mtpt = gen.mountpoint(name, uuid, mds_uuid, lov_uuid, path)
@@ -604,6 +642,12 @@ class OptionError (exceptions.Exception):
     def __init__(self, args):
         self.args = args
 
+def has_option(options, tag):
+    """Look for tag in options hash and return the true if set"""
+    if options.has_key(tag):
+        return 1
+    return 0
+
 def get_option(options, tag, default = None):
     """Look for tag in options hash and return the value if set. If not
     set, then if return default it is set, otherwise exception."""
@@ -627,7 +671,8 @@ def parse_cmdline(argv):
                  "mds=", "route", "router", "merge=", "format", "reformat", "output=",
                  "dev=", "size=", "obd=", "obdtype=", "obduuid=", "in=",
                  "path=", "help", "batch=", "lov=", "gw=", "lo=", "hi=",
-                 "oscref", "osc=", "real_obd=", "cache_obd=", "fstype="]
+                 "oscref", "osc=", "real_obd=", "cache_obd=", "fstype=",
+                 "timeout=", "recovery_upcall="]
     opts = []
     args = []
     options = {}
@@ -652,6 +697,14 @@ def parse_cmdline(argv):
         if o == "--obd":
             options['obd'] = a
 
+        # node options
+        if o == "--timeout":
+            options['timeout'] = a
+        if o == "--recovery_upcall":
+            options['recovery_upcall'] = a
+        if o == "--router":
+            options['router'] = 1
+        
         # network options
         if o == "--nid":
             options['nid'] = a
@@ -667,8 +720,6 @@ def parse_cmdline(argv):
             options['mtpt'] = 1
         if o == "--route":
             options['route'] = 1
-        if o == "--router":
-            options['router'] = 1
 
         # ost options
         if o == "--dev":