Whamcloud - gitweb
Allow the default stripe_count to be specified in the XML config, and also
[fs/lustre-release.git] / lustre / utils / lconf
index 5449d55..9138a4d 100755 (executable)
@@ -33,6 +33,7 @@ import xml.dom.minidom
 # Global parameters
 TCP_ACCEPTOR = ''
 MAXTCPBUF = 1048576
+DEFAULT_TCPBUF = 1048576
 #
 # Maximum number of devices to search for.
 # (the /dev/loop* nodes need to be created beforehand)
@@ -45,7 +46,7 @@ def usage():
 config.xml          Lustre configuration in xml format.
 --get <url>         URL to fetch a config file
 --node <nodename>   Load config for <nodename>
--d | --cleanup     Cleans up config. (Shutdown)
+-d | --cleanup      Cleans up config. (Shutdown)
 -v | --verbose      Print system commands as they are run
 -h | --help         Print this help 
 --gdb               Prints message after creating gdb module script
@@ -55,11 +56,11 @@ config.xml          Lustre configuration in xml format.
                     config file is doing what it should be doing. (Implies -v)
 --nomod             Skip load/unload module step.
 --nosetup           Skip device setup/cleanup step.
+--reformat          Reformat all devices (without question)
 """
     TODO = """
---ldap server      LDAP server with lustre config database
+--ldap server       LDAP server with lustre config database
 --makeldiff         Translate xml source to LDIFF 
---reformat         Reformat all devices (will confirm)
 This are perhaps not needed:
 --lustre="src dir"  Base directory of lustre sources. Used to search
                     for modules.
@@ -85,6 +86,7 @@ class Config:
         self._url = None
         self._gdb_script = '/tmp/ogdb'
         self._debug_path = '/tmp/lustre-log'
+        self._dump_file = None
         self._src_dir = None
 
     def verbose(self, flag = None):
@@ -136,8 +138,12 @@ class Config:
             return self._debug_path
 
     def src_dir(self, val = None):
-        if val: self._url = val
-        return self._url
+        if val: self._src_dir = val
+        return self._src_dir
+
+    def dump_file(self, val = None):
+        if val: self._dump_file = val
+        return self._dump_file
 
 config = Config()
 
@@ -301,6 +307,15 @@ class LCTLInterface:
   quit """ % (net, uuid, tgt, gw, tgt)
         self.run(cmds)
 
+    # delete a route to a host
+    def del_route_host(self, net, uuid, gw, tgt):
+        cmds =  """
+  network %s
+  del_uuid %s
+  del_route %s
+  quit  """ % (net, uuid, tgt)
+        self.run(cmds)
+
     # disconnect one connection
     def disconnect(self, net, nid, port, servuuid):
         cmds =  """
@@ -310,12 +325,12 @@ class LCTLInterface:
   quit""" % (net, nid, servuuid)
         self.run(cmds)
 
-    # disconnect all connections
+    # disconnect all
     def disconnectAll(self, net):
         cmds =  """
   network %s
-  disconnect
   del_uuid self
+  disconnect
   quit""" % (net)
         self.run(cmds)
 
@@ -346,6 +361,13 @@ class LCTLInterface:
   quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
         self.run(cmds)
 
+    # dump kernel debug data to dump_file (debug_kernel)
+    def dump(self, dump_file):
+        cmds = """
+  debug_kernel %s 1
+  quit""" % (dump_file)
+        self.run(cmds)
+
 # ============================================================
 # Various system-level functions
 # (ideally moved to their own module)
@@ -475,7 +497,7 @@ def init_loop(file, size, fstype):
     if dev:
         print 'WARNING file:', file, 'already mapped to', dev
         return dev
-    if not os.access(file, os.R_OK | os.W_OK):
+    if config.reformat()  or not os.access(file, os.R_OK | os.W_OK):
         run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size,  file))
     loop = loop_base()
     # find next free loop
@@ -521,13 +543,27 @@ def block_dev(dev, size, fstype, format):
         
     return dev
 
-def get_local_address(net_type):
+def if2addr(iface):
+    """lookup IP address for an interface"""
+    rc, out = run("/sbin/ifconfig", iface)
+    if rc or not out:
+       return None
+    addr = string.split(out[1])[1]
+    ip = string.split(addr, ':')[1]
+    return ip
+
+def get_local_address(net_type, wildcard):
     """Return the local address for the network type."""
     local = ""
     if net_type == 'tcp':
-        # host `hostname`
-        host = socket.gethostname()
-        local = socket.gethostbyname(host)
+        if  ':' in wildcard:
+            iface, star = string.split(wildcard, ':')
+            local = if2addr(iface)
+            if not local:
+                panic ("unable to determine ip for:", wildcard)
+        else:
+            host = socket.gethostname()
+            local = socket.gethostbyname(host)
     elif net_type == 'elan':
         # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
         try:
@@ -571,6 +607,8 @@ class Module:
     def lookup_server(self, srv_uuid):
         """ Lookup a server's network information """
         net = get_ost_net(self.dom_node.parentNode, srv_uuid)
+        if not net:
+            panic ("Unable to find a server for:", srv_uuid)
         self._server = Network(net)
 
     def get_server(self):
@@ -629,9 +667,12 @@ class Module:
         """Unload the modules in the list in reverse order."""
         rev = self.kmodule_list
         rev.reverse()
-        for mod in rev:
+        for dev_dir, mod in rev:
             if not self.mod_loaded(mod):
                 continue
+            # debug hack
+            if mod == 'portals' and config.dump_file():
+                lctl.dump(config.dump_file())
             log('unloading module:', mod)
             if config.noexec():
                 continue
@@ -647,16 +688,17 @@ class Network(Module):
         self.net_type = get_attr(dom_node,'type')
         self.nid = get_text(dom_node, 'server', '*')
         self.port = get_text_int(dom_node, 'port', 0)
-        self.send_mem = get_text_int(dom_node, 'send_mem', 65536)
-        self.recv_mem = get_text_int(dom_node, 'recv_mem', 65536)
-        if self.nid == '*':
-            self.nid = get_local_address(self.net_type)
+        self.send_mem = get_text_int(dom_node, 'send_mem', DEFAULT_TCPBUF)
+        self.recv_mem = get_text_int(dom_node, 'recv_mem', DEFAULT_TCPBUF)
+        if '*' in self.nid:
+            self.nid = get_local_address(self.net_type, self.nid)
             if not self.nid:
-                panic("unable to set nid for", self.net_type)
+                panic("unable to set nid for", self.net_type, self.nid)
+            debug("nid:", self.nid)
 
         self.add_module('portals/linux/oslib/', 'portals')
-       if node_needs_router():
-           self.add_module('portals/linux/router', 'kptlrouter')
+        if node_needs_router():
+            self.add_module('portals/linux/router', 'kptlrouter')
         if self.net_type == 'tcp':
             self.add_module('portals/linux/socknal', 'ksocknal')
         if self.net_type == 'elan':
@@ -669,9 +711,9 @@ class Network(Module):
     def prepare(self):
         self.info(self.net_type, self.nid, self.port)
         if self.net_type == 'tcp':
-            ret = run_daemon(TCP_ACCEPTOR, '-s', self.send_mem, '-r', self.recv_mem, self.port)
+            ret, out = run(TCP_ACCEPTOR, '-s', self.send_mem, '-r', self.recv_mem, self.port)
             if ret:
-                raise CommandError(TCP_ACCEPTOR, 'failed', ret)
+                raise CommandError(TCP_ACCEPTOR, out, ret)
         ret = self.dom_node.getElementsByTagName('route_tbl')
         for a in ret:
             for r in a.getElementsByTagName('route'):
@@ -703,14 +745,14 @@ class Network(Module):
                     if not srv:
                         panic("no server for nid", lo)
                     else:
-                       try:
+                        try:
                             lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
-                       except CommandError, e:
-                               print "disconnect failed: ", self.name
-                               e.dump()
-               try:
+                        except CommandError, e:
+                            print "disconnect failed: ", self.name
+                            e.dump()
+                try:
                     lctl.del_route(self.net_type, self.nid, lo, hi)
-               except CommandError, e:
+                except CommandError, e:
                     print "del_route failed: ", self.name
                     e.dump()
               
@@ -740,21 +782,83 @@ class LDLM(Module):
 class LOV(Module):
     def __init__(self,dom_node):
         Module.__init__(self, 'LOV', dom_node)
-        self.stripe_sz = get_attr_int(dom_node, 'stripesize', 65536)
-        self.stripe_off = get_attr_int(dom_node, 'stripeoffset', 0)
-        self.pattern = get_attr_int(dom_node, 'pattern', 0)
-        self.mdsuuid = get_first_ref(dom_node, 'mds')
-        mds= lookup(dom_node.parentNode, self.mdsuuid)
-        self.mdsname = getName(mds)
-        self.devlist = get_all_refs(dom_node, 'osc')
-        self.stripe_cnt = len(self.devlist)
+        self.mds_uuid = get_first_ref(dom_node, 'mds')
+        mds= lookup(dom_node.parentNode, self.mds_uuid)
+        self.mds_name = getName(mds)
+        devs = dom_node.getElementsByTagName('devices')
+        if len(devs) > 0:
+            dev_node = devs[0]
+            self.stripe_sz = get_attr_int(dev_node, 'stripesize', 65536)
+            self.stripe_off = get_attr_int(dev_node, 'stripeoffset', 0)
+            self.pattern = get_attr_int(dev_node, 'pattern', 0)
+            self.devlist = get_all_refs(dev_node, 'osc')
+            self.stripe_cnt = get_attr_int(dev_node, 'stripecount', len(self.devlist))
+        self.add_module('lustre/mdc', 'mdc')
+        self.add_module('lustre/lov', 'lov')
 
     def prepare(self):
-        self.info(self.mdsuuid, self.stripe_cnt, self.stripe_sz, self.stripe_off, self.pattern,
-        self.devlist, self.mdsname)
-        lctl.lovconfig(self.uuid, self.mdsname, self.stripe_cnt,
-                       self.stripe_sz, self.stripe_off, self.pattern,
-                       string.join(self.devlist))
+        for osc_uuid in self.devlist:
+            osc = lookup(self.dom_node.parentNode, osc_uuid)
+            if osc:
+                n = OSC(osc)
+                n.prepare()
+            else:
+                panic('osc not found:', osc_uuid)
+        mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
+        self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
+                  self.stripe_off, self.pattern, self.devlist, self.mds_name)
+        lctl.newdev(attach="lov %s %s" % (self.name, self.uuid),
+                    setup ="%s" % (mdc_uuid))
+
+    def cleanup(self):
+        for osc_uuid in self.devlist:
+            osc = lookup(self.dom_node.parentNode, osc_uuid)
+            if osc:
+                n = OSC(osc)
+                n.cleanup()
+            else:
+                panic('osc not found:', osc_uuid)
+        Module.cleanup(self)
+        cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
+    def load_module(self):
+        for osc_uuid in self.devlist:
+            osc = lookup(self.dom_node.parentNode, osc_uuid)
+            if osc:
+                n = OSC(osc)
+                n.load_module()
+                break
+            else:
+                panic('osc not found:', osc_uuid)
+        Module.load_module(self)
+    def cleanup_module(self):
+        Module.cleanup_module(self)
+        for osc_uuid in self.devlist:
+            osc = lookup(self.dom_node.parentNode, osc_uuid)
+            if osc:
+                n = OSC(osc)
+                n.cleanup_module()
+                break
+            else:
+                panic('osc not found:', osc_uuid)
+
+class LOVConfig(Module):
+    def __init__(self,dom_node):
+        Module.__init__(self, 'LOVConfig', dom_node)
+        self.lov_uuid = get_first_ref(dom_node, 'lov')
+        l = lookup(dom_node.parentNode, self.lov_uuid)
+        self.lov = LOV(l)
+        
+    def prepare(self):
+        lov = self.lov
+        self.info(lov.mds_uuid, lov.stripe_cnt, lov.stripe_sz, lov.stripe_off, lov.pattern,
+        lov.devlist, lov.mds_name)
+        lctl.lovconfig(lov.uuid, lov.mds_name, lov.stripe_cnt,
+                       lov.stripe_sz, lov.stripe_off, lov.pattern,
+                       string.join(lov.devlist))
+
+    def cleanup(self):
+        #nothing to do here
+        pass
 
 
 class MDS(Module):
@@ -777,19 +881,30 @@ class MDS(Module):
         Module.cleanup(self)
         clean_loop(self.devname)
 
+# Very unusual case, as there is no MDC element in the XML anymore
+# Builds itself from an MDS node
 class MDC(Module):
     def __init__(self,dom_node):
-        Module.__init__(self, 'MDC', dom_node)
-        self.mds_uuid = get_first_ref(dom_node, 'mds')
-        self.lookup_server(self.mds_uuid)
+        self.mds = MDS(dom_node)
+        self.dom_node = dom_node
+        self.module_name = 'MDC'
+        self.kmodule_list = []
+        self._server = None
+        self._connected = 0
+
+        host = socket.gethostname()
+        self.name = 'MDC_'+host
+        self.uuid = self.name+'_UUID'
+
+        self.lookup_server(self.mds.uuid)
         self.add_module('lustre/mdc', 'mdc')
 
     def prepare(self):
-        self.info(self.mds_uuid)
+        self.info(self.mds.uuid)
         srv = self.get_server()
         lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
         lctl.newdev(attach="mdc %s %s" % (self.name, self.uuid),
-                        setup ="%s %s" %(self.mds_uuid, srv.uuid))
+                        setup ="%s %s" %(self.mds.uuid, srv.uuid))
             
 class OBD(Module):
     def __init__(self, dom_node):
@@ -829,6 +944,25 @@ class OST(Module):
         lctl.newdev(attach="ost %s %s" % (self.name, self.uuid),
                     setup ="%s" % (self.obd_uuid))
 
+
+# virtual interface for OSC and LOV
+class VOSC(Module):
+    def __init__(self,dom_node):
+        Module.__init__(self, 'VOSC', dom_node)
+        if dom_node.nodeName == 'lov':
+            self.osc = LOV(dom_node)
+        else:
+            self.osc = OSC(dom_node)
+    def prepare(self):
+        self.osc.prepare()
+    def cleanup(self):
+        self.osc.cleanup()
+    def load_module(self):
+        self.osc.load_module()
+    def cleanup_module(self):
+        self.osc.cleanup_module()
+        
+
 class OSC(Module):
     def __init__(self,dom_node):
         Module.__init__(self, 'OSC', dom_node)
@@ -852,61 +986,57 @@ class OSC(Module):
         lctl.newdev(attach="osc %s %s" % (self.name, self.uuid),
                     setup ="%s %s" %(self.obd_uuid, srv.uuid))
 
+    def cleanup(self):
+        srv = self.get_server()
+        if local_net(srv):
+            Module.cleanup(self)
+        else:
+            self.info(self.obd_uuid, self.ost_uuid)
+            r =  find_route(srv)
+            if r:
+                lctl.del_route_host(r[0], srv.uuid, r[1], r[2])
+            Module.cleanup(self)
+            
 
 class Mountpoint(Module):
     def __init__(self,dom_node):
         Module.__init__(self, 'MTPT', dom_node)
         self.path = get_text(dom_node, 'path')
-        self.mdc_uuid = get_first_ref(dom_node, 'mdc')
+        self.mds_uuid = get_first_ref(dom_node, 'mds')
         self.lov_uuid = get_first_ref(dom_node, 'osc')
-        self.add_module('lustre/osc', 'osc')
-       # should add lov only if needed
-        self.add_module('lustre/lov', 'lov')
+        self.add_module('lustre/mdc', 'mdc')
         self.add_module('lustre/llite', 'llite')
+        l = lookup(self.dom_node.parentNode, self.lov_uuid)
+        self.osc = VOSC(l)
 
     def prepare(self):
-        l = lookup(self.dom_node.parentNode, self.lov_uuid)
-        if l.nodeName == 'lov':
-            lov = LOV(l)
-            for osc_uuid in lov.devlist:
-                osc = lookup(self.dom_node.parentNode, osc_uuid)
-                if osc:
-                    n = OSC(osc)
-                    n.prepare()
-                else:
-                    panic('osc not found:', osc_uuid)
-            lctl.newdev(attach="lov %s %s" % (lov.name, lov.uuid),
-                        setup ="%s" % (self.mdc_uuid))
-        else:
-            osc = OSC(l)
-            osc.prepare()
-            
-        self.info(self.path, self.mdc_uuid,self.lov_uuid)
+        self.osc.prepare()
+        mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
+
+        self.info(self.path, self.mds_uuid,self.lov_uuid)
         cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \
-              (self.lov_uuid, self.mdc_uuid, self.path)
+              (self.lov_uuid, mdc_uuid, self.path)
         run("mkdir", self.path)
         ret, val = run(cmd)
         if ret:
             panic("mount failed:", self.path)
+
     def cleanup(self):
-        self.info(self.path, self.mdc_uuid,self.lov_uuid)
+        self.info(self.path, self.mds_uuid,self.lov_uuid)
         (rc, out) = run("umount", self.path)
         if rc:
             log("umount failed, cleanup will most likely not work.")
         l = lookup(self.dom_node.parentNode, self.lov_uuid)
-        if l.nodeName == 'lov':
-            lov = LOV(l)
-            for osc_uuid in lov.devlist:
-                osc = lookup(self.dom_node.parentNode, osc_uuid)
-                if osc:
-                    n = OSC(osc)
-                    n.cleanup()
-                else:
-                    panic('osc not found:', osc_uuid)
-        else:
-            osc = OSC(l)
-            osc.cleanup()
-            
+        self.osc.cleanup()
+        cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
+
+    def load_module(self):
+        self.osc.load_module()
+        Module.load_module(self)
+    def cleanup_module(self):
+        Module.cleanup_module(self)
+        self.osc.cleanup_module()
+
 
 # ============================================================
 # XML processing and query
@@ -1041,7 +1171,7 @@ def getServiceLevel(dom_node):
         return 40
     elif type in ('mdc','osc'):
         return 50
-    elif type in ('lov',):
+    elif type in ('lov', 'lovconfig'):
         return 60
     elif type in ('mountpoint',):
         return 70
@@ -1071,7 +1201,35 @@ def getByName(lustreNode, name, tag):
     return None
     
 
-
+############################################################
+# MDC UUID hack - 
+# FIXME: clean this mess up!
+#
+mdc_uuid = None
+def prepare_mdc(dom_node, mds_uuid):
+    global mdc_uuid
+    mds_node = lookup(dom_node, mds_uuid);
+    if not mds_node:
+        panic("no mds:", mds_uuid)
+    if mdc_uuid:
+        return mdc_uuid
+    mdc = MDC(mds_node)
+    mdc.prepare()
+    mdc_uuid = mdc.uuid
+    return mdc_uuid
+
+mdc_cleaned = None
+def cleanup_mdc(dom_node, mds_uuid):
+    global mdc_cleaned, mdc_uuid  # both module-level names are rebound below
+    mds_node = lookup(dom_node, mds_uuid);
+    if not mds_node:
+        panic("no mds:", mds_uuid)
+    if not mdc_cleaned:
+        mdc = MDC(mds_node)
+        mdc.cleanup()
+        mdc_uuid = None  # forget the uuid cached by prepare_mdc()
+        mdc_cleaned = 'yes'
+        
 
 ############################################################
 # routing ("rooting")
@@ -1113,7 +1271,7 @@ def init_route_config(lustre):
     list = lustre.getElementsByTagName('node')
     for node in list:
         if get_attr(node, 'router'):
-           router_flag = 1
+            router_flag = 1
             for (local_type, local_nid) in local_node:
                 gw = None
                 netlist = node.getElementsByTagName('network')
@@ -1162,6 +1320,8 @@ def startService(dom_node, module_flag):
         n = LDLM(dom_node)
     elif type == 'lov':
         n = LOV(dom_node)
+    elif type == 'lovconfig':
+        n = LOVConfig(dom_node)
     elif type == 'network':
         n = Network(dom_node)
     elif type == 'obd':
@@ -1171,7 +1331,7 @@ def startService(dom_node, module_flag):
     elif type == 'mds':
         n = MDS(dom_node)
     elif type == 'osc':
-        n = OSC(dom_node)
+        n = VOSC(dom_node)
     elif type == 'mdc':
         n = MDC(dom_node)
     elif type == 'mountpoint':
@@ -1230,8 +1390,8 @@ def doHost(lustreNode, hosts):
         init_node(dom_node)
         init_route_config(lustreNode)
     else:
-       global router_flag 
-       router_flag = 1
+        global router_flag 
+        router_flag = 1
 
     # Two step process: (1) load modules, (2) setup lustre
     # if not cleaning, load modules first.
@@ -1260,7 +1420,8 @@ def parse_cmdline(argv):
     short_opts = "hdnv"
     long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb",
                  "portals=", "makeldiff", "cleanup", "noexec",
-                 "help", "node=", "get=", "nomod", "nosetup"]
+                 "help", "node=", "get=", "nomod", "nosetup",
+                 "dump="]
     opts = []
     args = []
     try:
@@ -1295,6 +1456,8 @@ def parse_cmdline(argv):
             config.nomod(1)
         if o  == "--nosetup":
             config.nosetup(1)
+        if o  == "--dump":
+            config.dump_file(a)
     return args
 
 def fetch(url):
@@ -1376,7 +1539,8 @@ def main():
     debug("configuring for host: ", node_list)
 
     if len(host) > 0:
-        config._debug_path = '/tmp/lustre-log-' + host
+        config._debug_path = config._debug_path + '-' + host
+        config._gdb_script = config._gdb_script + '-' + host
 
     TCP_ACCEPTOR = find_prog('acceptor')
     if not TCP_ACCEPTOR: