# Global parameters
TCP_ACCEPTOR = ''
-
+MAXTCPBUF = 1048576
+DEFAULT_TCPBUF = 1048576
#
# Maximum number of devices to search for.
# (the /dev/loop* nodes need to be created beforehand)
config.xml Lustre configuration in xml format.
--get <url> URL to fetch a config file
--v | --verbose Print system commands as they are run
--d | --debug Print system commands, but does not run them
--node <nodename> Load config for <nodename>
---cleanup Cleans up config. (Shutdown)
+-d | --cleanup Cleans up config. (Shutdown)
+-v | --verbose Print system commands as they are run
-h | --help Print this help
---gdb Create a gdb script to load the modules. Prints message
- after creating script and sleeps for 5 seconds.
+--gdb Prints message after creating gdb module script
+ and sleeps for 5 seconds.
+-n | --noexec Prints the commands and steps that will be run for a
+ config without executing them. This can used to check if a
+ config file is doing what it should be doing. (Implies -v)
+--nomod Skip load/unload module step.
+--nosetup Skip device setup/cleanup step.
+--reformat Reformat all devices (without question)
"""
TODO = """
---ldap server LDAP server with lustre config database
---reformat Reformat all devices (will confirm)
+--ldap server LDAP server with lustre config database
+--makeldiff Translate xml source to LDIFF
+This are perhaps not needed:
--lustre="src dir" Base directory of lustre sources. Used to search
for modules.
--portals=src Portals source
---makeldiff Translate xml source to LDIFF
"""
sys.exit()
self._url = None
self._gdb_script = '/tmp/ogdb'
self._debug_path = '/tmp/lustre-log'
+ self._dump_file = None
self._src_dir = None
def verbose(self, flag = None):
return self._debug_path
def src_dir(self, val = None):
- if val: self._url = val
- return self._url
+ if val: self._src_dir = val
+ return self._src_dir
+
+ def dump_file(self, val = None):
+ if val: self._dump_file = val
+ return self._dump_file
config = Config()
# debugging and error funcs
def fixme(msg = "this feature"):
- raise RuntimeError, msg + ' not implmemented yet.'
+ raise LconfError, msg + ' not implmemented yet.'
def panic(*args):
msg = string.join(map(str,args))
- print msg
if not config.noexec():
- raise RuntimeError, msg
+ raise LconfError(msg)
+ else:
+ print "! " + msg
def log(*args):
msg = string.join(map(str,args))
# ============================================================
# locally defined exceptions
class CommandError (exceptions.Exception):
- def __init__(self, args=None):
+ def __init__(self, cmd_name, cmd_err, rc=None):
+ self.cmd_name = cmd_name
+ self.cmd_err = cmd_err
+ self.rc = rc
+
+ def dump(self):
+ import types
+ if type(self.cmd_err) == types.StringType:
+ if self.rc:
+ print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
+ else:
+ print "! %s: %s" % (self.cmd_name, self.cmd_err)
+ elif type(self.cmd_err) == types.ListType:
+ if self.rc:
+ print "! %s (error %d):" % (self.cmd_name, self.rc)
+ else:
+ print "! %s:" % (self.cmd_name)
+ for s in self.cmd_err:
+ print "> %s" %(string.strip(s))
+ else:
+ print self.cmd_err
+
+class LconfError (exceptions.Exception):
+ def __init__(self, args):
self.args = args
+
# ============================================================
# handle lctl interface
class LCTLInterface:
debug('! lctl not found')
self.lctl = 'lctl'
else:
- raise CommandError, "unable to find lctl binary."
+ raise CommandError('lctl', "unable to find lctl binary.")
def run(self, cmds):
"""
p.tochild.write(cmds + "\n")
p.tochild.close()
out = p.fromchild.readlines()
- ret = p.poll()
- for l in out:
- debug('lctl:',string.strip(l))
err = p.childerr.readlines()
+ ret = p.wait()
if ret or len(err):
- log (self.lctl, "error:", ret)
- logall(err)
- raise CommandError, err
+ raise CommandError(self.lctl, err, ret)
return ret, out
+
def network(self, net, nid):
""" initialized network and add "self" """
# Idea: "mynid" could be used for all network types to add "self," and then
self.run(cmds)
- # create a new connection
- def connect(self, net, nid, port, servuuid, send_buf, read_buf):
- # XXX: buf size params not used yet
- cmds = """
+ # create a new connection
+ def connect(self, net, nid, port, servuuid, send_mem, recv_mem):
+ if net == 'tcp':
+ cmds = """
network %s
+ add_uuid %s %s
+ send_mem %d
+ recv_mem %d
connect %s %d
+ quit""" % (net, servuuid, nid, send_mem, recv_mem, nid, port, )
+ else:
+ cmds = """
+ network %s
add_uuid %s %s
- quit""" % (net, nid, port, servuuid, nid)
+ connect %s %d
+ quit""" % (net, servuuid, nid, nid, port, )
+
self.run(cmds)
- # create a new connection
- def add_route(self, net, to, via):
+ # add a route to a range
+ def add_route(self, net, gw, lo, hi):
cmds = """
- """
- #self.run(cmds)
+ network %s
+ add_route %s %s %s
+ quit """ % (net, gw, lo, hi)
+ self.run(cmds)
+
+
+ # delete a route to a range
+ def del_route(self, net, gw, lo, hi):
+ cmds = """
+ network %s
+ del_route %s
+ quit """ % (net, lo)
+ self.run(cmds)
+
+ # add a route to a host
+ def add_route_host(self, net, uuid, gw, tgt):
+ cmds = """
+ network %s
+ add_uuid %s %s
+ add_route %s %s
+ quit """ % (net, uuid, tgt, gw, tgt)
+ self.run(cmds)
+
+ # delete a route to a host
+ def del_route_host(self, net, uuid, gw, tgt):
+ cmds = """
+ network %s
+ del_uuid %s
+ del_route %s
+ quit """ % (net, uuid, tgt)
+ self.run(cmds)
# disconnect one connection
def disconnect(self, net, nid, port, servuuid):
quit""" % (net, nid, servuuid)
self.run(cmds)
- # disconnect all connections
+ # disconnect all
def disconnectAll(self, net):
cmds = """
network %s
- disconnect
del_uuid self
+ disconnect
quit""" % (net)
self.run(cmds)
quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
self.run(cmds)
+ # run lctl debug_kernel, writing to dump_file
+ def dump(self, dump_file):
+ cmds = """
+ debug_kernel %s 1
+ quit""" % (dump_file)
+ self.run(cmds)
+
# ============================================================
# Various system-level functions
# (ideally moved to their own module)
if module:
return module
-def find_module(src_dir, modname):
+def find_module(src_dir, dev_dir, modname):
mod = '%s.o' % (modname)
- search = (src_dir + "/lustre", src_dir + "/portals")
- for d in search:
- try:
- module = do_find_file(d, mod)
- if module:
- return module
- except OSError:
- pass
+ module = src_dir +'/'+ dev_dir +'/'+ mod
+ try:
+ if os.access(module, os.R_OK):
+ return module
+ except OSError:
+ pass
return None
# is the path a block device?
if dev:
print 'WARNING file:', file, 'already mapped to', dev
return dev
- if not os.access(file, os.R_OK | os.W_OK):
+ if config.reformat() or not os.access(file, os.R_OK | os.W_OK):
run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, file))
loop = loop_base()
# find next free loop
log('unable to clean loop device:', dev, 'for file:', file)
logall(out)
+# determine if dev is formatted as a <fstype> filesystem
+def need_format(fstype, dev):
+ # FIXME don't know how to implement this
+ return 0
+
# initialize a block device if needed
def block_dev(dev, size, fstype, format):
if config.noexec(): return dev
if not is_block(dev):
dev = init_loop(dev, size, fstype)
- if (format == 'yes'):
+ if config.reformat() or (need_format(fstype, dev) and format == 'yes'):
mkfs(fstype, dev)
+
+# else:
+# panic("device:", dev,
+# "not prepared, and autoformat is not set.\n",
+# "Rerun with --reformat option to format ALL filesystems")
+
return dev
+def if2addr(iface):
+ """lookup IP address for an interface"""
+ rc, out = run("/sbin/ifconfig", iface)
+ if rc or not out:
+ return None
+ addr = string.split(out[1])[1]
+ ip = string.split(addr, ':')[1]
+ return ip
+
+def get_local_address(net_type, wildcard):
+ """Return the local address for the network type."""
+ local = ""
+ if net_type == 'tcp':
+ if ':' in wildcard:
+ iface, star = string.split(wildcard, ':')
+ local = if2addr(iface)
+ if not local:
+ panic ("unable to determine ip for:", wildcard)
+ else:
+ host = socket.gethostname()
+ local = socket.gethostbyname(host)
+ elif net_type == 'elan':
+ # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
+ try:
+ fp = open('/proc/elan/device0/position', 'r')
+ lines = fp.readlines()
+ fp.close()
+ for l in lines:
+ a = string.split(l)
+ if a[0] == 'NodeId':
+ local = a[1]
+ break
+ except IOError, e:
+ log(e)
+ elif net_type == 'gm':
+ fixme("automatic local address for GM")
+ return local
+
+
+
# ============================================================
# Classes to prepare and cleanup the various objects
#
""" Base class for the rest of the modules. The default cleanup method is
defined here, as well as some utilitiy funcs.
"""
- def __init__(self, tag_name, node):
- self.dom_node = node
- self.tag_name = tag_name
- self.name = node.getAttribute('name')
- self.uuid = node.getAttribute('uuid')
+ def __init__(self, module_name, dom_node):
+ self.dom_node = dom_node
+ self.module_name = module_name
+ self.name = get_attr(dom_node, 'name')
+ self.uuid = get_attr(dom_node, 'uuid')
self.kmodule_list = []
+ self._server = None
+ self._connected = 0
def info(self, *args):
msg = string.join(map(str,args))
- print self.tag_name + ":", self.name, self.uuid, msg
+ print self.module_name + ":", self.name, self.uuid, msg
+
+
+ def lookup_server(self, srv_uuid):
+ """ Lookup a server's network information """
+ net = get_ost_net(self.dom_node.parentNode, srv_uuid)
+ if not net:
+ panic ("Unable to find a server for:", srv_uuid)
+ self._server = Network(net)
+
+ def get_server(self):
+ return self._server
def cleanup(self):
""" default cleanup, used for most modules """
self.info()
+ srv = self.get_server()
+ if srv and local_net(srv):
+ try:
+ lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
+ except CommandError, e:
+ log(self.module_name, "disconnect failed: ", self.name)
+ e.dump()
try:
lctl.cleanup(self.name, self.uuid)
- except CommandError:
- print "cleanup failed: ", self.name
+ except CommandError, e:
+ log(self.module_name, "cleanup failed: ", self.name)
+ e.dump()
- def add_module(self, modname):
+ def add_module(self, dev_dir, modname):
"""Append a module to list of modules to load."""
- self.kmodule_list.append(modname)
+ self.kmodule_list.append((dev_dir, modname))
def mod_loaded(self, modname):
"""Check if a module is already loaded. Look in /proc/modules for it."""
def load_module(self):
"""Load all the modules in the list in the order they appear."""
- for mod in self.kmodule_list:
+ for dev_dir, mod in self.kmodule_list:
# (rc, out) = run ('/sbin/lsmod | grep -s', mod)
if self.mod_loaded(mod) and not config.noexec():
continue
+ log ('loading module:', mod)
if config.src_dir():
- module = find_module(config.src_dir(), mod)
+ module = find_module(config.src_dir(),dev_dir, mod)
if not module:
panic('module not found:', mod)
(rc, out) = run('/sbin/insmod', module)
if rc:
- raise CommandError("insmod failed:", module)
+ raise CommandError('insmod', out, rc)
else:
(rc, out) = run('/sbin/modprobe', mod)
if rc:
- raise CommandError("modprobe failed:", module)
+ raise CommandError('modprobe', out, rc)
def cleanup_module(self):
"""Unload the modules in the list in reverse order."""
rev = self.kmodule_list
rev.reverse()
- for mod in rev:
- debug('rmmod', mod)
+ for dev_dir, mod in rev:
+ if not self.mod_loaded(mod):
+ continue
+ # debug hack
+ if mod == 'portals' and config.dump_file():
+ lctl.dump(config.dump_file())
+ log('unloading module:', mod)
if config.noexec():
continue
- run('/sbin/rmmod', mod)
+ (rc, out) = run('/sbin/rmmod', mod)
+ if rc:
+ log('! unable to unload module:', mod)
+ logall(out)
class Network(Module):
- def __init__(self,node):
- Module.__init__(self, 'NETWORK', node)
- self.net_type = node.getAttribute('type')
- self.nid = getText(node, 'server', "")
- self.port = int(getText(node, 'port', 0))
- self.send_buf = int(getText(node, 'send_buf', 0))
- self.read_buf = int(getText(node, 'read_buf', 0))
-
- self.add_module('portals')
+ def __init__(self,dom_node):
+ Module.__init__(self, 'NETWORK', dom_node)
+ self.net_type = get_attr(dom_node,'type')
+ self.nid = get_text(dom_node, 'server', '*')
+ self.port = get_text_int(dom_node, 'port', 0)
+ self.send_mem = get_text_int(dom_node, 'send_mem', DEFAULT_TCPBUF)
+ self.recv_mem = get_text_int(dom_node, 'recv_mem', DEFAULT_TCPBUF)
+ if '*' in self.nid:
+ self.nid = get_local_address(self.net_type, self.nid)
+ if not self.nid:
+ panic("unable to set nid for", self.net_type, self.nid)
+ debug("nid:", self.nid)
+
+ self.add_module('portals/linux/oslib/', 'portals')
+ if node_needs_router():
+ self.add_module('portals/linux/router', 'kptlrouter')
if self.net_type == 'tcp':
- self.add_module('ksocknal')
+ self.add_module('portals/linux/socknal', 'ksocknal')
if self.net_type == 'elan':
- self.add_module('kqswnal')
+ self.add_module('portals/linux/rqswnal', 'kqswnal')
if self.net_type == 'gm':
- self.add_module('kgmnal')
- self.add_module('obdclass')
- self.add_module('ptlrpc')
+ self.add_module('portals/linux/gmnal', 'kgmnal')
+ self.add_module('lustre/obdclass', 'obdclass')
+ self.add_module('lustre/ptlrpc', 'ptlrpc')
def prepare(self):
self.info(self.net_type, self.nid, self.port)
if self.net_type == 'tcp':
- ret = run_daemon(TCP_ACCEPTOR, self.port)
+ ret, out = run(TCP_ACCEPTOR, '-s', self.send_mem, '-r', self.recv_mem, self.port)
if ret:
- print "error:", ret
- raise CommandError, "cannot run acceptor"
+ raise CommandError(TCP_ACCEPTOR, out, ret)
+ ret = self.dom_node.getElementsByTagName('route_tbl')
+ for a in ret:
+ for r in a.getElementsByTagName('route'):
+ net_type = get_attr(r, 'type')
+ gw = get_attr(r, 'gw')
+ lo = get_attr(r, 'lo')
+ hi = get_attr(r,'hi', '')
+ lctl.add_route(net_type, gw, lo, hi)
+ if self.net_type == 'tcp' and hi == '':
+ srv = nid2server(self.dom_node.parentNode.parentNode, lo)
+ if not srv:
+ panic("no server for nid", lo)
+ else:
+ lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
+
+
lctl.network(self.net_type, self.nid)
lctl.newdev(attach = "ptlrpc RPCDEV")
def cleanup(self):
self.info(self.net_type, self.nid, self.port)
+ ret = self.dom_node.getElementsByTagName('route_tbl')
+ for a in ret:
+ for r in a.getElementsByTagName('route'):
+ lo = get_attr(r, 'lo')
+ hi = get_attr(r,'hi', '')
+ if self.net_type == 'tcp' and hi == '':
+ srv = nid2server(self.dom_node.parentNode.parentNode, lo)
+ if not srv:
+ panic("no server for nid", lo)
+ else:
+ try:
+ lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
+ except CommandError, e:
+ print "disconnect failed: ", self.name
+ e.dump()
+ try:
+ lctl.del_route(self.net_type, self.nid, lo, hi)
+ except CommandError, e:
+ print "del_route failed: ", self.name
+ e.dump()
+
try:
lctl.cleanup("RPCDEV", "")
- lctl.disconnectAll(self.net_type)
- except CommandError:
+ except CommandError, e:
print "cleanup failed: ", self.name
+ e.dump()
+ try:
+ lctl.disconnectAll(self.net_type)
+ except CommandError, e:
+ print "disconnectAll failed: ", self.name
+ e.dump()
if self.net_type == 'tcp':
# yikes, this ugly! need to save pid in /var/something
run("killall acceptor")
class LDLM(Module):
- def __init__(self,node):
- Module.__init__(self, 'LDLM', node)
- self.add_module('ldlm')
- self.add_module('extN') # yuck, fix dupe handling and move this
+ def __init__(self,dom_node):
+ Module.__init__(self, 'LDLM', dom_node)
+ self.add_module('lustre/ldlm', 'ldlm')
def prepare(self):
self.info()
lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid),
setup ="")
class LOV(Module):
- def __init__(self,node):
- Module.__init__(self, 'LOV', node)
- devs = node.getElementsByTagName('devices')[0]
- self.stripe_sz = int(devs.getAttribute('stripesize'))
- self.stripe_off = int(devs.getAttribute('stripeoffset'))
- self.pattern = int(devs.getAttribute('pattern'))
- mdsref = node.getElementsByTagName('mds_ref')[0]
- self.mdsuuid = mdsref.getAttribute('uuidref')
- mds= lookup(node.parentNode, self.mdsuuid)
- self.mdsname = getName(mds)
- devlist = ""
- stripe_cnt = 0
- for child in devs.childNodes:
- if child.nodeName == 'osc_ref':
- devlist = devlist + child.getAttribute('uuidref') + " "
- stripe_cnt = stripe_cnt + 1
- self.devlist = devlist
- self.stripe_cnt = stripe_cnt
- self.add_module('osc')
- self.add_module('lov')
+ def __init__(self,dom_node):
+ Module.__init__(self, 'LOV', dom_node)
+ self.mds_uuid = get_first_ref(dom_node, 'mds')
+ mds= lookup(dom_node.parentNode, self.mds_uuid)
+ self.mds_name = getName(mds)
+ devs = dom_node.getElementsByTagName('devices')
+ if len(devs) > 0:
+ dev_node = devs[0]
+ self.stripe_sz = get_attr_int(dev_node, 'stripesize', 65536)
+ self.stripe_off = get_attr_int(dev_node, 'stripeoffset', 0)
+ self.pattern = get_attr_int(dev_node, 'pattern', 0)
+ self.devlist = get_all_refs(dev_node, 'osc')
+ self.stripe_cnt = get_attr_int(dev_node, 'stripecount', len(self.devlist))
+ self.add_module('lustre/mdc', 'mdc')
+ self.add_module('lustre/lov', 'lov')
+
+ def prepare(self):
+ for osc_uuid in self.devlist:
+ osc = lookup(self.dom_node.parentNode, osc_uuid)
+ if osc:
+ n = OSC(osc)
+ n.prepare()
+ else:
+ panic('osc not found:', osc_uuid)
+ mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
+ self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
+ self.stripe_off, self.pattern, self.devlist, self.mds_name)
+ lctl.newdev(attach="lov %s %s" % (self.name, self.uuid),
+ setup ="%s" % (mdc_uuid))
+ def cleanup(self):
+ for osc_uuid in self.devlist:
+ osc = lookup(self.dom_node.parentNode, osc_uuid)
+ if osc:
+ n = OSC(osc)
+ n.cleanup()
+ else:
+ panic('osc not found:', osc_uuid)
+ Module.cleanup(self)
+ cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
+ def load_module(self):
+ for osc_uuid in self.devlist:
+ osc = lookup(self.dom_node.parentNode, osc_uuid)
+ if osc:
+ n = OSC(osc)
+ n.load_module()
+ break
+ else:
+ panic('osc not found:', osc_uuid)
+ Module.load_module(self)
+ def cleanup_module(self):
+ Module.cleanup_module(self)
+ for osc_uuid in self.devlist:
+ osc = lookup(self.dom_node.parentNode, osc_uuid)
+ if osc:
+ n = OSC(osc)
+ n.cleanup_module()
+ break
+ else:
+ panic('osc not found:', osc_uuid)
+
+class LOVConfig(Module):
+ def __init__(self,dom_node):
+ Module.__init__(self, 'LOVConfig', dom_node)
+ self.lov_uuid = get_first_ref(dom_node, 'lov')
+ l = lookup(dom_node.parentNode, self.lov_uuid)
+ self.lov = LOV(l)
+
def prepare(self):
- self.info(self.mdsuuid, self.stripe_cnt, self.stripe_sz, self.stripe_off, self.pattern,
- self.devlist, self.mdsname)
- lctl.lovconfig(self.uuid, self.mdsname, self.stripe_cnt,
- self.stripe_sz, self.stripe_off, self.pattern,
- self.devlist)
+ lov = self.lov
+ self.info(lov.mds_uuid, lov.stripe_cnt, lov.stripe_sz, lov.stripe_off, lov.pattern,
+ lov.devlist, lov.mds_name)
+ lctl.lovconfig(lov.uuid, lov.mds_name, lov.stripe_cnt,
+ lov.stripe_sz, lov.stripe_off, lov.pattern,
+ string.join(lov.devlist))
def cleanup(self):
+ #nothing to do here
pass
+
class MDS(Module):
- def __init__(self,node):
- Module.__init__(self, 'MDS', node)
- self.devname, self.size = getDevice(node)
- self.fstype = getText(node, 'fstype')
- self.format = getText(node, 'autoformat', "no")
- self.add_module('mds')
- self.add_module('mds_%s' % (self.fstype))
+ def __init__(self,dom_node):
+ Module.__init__(self, 'MDS', dom_node)
+ self.devname, self.size = get_device(dom_node)
+ self.fstype = get_text(dom_node, 'fstype')
+ self.format = get_text(dom_node, 'autoformat', "no")
+ if self.fstype == 'extN':
+ self.add_module('lustre/extN', 'extN')
+ self.add_module('lustre/mds', 'mds')
+ self.add_module('lustre/mds', 'mds_%s' % (self.fstype))
def prepare(self):
self.info(self.devname, self.fstype, self.format)
Module.cleanup(self)
clean_loop(self.devname)
+# Very unusual case, as there is no MDC element in the XML anymore
+# Builds itself from an MDS node
class MDC(Module):
- def __init__(self,node):
- Module.__init__(self, 'MDC', node)
- ref = node.getElementsByTagName('mds_ref')[0]
- self.mds_uuid = ref.getAttribute('uuidref')
- self.add_module('mdc')
+ def __init__(self,dom_node):
+ self.mds = MDS(dom_node)
+ self.dom_node = dom_node
+ self.module_name = 'MDC'
+ self.kmodule_list = []
+ self._server = None
+ self._connected = 0
+
+ host = socket.gethostname()
+ self.name = 'MDC_'+host
+ self.uuid = self.name+'_UUID'
+
+ self.lookup_server(self.mds.uuid)
+ self.add_module('lustre/mdc', 'mdc')
def prepare(self):
- self.info(self.mds_uuid)
- mds = lookup(self.dom_node.parentNode, self.mds_uuid)
- if mds == None:
- panic(self.mdsuuid, "not found.")
- net = get_ost_net(self.dom_node.parentNode, self.mds_uuid)
- srv = Network(net)
- lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_buf, srv.read_buf)
+ self.info(self.mds.uuid)
+ srv = self.get_server()
+ lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
lctl.newdev(attach="mdc %s %s" % (self.name, self.uuid),
- setup ="%s %s" %(self.mds_uuid, srv.uuid))
+ setup ="%s %s" %(self.mds.uuid, srv.uuid))
- def cleanup(self):
- self.info(self.mds_uuid)
- net = get_ost_net(self.dom_node.parentNode, self.mds_uuid)
- srv = Network(net)
- try:
- lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
- lctl.cleanup(self.name, self.uuid)
- except CommandError:
- print "cleanup failed: ", self.name
-
class OBD(Module):
- def __init__(self, node):
- Module.__init__(self, 'OBD', node)
- self.obdtype = node.getAttribute('type')
- self.devname, self.size = getDevice(node)
- self.fstype = getText(node, 'fstype')
- self.format = getText(node, 'autoformat', 'yes')
- self.add_module(self.obdtype)
+ def __init__(self, dom_node):
+ Module.__init__(self, 'OBD', dom_node)
+ self.obdtype = get_attr(dom_node, 'type')
+ self.devname, self.size = get_device(dom_node)
+ self.fstype = get_text(dom_node, 'fstype')
+ self.format = get_text(dom_node, 'autoformat', 'yes')
+ if self.fstype == 'extN':
+ self.add_module('lustre/extN', 'extN')
+ self.add_module('lustre/' + self.obdtype, self.obdtype)
# need to check /proc/mounts and /etc/mtab before
# formatting anything.
# FIXME: check if device is already formatted.
def prepare(self):
self.info(self.obdtype, self.devname, self.size, self.fstype, self.format)
- blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
+ if self.obdtype == 'obdecho':
+ blkdev = ''
+ else:
+ blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
lctl.newdev(attach="%s %s %s" % (self.obdtype, self.name, self.uuid),
setup ="%s %s" %(blkdev, self.fstype))
def cleanup(self):
Module.cleanup(self)
- clean_loop(self.devname)
+ if not self.obdtype == 'obdecho':
+ clean_loop(self.devname)
class OST(Module):
- def __init__(self,node):
- Module.__init__(self, 'OST', node)
- ref = node.getElementsByTagName('obd_ref')[0]
- self.obd_uuid = ref.getAttribute('uuidref')
- self.add_module('ost')
+ def __init__(self,dom_node):
+ Module.__init__(self, 'OST', dom_node)
+ self.obd_uuid = get_first_ref(dom_node, 'obd')
+ self.add_module('lustre/ost', 'ost')
def prepare(self):
self.info(self.obd_uuid)
lctl.newdev(attach="ost %s %s" % (self.name, self.uuid),
setup ="%s" % (self.obd_uuid))
+
+# virtual interface for OSC and LOV
+class VOSC(Module):
+ def __init__(self,dom_node):
+ Module.__init__(self, 'VOSC', dom_node)
+ if dom_node.nodeName == 'lov':
+ self.osc = LOV(dom_node)
+ else:
+ self.osc = OSC(dom_node)
+ def prepare(self):
+ self.osc.prepare()
+ def cleanup(self):
+ self.osc.cleanup()
+ def load_module(self):
+ self.osc.load_module()
+ def cleanup_module(self):
+ self.osc.cleanup_module()
+
+
class OSC(Module):
- def __init__(self,node):
- Module.__init__(self, 'OSC', node)
- ref = node.getElementsByTagName('obd_ref')[0]
- self.obd_uuid = ref.getAttribute('uuidref')
- ref = node.getElementsByTagName('ost_ref')[0]
- self.ost_uuid = ref.getAttribute('uuidref')
- self.add_module('osc')
+ def __init__(self,dom_node):
+ Module.__init__(self, 'OSC', dom_node)
+ self.obd_uuid = get_first_ref(dom_node, 'obd')
+ self.ost_uuid = get_first_ref(dom_node, 'ost')
+ self.lookup_server(self.ost_uuid)
+ self.add_module('lustre/osc', 'osc')
def prepare(self):
self.info(self.obd_uuid, self.ost_uuid)
- net = get_ost_net(self.dom_node.parentNode, self.ost_uuid)
- srv = Network(net)
- lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_buf, srv.read_buf)
+ srv = self.get_server()
+ if local_net(srv):
+ lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
+ else:
+ r = find_route(srv)
+ if r:
+ lctl.add_route_host(r[0], srv.uuid, r[1], r[2])
+ else:
+ panic ("no route to", srv.nid)
+
lctl.newdev(attach="osc %s %s" % (self.name, self.uuid),
setup ="%s %s" %(self.obd_uuid, srv.uuid))
def cleanup(self):
- self.info(self.obd_uuid, self.ost_uuid)
- net_uuid = get_ost_net(self.dom_node.parentNode, self.ost_uuid)
- srv = Network(net_uuid)
- try:
- lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
- lctl.cleanup(self.name, self.uuid)
- except CommandError:
- print "cleanup failed: ", self.name
+ srv = self.get_server()
+ if local_net(srv):
+ Module.cleanup(self)
+ else:
+ self.info(self.obd_uuid, self.ost_uuid)
+ r = find_route(srv)
+ if r:
+ lctl.del_route_host(r[0], srv.uuid, r[1], r[2])
+ Module.cleanup(self)
+
class Mountpoint(Module):
- def __init__(self,node):
- Module.__init__(self, 'MTPT', node)
- self.path = getText(node, 'path')
- ref = node.getElementsByTagName('mdc_ref')[0]
- self.mdc_uuid = ref.getAttribute('uuidref')
- ref = node.getElementsByTagName('osc_ref')[0]
- self.lov_uuid = ref.getAttribute('uuidref')
- self.add_module('osc')
- self.add_module('llite')
+ def __init__(self,dom_node):
+ Module.__init__(self, 'MTPT', dom_node)
+ self.path = get_text(dom_node, 'path')
+ self.mds_uuid = get_first_ref(dom_node, 'mds')
+ self.lov_uuid = get_first_ref(dom_node, 'osc')
+ self.add_module('lustre/mdc', 'mdc')
+ self.add_module('lustre/llite', 'llite')
+ l = lookup(self.dom_node.parentNode, self.lov_uuid)
+ self.osc = VOSC(l)
def prepare(self):
- l = lookup(self.dom_node.parentNode, self.lov_uuid)
- if l.nodeName == 'lov':
- lov = LOV(l)
- for osc_uuid in string.split(lov.devlist):
- osc = lookup(self.dom_node.parentNode, osc_uuid)
- if osc:
- n = OSC(osc)
- n.prepare()
- else:
- panic('osc not found:', osc_uuid)
- lctl.newdev(attach="lov %s %s" % (lov.name, lov.uuid),
- setup ="%s" % (self.mdc_uuid))
- else:
- osc = OSC(l)
- osc.prepare()
-
- self.info(self.path, self.mdc_uuid,self.lov_uuid)
+ self.osc.prepare()
+ mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
+
+ self.info(self.path, self.mds_uuid,self.lov_uuid)
cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \
- (self.lov_uuid, self.mdc_uuid, self.path)
+ (self.lov_uuid, mdc_uuid, self.path)
run("mkdir", self.path)
ret, val = run(cmd)
if ret:
panic("mount failed:", self.path)
+
def cleanup(self):
- self.info(self.path, self.mdc_uuid,self.lov_uuid)
- run("umount", self.path)
+ self.info(self.path, self.mds_uuid,self.lov_uuid)
+ (rc, out) = run("umount", self.path)
+ if rc:
+ log("umount failed, cleanup will most likely not work.")
l = lookup(self.dom_node.parentNode, self.lov_uuid)
- if l.nodeName == 'lov':
- lov = LOV(l)
- for osc_uuid in string.split(lov.devlist):
- osc = lookup(self.dom_node.parentNode, osc_uuid)
- if osc:
- n = OSC(osc)
- n.cleanup()
- else:
- panic('osc not found:', osc_uuid)
- lov.cleanup()
- else:
- osc = OSC(l)
- osc.cleanup()
-
+ self.osc.cleanup()
+ cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
+
+ def load_module(self):
+ self.osc.load_module()
+ Module.load_module(self)
+ def cleanup_module(self):
+ Module.cleanup_module(self)
+ self.osc.cleanup_module()
+
+
# ============================================================
# XML processing and query
# TODO: Change query funcs to use XPath, which is muc cleaner
-def getDevice(obd):
- dev = obd.getElementsByTagName('device')[0]
- dev.normalize();
- try:
- size = int(dev.getAttribute('size'))
- except ValueError:
- size = 0
- return dev.firstChild.data, size
+def get_device(obd):
+ list = obd.getElementsByTagName('device')
+ if len(list) > 0:
+ dev = list[0]
+ dev.normalize();
+ size = get_attr_int(dev, 'size', 0)
+ return dev.firstChild.data, size
+ return '', 0
# Get the text content from the first matching child
-def getText(node, tag, default=""):
- list = node.getElementsByTagName(tag)
+# If there is no content (or it is all whitespace), return
+# the default
+def get_text(dom_node, tag, default=""):
+ list = dom_node.getElementsByTagName(tag)
if len(list) > 0:
- node = list[0]
- node.normalize()
- return node.firstChild.data
- else:
- return default
-
-def get_ost_net(node, uuid):
- ost = lookup(node, uuid)
- list = ost.getElementsByTagName('network_ref')
- if list:
- uuid = list[0].getAttribute('uuidref')
- else:
+ dom_node = list[0]
+ dom_node.normalize()
+ if dom_node.firstChild:
+ txt = string.strip(dom_node.firstChild.data)
+ if txt:
+ return txt
+ return default
+
+def get_text_int(dom_node, tag, default=0):
+ list = dom_node.getElementsByTagName(tag)
+ n = default
+ if len(list) > 0:
+ dom_node = list[0]
+ dom_node.normalize()
+ if dom_node.firstChild:
+ txt = string.strip(dom_node.firstChild.data)
+ if txt:
+ try:
+ n = int(txt)
+ except ValueError:
+ panic("text value is not integer:", txt)
+ return n
+
+def get_attr(dom_node, attr, default=""):
+ v = dom_node.getAttribute(attr)
+ if v:
+ return v
+ return default
+
+def get_attr_int(dom_node, attr, default=0):
+ n = default
+ v = dom_node.getAttribute(attr)
+ if v:
+ try:
+ n = int(v)
+ except ValueError:
+ panic("attr value is not integer", v)
+ return n
+
+def get_first_ref(dom_node, tag):
+ """ Get the first uuidref of the type TAG. Used when only
+ one is expected. Returns the uuid."""
+ uuid = None
+ refname = '%s_ref' % tag
+ list = dom_node.getElementsByTagName(refname)
+ if len(list) > 0:
+ uuid = getRef(list[0])
+ return uuid
+
+def get_all_refs(dom_node, tag):
+ """ Get all the refs of type TAG. Returns list of uuids. """
+ uuids = []
+ refname = '%s_ref' % tag
+ list = dom_node.getElementsByTagName(refname)
+ if len(list) > 0:
+ for i in list:
+ uuids.append(getRef(i))
+ return uuids
+
+def get_ost_net(dom_node, uuid):
+ ost = lookup(dom_node, uuid)
+ uuid = get_first_ref(ost, 'network')
+ if not uuid:
return None
- return lookup(node, uuid)
+ return lookup(dom_node, uuid)
+
+def nid2server(dom_node, nid):
+ netlist = dom_node.getElementsByTagName('network')
+ for net_node in netlist:
+ if get_text(net_node, 'server') == nid:
+ return Network(net_node)
+ return None
-def lookup(node, uuid):
- for n in node.childNodes:
+def lookup(dom_node, uuid):
+ for n in dom_node.childNodes:
if n.nodeType == n.ELEMENT_NODE:
if getUUID(n) == uuid:
return n
if n: return n
return None
-# Get name attribute of node
-def getName(node):
- return node.getAttribute('name')
+# Get name attribute of dom_node
+def getName(dom_node):
+ return dom_node.getAttribute('name')
-def getRef(node):
- return node.getAttribute('uuidref')
+def getRef(dom_node):
+ return dom_node.getAttribute('uuidref')
-# Get name attribute of node
-def getUUID(node):
- return node.getAttribute('uuid')
+# Get uuid attribute of dom_node
+def getUUID(dom_node):
+ return dom_node.getAttribute('uuid')
# the tag name is the service type
-# fixme: this should do some checks to make sure the node is a service
-def getServiceType(node):
- return node.nodeName
+# fixme: this should do some checks to make sure the dom_node is a service
+def getServiceType(dom_node):
+ return dom_node.nodeName
#
# determine what "level" a particular node is at.
-# the order of iniitailization is based on level. objects
-# are assigned a level based on type:
-# net,devices,ldlm:1, obd, mdd:2 mds,ost:3 osc,mdc:4 mounts:5
-def getServiceLevel(node):
- type = getServiceType(node)
+# the order of initialization is based on level.
+def getServiceLevel(dom_node):
+ type = getServiceType(dom_node)
if type in ('network',):
- return 1
- if type in ('device', 'ldlm'):
- return 2
+ return 10
+ elif type in ('device', 'ldlm'):
+ return 20
elif type in ('obd', 'mdd'):
- return 3
+ return 30
elif type in ('mds','ost'):
- return 4
+ return 40
elif type in ('mdc','osc'):
- return 5
- elif type in ('lov',):
- return 6
+ return 50
+ elif type in ('lov', 'lovconfig'):
+ return 60
elif type in ('mountpoint',):
- return 7
+ return 70
return 0
#
# return list of services in a profile. list is a list of tuples
-# [(level, node),]
+# [(level, dom_node),]
def getServices(lustreNode, profileNode):
list = []
for n in profileNode.childNodes:
list.sort()
return list
-def getByName(lustreNode, tag, name):
+def getByName(lustreNode, name, tag):
ndList = lustreNode.getElementsByTagName(tag)
for nd in ndList:
if getName(nd) == name:
return None
-# ============================================================
+############################################################
+# MDC UUID hack -
+# FIXME: clean this mess up!
+#
+# UUID of the MDC set up by prepare_mdc(); None until one has been prepared.
+mdc_uuid = None
+
+def prepare_mdc(dom_node, mds_uuid):
+    """Prepare the MDC for the MDS named by mds_uuid, at most once.
+
+    The MDS element is looked up under dom_node on every call and panic()
+    is invoked when it cannot be found.  The first call that gets past the
+    lookup builds an MDC from that element, prepares it and stores its uuid
+    in the module-level mdc_uuid; later calls return the stored uuid
+    without preparing anything again.
+    """
+    global mdc_uuid
+    mds_node = lookup(dom_node, mds_uuid)
+    if not mds_node:
+        panic("no mds:", mds_uuid)
+    if not mdc_uuid:
+        client = MDC(mds_node)
+        client.prepare()
+        mdc_uuid = client.uuid
+    return mdc_uuid
+
+# Set to 'yes' once cleanup_mdc() has cleaned the MDC up.
+mdc_cleaned = None
+
+def cleanup_mdc(dom_node, mds_uuid):
+    """Clean up the MDC for the MDS named by mds_uuid, at most once.
+
+    The MDS element is looked up under dom_node on every call and panic()
+    is invoked when it cannot be found.  The first call that gets past the
+    lookup builds an MDC from that element, cleans it up, clears the uuid
+    cached in the module-level mdc_uuid and marks mdc_cleaned; later calls
+    do nothing beyond the lookup.
+    """
+    # Both names are rebound below, so both must be declared global;
+    # otherwise the assignment to mdc_uuid would only create a local and
+    # leave the cached uuid in place.
+    global mdc_uuid, mdc_cleaned
+    mds_node = lookup(dom_node, mds_uuid)
+    if not mds_node:
+        panic("no mds:", mds_uuid)
+    if not mdc_cleaned:
+        mdc = MDC(mds_node)
+        mdc.cleanup()
+        mdc_uuid = None
+        mdc_cleaned = 'yes'
+
+
+############################################################
+# routing ("rooting")
+#
+# Route tuples of the form (type, gw, lo, hi); rebuilt by init_route_config().
+routes = []
+# (type, server) pairs, one per <network> element of this node; filled in
+# by init_node().
+local_node = []
+# Set to 1 when a node carrying the 'router' attribute is seen.
+router_flag = 0
+
+def init_node(dom_node):
+    """Record the networks declared under dom_node in local_node.
+
+    For every 'network' element below dom_node a (type, server) tuple is
+    appended to the module-level local_node list, where type is the
+    element's 'type' attribute and server is the text of its 'server'
+    child.
+    """
+    global local_node
+    for net_elem in dom_node.getElementsByTagName('network'):
+        net_type = get_attr(net_elem, 'type')
+        server = get_text(net_elem, 'server')
+        local_node.append((net_type, server))
+
+# Return the module-level router_flag (0 until a router node has been seen).
+def node_needs_router():
+ return router_flag
+
+def get_routes(type, gw, dom_net):
+    """Collect the routes listed under dom_net.
+
+    Every 'route' element inside every 'route_tbl' element of dom_net
+    yields one tuple (type, gw, lo, hi).  type and gw are passed through
+    unchanged; lo and hi come from the route's attributes, with hi
+    defaulting to ''.  Returns the list of tuples, possibly empty.
+    """
+    found = []
+    for table in dom_net.getElementsByTagName('route_tbl'):
+        for entry in table.getElementsByTagName('route'):
+            lo = get_attr(entry, 'lo')
+            hi = get_attr(entry, 'hi', '')
+            found.append((type, gw, lo, hi))
+    return found
+
+
+def init_route_config(lustre):
+    """Rebuild the module-level routes list from the router nodes in lustre.
+
+    routes is reset to an empty list.  For every 'node' element that has a
+    'router' attribute, router_flag is set to 1.  Then, for each local
+    network recorded in local_node, the router's network of the same type
+    supplies the gateway (its 'server' text), and the routes found under
+    the router's networks of any other type are added via get_routes().
+    Routers with no network of the local type contribute nothing for that
+    local network.
+    """
+    global routes, router_flag
+    routes = []
+    for node in lustre.getElementsByTagName('node'):
+        if not get_attr(node, 'router'):
+            continue
+        router_flag = 1
+        for (local_type, local_nid) in local_node:
+            networks = node.getElementsByTagName('network')
+            # The gateway is the router's interface on our own network type.
+            gw = None
+            for dom_net in networks:
+                if get_attr(dom_net, 'type') == local_type:
+                    gw = get_text(dom_net, 'server')
+                    break
+            if not gw:
+                continue
+            # Routes are taken from the router's other networks.
+            for dom_net in networks:
+                if get_attr(dom_net, 'type') == local_type:
+                    continue
+                routes.extend(get_routes(local_type, gw, dom_net))
+
+
+def local_net(net):
+    """Return 1 if net.net_type matches the type of an entry in local_node.
+
+    Returns 0 when no entry matches, including when local_node is empty.
+    """
+    for (iface_type, iface_gw) in local_node:
+        if iface_type == net.net_type:
+            return 1
+    return 0
+
+def find_route(net):
+    """Return the first route whose 'lo' field equals net.nid, or None.
+
+    routes holds (type, gw, lo, hi) tuples as built by init_route_config().
+    Only an exact match on the 'lo' field is tested here; the 'hi' field
+    and the network type are not consulted.
+
+    local_node is deliberately not read: indexing its first entry would
+    raise IndexError whenever no local network has been recorded, and the
+    value was never used.
+    """
+    to_type = net.net_type
+    to = net.nid
+    debug ('looking for route to', to_type,to)
+    for route in routes:
+        if route[2] == to:
+            return route
+    return None
+
+
+
+
+############################################################
# lconf level logic
# Start a service.
-def startService(node, clean_flag, module_flag):
- type = getServiceType(node)
- debug('Service:', type, getName(node), getUUID(node))
+def startService(dom_node, module_flag):
+ type = getServiceType(dom_node)
+ debug('Service:', type, getName(dom_node), getUUID(dom_node))
# there must be a more dynamic way of doing this...
n = None
if type == 'ldlm':
- n = LDLM(node)
+ n = LDLM(dom_node)
elif type == 'lov':
- n = LOV(node)
+ n = LOV(dom_node)
+ elif type == 'lovconfig':
+ n = LOVConfig(dom_node)
elif type == 'network':
- n = Network(node)
+ n = Network(dom_node)
elif type == 'obd':
- n = OBD(node)
+ n = OBD(dom_node)
elif type == 'ost':
- n = OST(node)
+ n = OST(dom_node)
elif type == 'mds':
- n = MDS(node)
+ n = MDS(dom_node)
elif type == 'osc':
- n = OSC(node)
+ n = VOSC(dom_node)
elif type == 'mdc':
- n = MDC(node)
+ n = MDC(dom_node)
elif type == 'mountpoint':
- n = Mountpoint(node)
+ n = Mountpoint(dom_node)
else:
panic ("unknown service type:", type)
if module_flag:
if config.nomod():
return
- if clean_flag:
+ if config.cleanup():
n.cleanup_module()
else:
n.load_module()
else:
if config.nosetup():
return
- if clean_flag:
+ if config.cleanup():
n.cleanup()
else:
n.prepare()
# * make sure partitions are in place and prepared
# * initialize devices with lctl
# Levels is important, and needs to be enforced.
-def startProfile(lustreNode, profileNode, clean_flag, module_flag):
+def startProfile(lustreNode, profileNode, module_flag):
if not profileNode:
panic("profile:", profile, "not found.")
services = getServices(lustreNode, profileNode)
- if clean_flag:
+ if config.cleanup():
services.reverse()
for s in services:
- startService(s[1], clean_flag, module_flag)
+ startService(s[1], module_flag)
+
#
# Load profile for
-def doHost(lustreNode, hosts, clean_flag):
- node = None
+def doHost(lustreNode, hosts):
+ global routes
+ dom_node = None
for h in hosts:
- node = getByName(lustreNode, 'node', h)
- if node:
+ dom_node = getByName(lustreNode, h, 'node')
+ if dom_node:
break
- if not node:
+ if not dom_node:
print 'No host entry found.'
return
+ if not get_attr(dom_node, 'router'):
+ init_node(dom_node)
+ init_route_config(lustreNode)
+ else:
+ global router_flag
+ router_flag = 1
+
# Two step process: (1) load modules, (2) setup lustre
# if not cleaning, load modules first.
- module_flag = not clean_flag
- reflist = node.getElementsByTagName('profile')
+ module_flag = not config.cleanup()
+ reflist = dom_node.getElementsByTagName('profile')
for profile in reflist:
- startProfile(lustreNode, profile, clean_flag, module_flag)
+ startProfile(lustreNode, profile, module_flag)
- if not clean_flag:
- setDebugPath()
+ if not config.cleanup():
+ sys_set_debug_path()
script = config.gdb_script()
run(lctl.lctl, ' modules >', script)
if config.gdb():
module_flag = not module_flag
for profile in reflist:
- startProfile(lustreNode, profile, clean_flag, module_flag)
+ startProfile(lustreNode, profile, module_flag)
+############################################################
# Command line processing
#
def parse_cmdline(argv):
- short_opts = "hdv"
+ short_opts = "hdnv"
long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb",
- "portals=", "makeldiff", "cleanup",
- "help", "debug", "node=", "get=", "nomod", "nosetup"]
+ "portals=", "makeldiff", "cleanup", "noexec",
+ "help", "node=", "get=", "nomod", "nosetup",
+ "dump="]
opts = []
args = []
try:
opts, args = getopt.getopt(argv, short_opts, long_opts)
- except getopt.GetoptError:
+ except getopt.error:
print "invalid opt"
usage()
for o, a in opts:
if o in ("-h", "--help"):
usage()
- if o == "--cleanup":
+ if o in ("-d","--cleanup"):
config.cleanup(1)
if o in ("-v", "--verbose"):
config.verbose(1)
- if o in ("-d", "--debug"):
+ if o in ("-n", "--noexec"):
config.noexec(1)
config.verbose(1)
if o == "--portals":
config.nomod(1)
if o == "--nosetup":
config.nosetup(1)
+ if o == "--dump":
+ config.dump_file(a)
return args
def fetch(url):
if os.access(base+"/Makefile", os.R_OK):
config.src_dir(base + "/../../")
-def setDebugPath():
+def sys_set_debug_path():
debug("debug path: ", config.debug_path())
- fp = open('/proc/sys/portals/debug_path', 'w')
- fp.write(config.debug_path())
- fp.close()
+ if config.noexec():
+ return
+ try:
+ fp = open('/proc/sys/portals/debug_path', 'w')
+ fp.write(config.debug_path())
+ fp.close()
+ except IOError, e:
+ print e
+#/proc/sys/net/core/rmem_max
+#/proc/sys/net/core/wmem_max
+def sys_set_netmem_max(path, max):
+    """Raise the integer stored in the file at path to at least max.
+
+    The first line of the file is read and parsed as an integer; if max
+    is larger, max is written back followed by a newline.  Nothing is
+    read or written when config.noexec() is set.  Errors from open() and
+    int() are not caught and propagate to the caller.
+    """
+    debug("setting", path, "to at least", max)
+    if config.noexec():
+        return
+    fp = open(path)
+    try:
+        line = fp.readline()
+    finally:
+        # close() must actually be called; a bare 'fp.close' is a no-op.
+        fp.close()
+    cur = int(line)
+    if max > cur:
+        fp = open(path, 'w')
+        try:
+            fp.write('%d\n' %(max))
+        finally:
+            fp.close()
+
-def makeDevices():
+def sys_make_devices():
if not os.access('/dev/portals', os.R_OK):
run('mknod /dev/portals c 10 240')
if not os.access('/dev/obd', os.R_OK):
# Shutdown does steps in reverse
#
def main():
- global TCP_ACCEPTOR, lctl
+ global TCP_ACCEPTOR, lctl, MAXTCPBUF
+ host = socket.gethostname()
+
args = parse_cmdline(sys.argv[1:])
if len(args) > 0:
if not os.access(args[0], os.R_OK | os.W_OK):
if config.node():
node_list.append(config.node())
else:
- host = socket.gethostname()
if len(host) > 0:
node_list.append(host)
node_list.append('localhost')
- print "configuring for host: ", node_list
+ debug("configuring for host: ", node_list)
+
+ if len(host) > 0:
+ config._debug_path = config._debug_path + '-' + host
+ config._gdb_script = config._gdb_script + '-' + host
TCP_ACCEPTOR = find_prog('acceptor')
if not TCP_ACCEPTOR:
lctl = LCTLInterface('lctl')
setupModulePath(sys.argv[0])
- makeDevices()
- doHost(dom.documentElement, node_list, config.cleanup())
+ sys_make_devices()
+ sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
+ sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
+ doHost(dom.documentElement, node_list)
if __name__ == "__main__":
try:
main()
- except RuntimeError:
- pass
- except CommandError:
- print 'FIXME: insert exception data here'
-
+ except LconfError, e:
+ print e
+ except CommandError, e:
+ e.dump()