X-Git-Url: https://git.whamcloud.com/?a=blobdiff_plain;f=lustre%2Futils%2Flconf;h=9138a4d4e19da673d95f59ba861d78eafff0db83;hb=1400c3218795628f2d774dddbd3379997d006cef;hp=328d418b038f52f036798ec85c22fe593d460ffe;hpb=4c28cec2d7fa0e15355ccb701343d185bd6803b8;p=fs%2Flustre-release.git diff --git a/lustre/utils/lconf b/lustre/utils/lconf index 328d418..9138a4d 100755 --- a/lustre/utils/lconf +++ b/lustre/utils/lconf @@ -32,7 +32,8 @@ import xml.dom.minidom # Global parameters TCP_ACCEPTOR = '' - +MAXTCPBUF = 1048576 +DEFAULT_TCPBUF = 1048576 # # Maximum number of devices to search for. # (the /dev/loop* nodes need to be created beforehand) @@ -44,21 +45,26 @@ def usage(): config.xml Lustre configuration in xml format. --get URL to fetch a config file --v | --verbose Print system commands as they are run --d | --debug Print system commands, but does not run them --node Load config for ---cleanup Cleans up config. (Shutdown) +-d | --cleanup Cleans up config. (Shutdown) +-v | --verbose Print system commands as they are run -h | --help Print this help ---gdb Create a gdb script to load the modules. Prints message - after creating script and sleeps for 5 seconds. +--gdb Prints message after creating gdb module script + and sleeps for 5 seconds. +-n | --noexec Prints the commands and steps that will be run for a + config without executing them. This can used to check if a + config file is doing what it should be doing. (Implies -v) +--nomod Skip load/unload module step. +--nosetup Skip device setup/cleanup step. +--reformat Reformat all devices (without question) """ TODO = """ ---ldap server LDAP server with lustre config database ---reformat Reformat all devices (will confirm) +--ldap server LDAP server with lustre config database +--makeldiff Translate xml source to LDIFF +This are perhaps not needed: --lustre="src dir" Base directory of lustre sources. Used to search for modules. --portals=src Portals source ---makeldiff Translate xml source to LDIFF """ sys.exit() @@ -80,6 +86,7 @@ class Config: self._url = None self._gdb_script = '/tmp/ogdb' self._debug_path = '/tmp/lustre-log' + self._dump_file = None self._src_dir = None def verbose(self, flag = None): @@ -131,8 +138,12 @@ class Config: return self._debug_path def src_dir(self, val = None): - if val: self._url = val - return self._url + if val: self._src_dir = val + return self._src_dir + + def dump_file(self, val = None): + if val: self._dump_file = val + return self._dump_file config = Config() @@ -140,13 +151,14 @@ config = Config() # debugging and error funcs def fixme(msg = "this feature"): - raise RuntimeError, msg + ' not implmemented yet.' + raise LconfError, msg + ' not implmemented yet.' def panic(*args): msg = string.join(map(str,args)) - print msg if not config.noexec(): - raise RuntimeError, msg + raise LconfError(msg) + else: + print "! " + msg def log(*args): msg = string.join(map(str,args)) @@ -164,9 +176,33 @@ def debug(*args): # ============================================================ # locally defined exceptions class CommandError (exceptions.Exception): - def __init__(self, args=None): + def __init__(self, cmd_name, cmd_err, rc=None): + self.cmd_name = cmd_name + self.cmd_err = cmd_err + self.rc = rc + + def dump(self): + import types + if type(self.cmd_err) == types.StringType: + if self.rc: + print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err) + else: + print "! %s: %s" % (self.cmd_name, self.cmd_err) + elif type(self.cmd_err) == types.ListType: + if self.rc: + print "! %s (error %d):" % (self.cmd_name, self.rc) + else: + print "! %s:" % (self.cmd_name) + for s in self.cmd_err: + print "> %s" %(string.strip(s)) + else: + print self.cmd_err + +class LconfError (exceptions.Exception): + def __init__(self, args): self.args = args + # ============================================================ # handle lctl interface class LCTLInterface: @@ -184,7 +220,7 @@ class LCTLInterface: debug('! lctl not found') self.lctl = 'lctl' else: - raise CommandError, "unable to find lctl binary." + raise CommandError('lctl', "unable to find lctl binary.") def run(self, cmds): """ @@ -201,16 +237,13 @@ class LCTLInterface: p.tochild.write(cmds + "\n") p.tochild.close() out = p.fromchild.readlines() - ret = p.poll() - for l in out: - debug('lctl:',string.strip(l)) err = p.childerr.readlines() + ret = p.wait() if ret or len(err): - log (self.lctl, "error:", ret) - logall(err) - raise CommandError, err + raise CommandError(self.lctl, err, ret) return ret, out + def network(self, net, nid): """ initialized network and add "self" """ # Idea: "mynid" could be used for all network types to add "self," and then @@ -229,21 +262,59 @@ class LCTLInterface: self.run(cmds) - # create a new connection - def connect(self, net, nid, port, servuuid, send_buf, read_buf): - # XXX: buf size params not used yet - cmds = """ + # create a new connection + def connect(self, net, nid, port, servuuid, send_mem, recv_mem): + if net == 'tcp': + cmds = """ network %s + add_uuid %s %s + send_mem %d + recv_mem %d connect %s %d + quit""" % (net, servuuid, nid, send_mem, recv_mem, nid, port, ) + else: + cmds = """ + network %s add_uuid %s %s - quit""" % (net, nid, port, servuuid, nid) + connect %s %d + quit""" % (net, servuuid, nid, nid, port, ) + self.run(cmds) - # create a new connection - def add_route(self, net, to, via): + # add a route to a range + def add_route(self, net, gw, lo, hi): cmds = """ - """ - #self.run(cmds) + network %s + add_route %s %s %s + quit """ % (net, gw, lo, hi) + self.run(cmds) + + + # add a route to a range + def del_route(self, net, gw, lo, hi): + cmds = """ + network %s + del_route %s + quit """ % (net, lo) + self.run(cmds) + + # add a route to a host + def add_route_host(self, net, uuid, gw, tgt): + cmds = """ + network %s + add_uuid %s %s + add_route %s %s + quit """ % (net, uuid, tgt, gw, tgt) + self.run(cmds) + + # add a route to a range + def del_route_host(self, net, uuid, gw, tgt): + cmds = """ + network %s + del_uuid %s + del_route %s + quit """ % (net, uuid, tgt) + self.run(cmds) # disconnect one connection def disconnect(self, net, nid, port, servuuid): @@ -254,12 +325,12 @@ class LCTLInterface: quit""" % (net, nid, servuuid) self.run(cmds) - # disconnect all connections + # disconnect all def disconnectAll(self, net): cmds = """ network %s - disconnect del_uuid self + disconnect quit""" % (net) self.run(cmds) @@ -290,6 +361,13 @@ class LCTLInterface: quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist) self.run(cmds) + # cleanup a device + def dump(self, dump_file): + cmds = """ + debug_kernel %s 1 + quit""" % (dump_file) + self.run(cmds) + # ============================================================ # Various system-level functions # (ideally moved to their own module) @@ -348,16 +426,14 @@ def do_find_file(base, mod): if module: return module -def find_module(src_dir, modname): +def find_module(src_dir, dev_dir, modname): mod = '%s.o' % (modname) - search = (src_dir + "/lustre", src_dir + "/portals") - for d in search: - try: - module = do_find_file(d, mod) - if module: - return module - except OSError: - pass + module = src_dir +'/'+ dev_dir +'/'+ mod + try: + if os.access(module, os.R_OK): + return module + except OSError: + pass return None # is the path a block device? @@ -421,7 +497,7 @@ def init_loop(file, size, fstype): if dev: print 'WARNING file:', file, 'already mapped to', dev return dev - if not os.access(file, os.R_OK | os.W_OK): + if config.reformat() or not os.access(file, os.R_OK | os.W_OK): run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, file)) loop = loop_base() # find next free loop @@ -447,15 +523,66 @@ def clean_loop(file): log('unable to clean loop device:', dev, 'for file:', file) logall(out) +# determine if dev is formatted as a filesystem +def need_format(fstype, dev): + # FIXME don't know how to implement this + return 0 + # initialize a block device if needed def block_dev(dev, size, fstype, format): if config.noexec(): return dev if not is_block(dev): dev = init_loop(dev, size, fstype) - if (format == 'yes'): + if config.reformat() or (need_format(fstype, dev) and format == 'yes'): mkfs(fstype, dev) + +# else: +# panic("device:", dev, +# "not prepared, and autoformat is not set.\n", +# "Rerun with --reformat option to format ALL filesystems") + return dev +def if2addr(iface): + """lookup IP address for an interface""" + rc, out = run("/sbin/ifconfig", iface) + if rc or not out: + return None + addr = string.split(out[1])[1] + ip = string.split(addr, ':')[1] + return ip + +def get_local_address(net_type, wildcard): + """Return the local address for the network type.""" + local = "" + if net_type == 'tcp': + if ':' in wildcard: + iface, star = string.split(wildcard, ':') + local = if2addr(iface) + if not local: + panic ("unable to determine ip for:", wildcard) + else: + host = socket.gethostname() + local = socket.gethostbyname(host) + elif net_type == 'elan': + # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position' + try: + fp = open('/proc/elan/device0/position', 'r') + lines = fp.readlines() + fp.close() + for l in lines: + a = string.split(l) + if a[0] == 'NodeId': + local = a[1] + break + except IOError, e: + log(e) + elif net_type == 'gm': + fixme("automatic local address for GM") + return local + + + # ============================================================ # Classes to prepare and cleanup the various objects # @@ -463,28 +590,49 @@ class Module: """ Base class for the rest of the modules. The default cleanup method is defined here, as well as some utilitiy funcs. """ - def __init__(self, tag_name, node): - self.dom_node = node - self.tag_name = tag_name - self.name = node.getAttribute('name') - self.uuid = node.getAttribute('uuid') + def __init__(self, module_name, dom_node): + self.dom_node = dom_node + self.module_name = module_name + self.name = get_attr(dom_node, 'name') + self.uuid = get_attr(dom_node, 'uuid') self.kmodule_list = [] + self._server = None + self._connected = 0 def info(self, *args): msg = string.join(map(str,args)) - print self.tag_name + ":", self.name, self.uuid, msg + print self.module_name + ":", self.name, self.uuid, msg + + + def lookup_server(self, srv_uuid): + """ Lookup a server's network information """ + net = get_ost_net(self.dom_node.parentNode, srv_uuid) + if not net: + panic ("Unable to find a server for:", srv_uuid) + self._server = Network(net) + + def get_server(self): + return self._server def cleanup(self): """ default cleanup, used for most modules """ self.info() + srv = self.get_server() + if srv and local_net(srv): + try: + lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid) + except CommandError, e: + log(self.module_name, "disconnect failed: ", self.name) + e.dump() try: lctl.cleanup(self.name, self.uuid) - except CommandError: - print "cleanup failed: ", self.name + except CommandError, e: + log(self.module_name, "cleanup failed: ", self.name) + e.dump() - def add_module(self, modname): + def add_module(self, dev_dir, modname): """Append a module to list of modules to load.""" - self.kmodule_list.append(modname) + self.kmodule_list.append((dev_dir, modname)) def mod_loaded(self, modname): """Check if a module is already loaded. Look in /proc/modules for it.""" @@ -498,123 +646,231 @@ class Module: def load_module(self): """Load all the modules in the list in the order they appear.""" - for mod in self.kmodule_list: + for dev_dir, mod in self.kmodule_list: # (rc, out) = run ('/sbin/lsmod | grep -s', mod) if self.mod_loaded(mod) and not config.noexec(): continue + log ('loading module:', mod) if config.src_dir(): - module = find_module(config.src_dir(), mod) + module = find_module(config.src_dir(),dev_dir, mod) if not module: panic('module not found:', mod) (rc, out) = run('/sbin/insmod', module) if rc: - raise CommandError("insmod failed:", module) + raise CommandError('insmod', out, rc) else: (rc, out) = run('/sbin/modprobe', mod) if rc: - raise CommandError("modprobe failed:", module) + raise CommandError('modprobe', out, rc) def cleanup_module(self): """Unload the modules in the list in reverse order.""" rev = self.kmodule_list rev.reverse() - for mod in rev: - debug('rmmod', mod) + for dev_dir, mod in rev: + if not self.mod_loaded(mod): + continue + # debug hack + if mod == 'portals' and config.dump_file(): + lctl.dump(config.dump_file()) + log('unloading module:', mod) if config.noexec(): continue - run('/sbin/rmmod', mod) + (rc, out) = run('/sbin/rmmod', mod) + if rc: + log('! unable to unload module:', mod) + logall(out) class Network(Module): - def __init__(self,node): - Module.__init__(self, 'NETWORK', node) - self.net_type = node.getAttribute('type') - self.nid = getText(node, 'server', "") - self.port = int(getText(node, 'port', 0)) - self.send_buf = int(getText(node, 'send_buf', 0)) - self.read_buf = int(getText(node, 'read_buf', 0)) - - self.add_module('portals') + def __init__(self,dom_node): + Module.__init__(self, 'NETWORK', dom_node) + self.net_type = get_attr(dom_node,'type') + self.nid = get_text(dom_node, 'server', '*') + self.port = get_text_int(dom_node, 'port', 0) + self.send_mem = get_text_int(dom_node, 'send_mem', DEFAULT_TCPBUF) + self.recv_mem = get_text_int(dom_node, 'recv_mem', DEFAULT_TCPBUF) + if '*' in self.nid: + self.nid = get_local_address(self.net_type, self.nid) + if not self.nid: + panic("unable to set nid for", self.net_type, self.nid) + debug("nid:", self.nid) + + self.add_module('portals/linux/oslib/', 'portals') + if node_needs_router(): + self.add_module('portals/linux/router', 'kptlrouter') if self.net_type == 'tcp': - self.add_module('ksocknal') + self.add_module('portals/linux/socknal', 'ksocknal') if self.net_type == 'elan': - self.add_module('kqswnal') + self.add_module('portals/linux/rqswnal', 'kqswnal') if self.net_type == 'gm': - self.add_module('kgmnal') - self.add_module('obdclass') - self.add_module('ptlrpc') + self.add_module('portals/linux/gmnal', 'kgmnal') + self.add_module('lustre/obdclass', 'obdclass') + self.add_module('lustre/ptlrpc', 'ptlrpc') def prepare(self): self.info(self.net_type, self.nid, self.port) if self.net_type == 'tcp': - ret = run_daemon(TCP_ACCEPTOR, self.port) + ret, out = run(TCP_ACCEPTOR, '-s', self.send_mem, '-r', self.recv_mem, self.port) if ret: - print "error:", ret - raise CommandError, "cannot run acceptor" + raise CommandError(TCP_ACCEPTOR, out, ret) + ret = self.dom_node.getElementsByTagName('route_tbl') + for a in ret: + for r in a.getElementsByTagName('route'): + net_type = get_attr(r, 'type') + gw = get_attr(r, 'gw') + lo = get_attr(r, 'lo') + hi = get_attr(r,'hi', '') + lctl.add_route(net_type, gw, lo, hi) + if self.net_type == 'tcp' and hi == '': + srv = nid2server(self.dom_node.parentNode.parentNode, lo) + if not srv: + panic("no server for nid", lo) + else: + lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem) + + lctl.network(self.net_type, self.nid) lctl.newdev(attach = "ptlrpc RPCDEV") def cleanup(self): self.info(self.net_type, self.nid, self.port) + ret = self.dom_node.getElementsByTagName('route_tbl') + for a in ret: + for r in a.getElementsByTagName('route'): + lo = get_attr(r, 'lo') + hi = get_attr(r,'hi', '') + if self.net_type == 'tcp' and hi == '': + srv = nid2server(self.dom_node.parentNode.parentNode, lo) + if not srv: + panic("no server for nid", lo) + else: + try: + lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid) + except CommandError, e: + print "disconnect failed: ", self.name + e.dump() + try: + lctl.del_route(self.net_type, self.nid, lo, hi) + except CommandError, e: + print "del_route failed: ", self.name + e.dump() + try: lctl.cleanup("RPCDEV", "") - lctl.disconnectAll(self.net_type) - except CommandError: + except CommandError, e: print "cleanup failed: ", self.name + e.dump() + try: + lctl.disconnectAll(self.net_type) + except CommandError, e: + print "disconnectAll failed: ", self.name + e.dump() if self.net_type == 'tcp': # yikes, this ugly! need to save pid in /var/something run("killall acceptor") class LDLM(Module): - def __init__(self,node): - Module.__init__(self, 'LDLM', node) - self.add_module('ldlm') - self.add_module('extN') # yuck, fix dupe handling and move this + def __init__(self,dom_node): + Module.__init__(self, 'LDLM', dom_node) + self.add_module('lustre/ldlm', 'ldlm') def prepare(self): self.info() lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid), setup ="") class LOV(Module): - def __init__(self,node): - Module.__init__(self, 'LOV', node) - devs = node.getElementsByTagName('devices')[0] - self.stripe_sz = int(devs.getAttribute('stripesize')) - self.stripe_off = int(devs.getAttribute('stripeoffset')) - self.pattern = int(devs.getAttribute('pattern')) - mdsref = node.getElementsByTagName('mds_ref')[0] - self.mdsuuid = mdsref.getAttribute('uuidref') - mds= lookup(node.parentNode, self.mdsuuid) - self.mdsname = getName(mds) - devlist = "" - stripe_cnt = 0 - for child in devs.childNodes: - if child.nodeName == 'osc_ref': - devlist = devlist + child.getAttribute('uuidref') + " " - stripe_cnt = stripe_cnt + 1 - self.devlist = devlist - self.stripe_cnt = stripe_cnt - self.add_module('osc') - self.add_module('lov') + def __init__(self,dom_node): + Module.__init__(self, 'LOV', dom_node) + self.mds_uuid = get_first_ref(dom_node, 'mds') + mds= lookup(dom_node.parentNode, self.mds_uuid) + self.mds_name = getName(mds) + devs = dom_node.getElementsByTagName('devices') + if len(devs) > 0: + dev_node = devs[0] + self.stripe_sz = get_attr_int(dev_node, 'stripesize', 65536) + self.stripe_off = get_attr_int(dev_node, 'stripeoffset', 0) + self.pattern = get_attr_int(dev_node, 'pattern', 0) + self.devlist = get_all_refs(dev_node, 'osc') + self.stripe_cnt = get_attr_int(dev_node, 'stripecount', len(self.devlist)) + self.add_module('lustre/mdc', 'mdc') + self.add_module('lustre/lov', 'lov') + + def prepare(self): + for osc_uuid in self.devlist: + osc = lookup(self.dom_node.parentNode, osc_uuid) + if osc: + n = OSC(osc) + n.prepare() + else: + panic('osc not found:', osc_uuid) + mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid) + self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz, + self.stripe_off, self.pattern, self.devlist, self.mds_name) + lctl.newdev(attach="lov %s %s" % (self.name, self.uuid), + setup ="%s" % (mdc_uuid)) + def cleanup(self): + for osc_uuid in self.devlist: + osc = lookup(self.dom_node.parentNode, osc_uuid) + if osc: + n = OSC(osc) + n.cleanup() + else: + panic('osc not found:', osc_uuid) + Module.cleanup(self) + cleanup_mdc(self.dom_node.parentNode, self.mds_uuid) + def load_module(self): + for osc_uuid in self.devlist: + osc = lookup(self.dom_node.parentNode, osc_uuid) + if osc: + n = OSC(osc) + n.load_module() + break + else: + panic('osc not found:', osc_uuid) + Module.load_module(self) + def cleanup_module(self): + Module.cleanup_module(self) + for osc_uuid in self.devlist: + osc = lookup(self.dom_node.parentNode, osc_uuid) + if osc: + n = OSC(osc) + n.cleanup_module() + break + else: + panic('osc not found:', osc_uuid) + +class LOVConfig(Module): + def __init__(self,dom_node): + Module.__init__(self, 'LOVConfig', dom_node) + self.lov_uuid = get_first_ref(dom_node, 'lov') + l = lookup(dom_node.parentNode, self.lov_uuid) + self.lov = LOV(l) + def prepare(self): - self.info(self.mdsuuid, self.stripe_cnt, self.stripe_sz, self.stripe_off, self.pattern, - self.devlist, self.mdsname) - lctl.lovconfig(self.uuid, self.mdsname, self.stripe_cnt, - self.stripe_sz, self.stripe_off, self.pattern, - self.devlist) + lov = self.lov + self.info(lov.mds_uuid, lov.stripe_cnt, lov.stripe_sz, lov.stripe_off, lov.pattern, + lov.devlist, lov.mds_name) + lctl.lovconfig(lov.uuid, lov.mds_name, lov.stripe_cnt, + lov.stripe_sz, lov.stripe_off, lov.pattern, + string.join(lov.devlist)) def cleanup(self): + #nothing to do here pass + class MDS(Module): - def __init__(self,node): - Module.__init__(self, 'MDS', node) - self.devname, self.size = getDevice(node) - self.fstype = getText(node, 'fstype') - self.format = getText(node, 'autoformat', "no") - self.add_module('mds') - self.add_module('mds_%s' % (self.fstype)) + def __init__(self,dom_node): + Module.__init__(self, 'MDS', dom_node) + self.devname, self.size = get_device(dom_node) + self.fstype = get_text(dom_node, 'fstype') + self.format = get_text(dom_node, 'autoformat', "no") + if self.fstype == 'extN': + self.add_module('lustre/extN', 'extN') + self.add_module('lustre/mds', 'mds') + self.add_module('lustre/mds', 'mds_%s' % (self.fstype)) def prepare(self): self.info(self.devname, self.fstype, self.format) @@ -625,181 +881,257 @@ class MDS(Module): Module.cleanup(self) clean_loop(self.devname) +# Very unusual case, as there is no MDC element in the XML anymore +# Builds itself from an MDS node class MDC(Module): - def __init__(self,node): - Module.__init__(self, 'MDC', node) - ref = node.getElementsByTagName('mds_ref')[0] - self.mds_uuid = ref.getAttribute('uuidref') - self.add_module('mdc') + def __init__(self,dom_node): + self.mds = MDS(dom_node) + self.dom_node = dom_node + self.module_name = 'MDC' + self.kmodule_list = [] + self._server = None + self._connected = 0 + + host = socket.gethostname() + self.name = 'MDC_'+host + self.uuid = self.name+'_UUID' + + self.lookup_server(self.mds.uuid) + self.add_module('lustre/mdc', 'mdc') def prepare(self): - self.info(self.mds_uuid) - mds = lookup(self.dom_node.parentNode, self.mds_uuid) - if mds == None: - panic(self.mdsuuid, "not found.") - net = get_ost_net(self.dom_node.parentNode, self.mds_uuid) - srv = Network(net) - lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_buf, srv.read_buf) + self.info(self.mds.uuid) + srv = self.get_server() + lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem) lctl.newdev(attach="mdc %s %s" % (self.name, self.uuid), - setup ="%s %s" %(self.mds_uuid, srv.uuid)) + setup ="%s %s" %(self.mds.uuid, srv.uuid)) - def cleanup(self): - self.info(self.mds_uuid) - net = get_ost_net(self.dom_node.parentNode, self.mds_uuid) - srv = Network(net) - try: - lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid) - lctl.cleanup(self.name, self.uuid) - except CommandError: - print "cleanup failed: ", self.name - class OBD(Module): - def __init__(self, node): - Module.__init__(self, 'OBD', node) - self.obdtype = node.getAttribute('type') - self.devname, self.size = getDevice(node) - self.fstype = getText(node, 'fstype') - self.format = getText(node, 'autoformat', 'yes') - self.add_module(self.obdtype) + def __init__(self, dom_node): + Module.__init__(self, 'OBD', dom_node) + self.obdtype = get_attr(dom_node, 'type') + self.devname, self.size = get_device(dom_node) + self.fstype = get_text(dom_node, 'fstype') + self.format = get_text(dom_node, 'autoformat', 'yes') + if self.fstype == 'extN': + self.add_module('lustre/extN', 'extN') + self.add_module('lustre/' + self.obdtype, self.obdtype) # need to check /proc/mounts and /etc/mtab before # formatting anything. # FIXME: check if device is already formatted. def prepare(self): self.info(self.obdtype, self.devname, self.size, self.fstype, self.format) - blkdev = block_dev(self.devname, self.size, self.fstype, self.format) + if self.obdtype == 'obdecho': + blkdev = '' + else: + blkdev = block_dev(self.devname, self.size, self.fstype, self.format) lctl.newdev(attach="%s %s %s" % (self.obdtype, self.name, self.uuid), setup ="%s %s" %(blkdev, self.fstype)) def cleanup(self): Module.cleanup(self) - clean_loop(self.devname) + if not self.obdtype == 'obdecho': + clean_loop(self.devname) class OST(Module): - def __init__(self,node): - Module.__init__(self, 'OST', node) - ref = node.getElementsByTagName('obd_ref')[0] - self.obd_uuid = ref.getAttribute('uuidref') - self.add_module('ost') + def __init__(self,dom_node): + Module.__init__(self, 'OST', dom_node) + self.obd_uuid = get_first_ref(dom_node, 'obd') + self.add_module('lustre/ost', 'ost') def prepare(self): self.info(self.obd_uuid) lctl.newdev(attach="ost %s %s" % (self.name, self.uuid), setup ="%s" % (self.obd_uuid)) + +# virtual interface for OSC and LOV +class VOSC(Module): + def __init__(self,dom_node): + Module.__init__(self, 'VOSC', dom_node) + if dom_node.nodeName == 'lov': + self.osc = LOV(dom_node) + else: + self.osc = OSC(dom_node) + def prepare(self): + self.osc.prepare() + def cleanup(self): + self.osc.cleanup() + def load_module(self): + self.osc.load_module() + def cleanup_module(self): + self.osc.cleanup_module() + + class OSC(Module): - def __init__(self,node): - Module.__init__(self, 'OSC', node) - ref = node.getElementsByTagName('obd_ref')[0] - self.obd_uuid = ref.getAttribute('uuidref') - ref = node.getElementsByTagName('ost_ref')[0] - self.ost_uuid = ref.getAttribute('uuidref') - self.add_module('osc') + def __init__(self,dom_node): + Module.__init__(self, 'OSC', dom_node) + self.obd_uuid = get_first_ref(dom_node, 'obd') + self.ost_uuid = get_first_ref(dom_node, 'ost') + self.lookup_server(self.ost_uuid) + self.add_module('lustre/osc', 'osc') def prepare(self): self.info(self.obd_uuid, self.ost_uuid) - net = get_ost_net(self.dom_node.parentNode, self.ost_uuid) - srv = Network(net) - lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_buf, srv.read_buf) + srv = self.get_server() + if local_net(srv): + lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem) + else: + r = find_route(srv) + if r: + lctl.add_route_host(r[0], srv.uuid, r[1], r[2]) + else: + panic ("no route to", srv.nid) + lctl.newdev(attach="osc %s %s" % (self.name, self.uuid), setup ="%s %s" %(self.obd_uuid, srv.uuid)) def cleanup(self): - self.info(self.obd_uuid, self.ost_uuid) - net_uuid = get_ost_net(self.dom_node.parentNode, self.ost_uuid) - srv = Network(net_uuid) - try: - lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid) - lctl.cleanup(self.name, self.uuid) - except CommandError: - print "cleanup failed: ", self.name + srv = self.get_server() + if local_net(srv): + Module.cleanup(self) + else: + self.info(self.obd_uuid, self.ost_uuid) + r = find_route(srv) + if r: + lctl.del_route_host(r[0], srv.uuid, r[1], r[2]) + Module.cleanup(self) + class Mountpoint(Module): - def __init__(self,node): - Module.__init__(self, 'MTPT', node) - self.path = getText(node, 'path') - ref = node.getElementsByTagName('mdc_ref')[0] - self.mdc_uuid = ref.getAttribute('uuidref') - ref = node.getElementsByTagName('osc_ref')[0] - self.lov_uuid = ref.getAttribute('uuidref') - self.add_module('osc') - self.add_module('llite') + def __init__(self,dom_node): + Module.__init__(self, 'MTPT', dom_node) + self.path = get_text(dom_node, 'path') + self.mds_uuid = get_first_ref(dom_node, 'mds') + self.lov_uuid = get_first_ref(dom_node, 'osc') + self.add_module('lustre/mdc', 'mdc') + self.add_module('lustre/llite', 'llite') + l = lookup(self.dom_node.parentNode, self.lov_uuid) + self.osc = VOSC(l) def prepare(self): - l = lookup(self.dom_node.parentNode, self.lov_uuid) - if l.nodeName == 'lov': - lov = LOV(l) - for osc_uuid in string.split(lov.devlist): - osc = lookup(self.dom_node.parentNode, osc_uuid) - if osc: - n = OSC(osc) - n.prepare() - else: - panic('osc not found:', osc_uuid) - lctl.newdev(attach="lov %s %s" % (lov.name, lov.uuid), - setup ="%s" % (self.mdc_uuid)) - else: - osc = OSC(l) - osc.prepare() - - self.info(self.path, self.mdc_uuid,self.lov_uuid) + self.osc.prepare() + mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid) + + self.info(self.path, self.mds_uuid,self.lov_uuid) cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \ - (self.lov_uuid, self.mdc_uuid, self.path) + (self.lov_uuid, mdc_uuid, self.path) run("mkdir", self.path) ret, val = run(cmd) if ret: panic("mount failed:", self.path) + def cleanup(self): - self.info(self.path, self.mdc_uuid,self.lov_uuid) - run("umount", self.path) + self.info(self.path, self.mds_uuid,self.lov_uuid) + (rc, out) = run("umount", self.path) + if rc: + log("umount failed, cleanup will most likely not work.") l = lookup(self.dom_node.parentNode, self.lov_uuid) - if l.nodeName == 'lov': - lov = LOV(l) - for osc_uuid in string.split(lov.devlist): - osc = lookup(self.dom_node.parentNode, osc_uuid) - if osc: - n = OSC(osc) - n.cleanup() - else: - panic('osc not found:', osc_uuid) - lov.cleanup() - else: - osc = OSC(l) - osc.cleanup() - + self.osc.cleanup() + cleanup_mdc(self.dom_node.parentNode, self.mds_uuid) + + def load_module(self): + self.osc.load_module() + Module.load_module(self) + def cleanup_module(self): + Module.cleanup_module(self) + self.osc.cleanup_module() + + # ============================================================ # XML processing and query # TODO: Change query funcs to use XPath, which is muc cleaner -def getDevice(obd): - dev = obd.getElementsByTagName('device')[0] - dev.normalize(); - try: - size = int(dev.getAttribute('size')) - except ValueError: - size = 0 - return dev.firstChild.data, size +def get_device(obd): + list = obd.getElementsByTagName('device') + if len(list) > 0: + dev = list[0] + dev.normalize(); + size = get_attr_int(dev, 'size', 0) + return dev.firstChild.data, size + return '', 0 # Get the text content from the first matching child -def getText(node, tag, default=""): - list = node.getElementsByTagName(tag) +# If there is no content (or it is all whitespace), return +# the default +def get_text(dom_node, tag, default=""): + list = dom_node.getElementsByTagName(tag) if len(list) > 0: - node = list[0] - node.normalize() - return node.firstChild.data - else: - return default - -def get_ost_net(node, uuid): - ost = lookup(node, uuid) - list = ost.getElementsByTagName('network_ref') - if list: - uuid = list[0].getAttribute('uuidref') - else: + dom_node = list[0] + dom_node.normalize() + if dom_node.firstChild: + txt = string.strip(dom_node.firstChild.data) + if txt: + return txt + return default + +def get_text_int(dom_node, tag, default=0): + list = dom_node.getElementsByTagName(tag) + n = default + if len(list) > 0: + dom_node = list[0] + dom_node.normalize() + if dom_node.firstChild: + txt = string.strip(dom_node.firstChild.data) + if txt: + try: + n = int(txt) + except ValueError: + panic("text value is not integer:", txt) + return n + +def get_attr(dom_node, attr, default=""): + v = dom_node.getAttribute(attr) + if v: + return v + return default + +def get_attr_int(dom_node, attr, default=0): + n = default + v = dom_node.getAttribute(attr) + if v: + try: + n = int(v) + except ValueError: + panic("attr value is not integer", v) + return n + +def get_first_ref(dom_node, tag): + """ Get the first uuidref of the type TAG. Used one only + one is expected. Returns the uuid.""" + uuid = None + refname = '%s_ref' % tag + list = dom_node.getElementsByTagName(refname) + if len(list) > 0: + uuid = getRef(list[0]) + return uuid + +def get_all_refs(dom_node, tag): + """ Get all the refs of type TAG. Returns list of uuids. """ + uuids = [] + refname = '%s_ref' % tag + list = dom_node.getElementsByTagName(refname) + if len(list) > 0: + for i in list: + uuids.append(getRef(i)) + return uuids + +def get_ost_net(dom_node, uuid): + ost = lookup(dom_node, uuid) + uuid = get_first_ref(ost, 'network') + if not uuid: return None - return lookup(node, uuid) + return lookup(dom_node, uuid) + +def nid2server(dom_node, nid): + netlist = dom_node.getElementsByTagName('network') + for net_node in netlist: + if get_text(net_node, 'server') == nid: + return Network(net_node) + return None -def lookup(node, uuid): - for n in node.childNodes: +def lookup(dom_node, uuid): + for n in dom_node.childNodes: if n.nodeType == n.ELEMENT_NODE: if getUUID(n) == uuid: return n @@ -808,48 +1140,46 @@ def lookup(node, uuid): if n: return n return None -# Get name attribute of node -def getName(node): - return node.getAttribute('name') +# Get name attribute of dom_node +def getName(dom_node): + return dom_node.getAttribute('name') -def getRef(node): - return node.getAttribute('uuidref') +def getRef(dom_node): + return dom_node.getAttribute('uuidref') -# Get name attribute of node -def getUUID(node): - return node.getAttribute('uuid') +# Get name attribute of dom_node +def getUUID(dom_node): + return dom_node.getAttribute('uuid') # the tag name is the service type -# fixme: this should do some checks to make sure the node is a service -def getServiceType(node): - return node.nodeName +# fixme: this should do some checks to make sure the dom_node is a service +def getServiceType(dom_node): + return dom_node.nodeName # # determine what "level" a particular node is at. -# the order of iniitailization is based on level. objects -# are assigned a level based on type: -# net,devices,ldlm:1, obd, mdd:2 mds,ost:3 osc,mdc:4 mounts:5 -def getServiceLevel(node): - type = getServiceType(node) +# the order of iniitailization is based on level. +def getServiceLevel(dom_node): + type = getServiceType(dom_node) if type in ('network',): - return 1 - if type in ('device', 'ldlm'): - return 2 + return 10 + elif type in ('device', 'ldlm'): + return 20 elif type in ('obd', 'mdd'): - return 3 + return 30 elif type in ('mds','ost'): - return 4 + return 40 elif type in ('mdc','osc'): - return 5 - elif type in ('lov',): - return 6 + return 50 + elif type in ('lov', 'lovconfig'): + return 60 elif type in ('mountpoint',): - return 7 + return 70 return 0 # # return list of services in a profile. list is a list of tuples -# [(level, node),] +# [(level, dom_node),] def getServices(lustreNode, profileNode): list = [] for n in profileNode.childNodes: @@ -863,7 +1193,7 @@ def getServices(lustreNode, profileNode): list.sort() return list -def getByName(lustreNode, tag, name): +def getByName(lustreNode, name, tag): ndList = lustreNode.getElementsByTagName(tag) for nd in ndList: if getName(nd) == name: @@ -871,46 +1201,155 @@ def getByName(lustreNode, tag, name): return None -# ============================================================ +############################################################ +# MDC UUID hack - +# FIXME: clean this mess up! +# +mdc_uuid = None +def prepare_mdc(dom_node, mds_uuid): + global mdc_uuid + mds_node = lookup(dom_node, mds_uuid); + if not mds_node: + panic("no mds:", mds_uuid) + if mdc_uuid: + return mdc_uuid + mdc = MDC(mds_node) + mdc.prepare() + mdc_uuid = mdc.uuid + return mdc_uuid + +mdc_cleaned = None +def cleanup_mdc(dom_node, mds_uuid): + global mdc_cleaned + mds_node = lookup(dom_node, mds_uuid); + if not mds_node: + panic("no mds:", mds_uuid) + if not mdc_cleaned: + mdc = MDC(mds_node) + mdc.cleanup() + mdc_uuid = None + mdc_cleaned = 'yes' + + +############################################################ +# routing ("rooting") +# +routes = [] +local_node = [] +router_flag = 0 + +def init_node(dom_node): + global local_node, router_flag + netlist = dom_node.getElementsByTagName('network') + for dom_net in netlist: + type = get_attr(dom_net, 'type') + gw = get_text(dom_net, 'server') + local_node.append((type, gw)) + +def node_needs_router(): + return router_flag + +def get_routes(type, gw, dom_net): + """ Return the routes as a list of tuples of the form: + [(type, gw, lo, hi),]""" + res = [] + tbl = dom_net.getElementsByTagName('route_tbl') + for t in tbl: + routes = t.getElementsByTagName('route') + for r in routes: + lo = get_attr(r, 'lo') + hi = get_attr(r, 'hi', '') + res.append((type, gw, lo, hi)) + return res + + +def init_route_config(lustre): + """ Scan the lustre config looking for routers. Build list of + routes. """ + global routes, router_flag + routes = [] + list = lustre.getElementsByTagName('node') + for node in list: + if get_attr(node, 'router'): + router_flag = 1 + for (local_type, local_nid) in local_node: + gw = None + netlist = node.getElementsByTagName('network') + for dom_net in netlist: + if local_type == get_attr(dom_net, 'type'): + gw = get_text(dom_net, 'server') + break + if not gw: + continue + for dom_net in netlist: + if local_type != get_attr(dom_net, 'type'): + for route in get_routes(local_type, gw, dom_net): + routes.append(route) + + +def local_net(net): + global local_node + for iface in local_node: + if net.net_type == iface[0]: + return 1 + return 0 + +def find_route(net): + global local_node, routes + frm_type = local_node[0][0] + to_type = net.net_type + to = net.nid + debug ('looking for route to', to_type,to) + for r in routes: + if r[2] == to: + return r + return None + + + + +############################################################ # lconf level logic # Start a service. -def startService(node, clean_flag, module_flag): - type = getServiceType(node) - debug('Service:', type, getName(node), getUUID(node)) +def startService(dom_node, module_flag): + type = getServiceType(dom_node) + debug('Service:', type, getName(dom_node), getUUID(dom_node)) # there must be a more dynamic way of doing this... n = None if type == 'ldlm': - n = LDLM(node) + n = LDLM(dom_node) elif type == 'lov': - n = LOV(node) + n = LOV(dom_node) + elif type == 'lovconfig': + n = LOVConfig(dom_node) elif type == 'network': - n = Network(node) + n = Network(dom_node) elif type == 'obd': - n = OBD(node) + n = OBD(dom_node) elif type == 'ost': - n = OST(node) + n = OST(dom_node) elif type == 'mds': - n = MDS(node) + n = MDS(dom_node) elif type == 'osc': - n = OSC(node) + n = VOSC(dom_node) elif type == 'mdc': - n = MDC(node) + n = MDC(dom_node) elif type == 'mountpoint': - n = Mountpoint(node) + n = Mountpoint(dom_node) else: panic ("unknown service type:", type) if module_flag: if config.nomod(): return - if clean_flag: + if config.cleanup(): n.cleanup_module() else: n.load_module() else: if config.nosetup(): return - if clean_flag: + if config.cleanup(): n.cleanup() else: n.prepare() @@ -923,37 +1362,46 @@ def startService(node, clean_flag, module_flag): # * make sure partitions are in place and prepared # * initialize devices with lctl # Levels is important, and needs to be enforced. -def startProfile(lustreNode, profileNode, clean_flag, module_flag): +def startProfile(lustreNode, profileNode, module_flag): if not profileNode: panic("profile:", profile, "not found.") services = getServices(lustreNode, profileNode) - if clean_flag: + if config.cleanup(): services.reverse() for s in services: - startService(s[1], clean_flag, module_flag) + startService(s[1], module_flag) + # # Load profile for -def doHost(lustreNode, hosts, clean_flag): - node = None +def doHost(lustreNode, hosts): + global routes + dom_node = None for h in hosts: - node = getByName(lustreNode, 'node', h) - if node: + dom_node = getByName(lustreNode, h, 'node') + if dom_node: break - if not node: + if not dom_node: print 'No host entry found.' return + if not get_attr(dom_node, 'router'): + init_node(dom_node) + init_route_config(lustreNode) + else: + global router_flag + router_flag = 1 + # Two step process: (1) load modules, (2) setup lustre # if not cleaning, load modules first. - module_flag = not clean_flag - reflist = node.getElementsByTagName('profile') + module_flag = not config.cleanup() + reflist = dom_node.getElementsByTagName('profile') for profile in reflist: - startProfile(lustreNode, profile, clean_flag, module_flag) + startProfile(lustreNode, profile, module_flag) - if not clean_flag: - setDebugPath() + if not config.cleanup(): + sys_set_debug_path() script = config.gdb_script() run(lctl.lctl, ' modules >', script) if config.gdb(): @@ -963,31 +1411,33 @@ def doHost(lustreNode, hosts, clean_flag): module_flag = not module_flag for profile in reflist: - startProfile(lustreNode, profile, clean_flag, module_flag) + startProfile(lustreNode, profile, module_flag) +############################################################ # Command line processing # def parse_cmdline(argv): - short_opts = "hdv" + short_opts = "hdnv" long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb", - "portals=", "makeldiff", "cleanup", - "help", "debug", "node=", "get=", "nomod", "nosetup"] + "portals=", "makeldiff", "cleanup", "noexec", + "help", "node=", "get=", "nomod", "nosetup", + "dump="] opts = [] args = [] try: opts, args = getopt.getopt(argv, short_opts, long_opts) - except getopt.GetoptError: + except getopt.error: print "invalid opt" usage() for o, a in opts: if o in ("-h", "--help"): usage() - if o == "--cleanup": + if o in ("-d","--cleanup"): config.cleanup(1) if o in ("-v", "--verbose"): config.verbose(1) - if o in ("-d", "--debug"): + if o in ("-n", "--noexec"): config.noexec(1) config.verbose(1) if o == "--portals": @@ -1006,6 +1456,8 @@ def parse_cmdline(argv): config.nomod(1) if o == "--nosetup": config.nosetup(1) + if o == "--dump": + config.dump_file(a) return args def fetch(url): @@ -1023,14 +1475,34 @@ def setupModulePath(cmd): if os.access(base+"/Makefile", os.R_OK): config.src_dir(base + "/../../") -def setDebugPath(): +def sys_set_debug_path(): debug("debug path: ", config.debug_path()) - fp = open('/proc/sys/portals/debug_path', 'w') - fp.write(config.debug_path()) - fp.close() + if config.noexec(): + return + try: + fp = open('/proc/sys/portals/debug_path', 'w') + fp.write(config.debug_path()) + fp.close() + except IOError, e: + print e +#/proc/sys/net/core/rmem_max +#/proc/sys/net/core/wmem_max +def sys_set_netmem_max(path, max): + debug("setting", path, "to at least", max) + if config.noexec(): + return + fp = open(path) + str = fp.readline() + fp.close + cur = int(str) + if max > cur: + fp = open(path, 'w') + fp.write('%d\n' %(max)) + fp.close() + -def makeDevices(): +def sys_make_devices(): if not os.access('/dev/portals', os.R_OK): run('mknod /dev/portals c 10 240') if not os.access('/dev/obd', os.R_OK): @@ -1042,7 +1514,9 @@ def makeDevices(): # Shutdown does steps in reverse # def main(): - global TCP_ACCEPTOR, lctl + global TCP_ACCEPTOR, lctl, MAXTCPBUF + host = socket.gethostname() + args = parse_cmdline(sys.argv[1:]) if len(args) > 0: if not os.access(args[0], os.R_OK | os.W_OK): @@ -1059,11 +1533,14 @@ def main(): if config.node(): node_list.append(config.node()) else: - host = socket.gethostname() if len(host) > 0: node_list.append(host) node_list.append('localhost') - print "configuring for host: ", node_list + debug("configuring for host: ", node_list) + + if len(host) > 0: + config._debug_path = config._debug_path + '-' + host + config._gdb_script = config._gdb_script + '-' + host TCP_ACCEPTOR = find_prog('acceptor') if not TCP_ACCEPTOR: @@ -1076,14 +1553,15 @@ def main(): lctl = LCTLInterface('lctl') setupModulePath(sys.argv[0]) - makeDevices() - doHost(dom.documentElement, node_list, config.cleanup()) + sys_make_devices() + sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF) + sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF) + doHost(dom.documentElement, node_list) if __name__ == "__main__": try: main() - except RuntimeError: - pass - except CommandError: - print 'FIXME: insert exception data here' - + except LconfError, e: + print e + except CommandError, e: + e.dump()