#!/usr/bin/env python # # Copyright (C) 2002 Cluster File Systems, Inc. # Author: Robert Read # This file is part of Lustre, http://www.lustre.org. # # Lustre is free software; you can redistribute it and/or # modify it under the terms of version 2 of the GNU General Public # License as published by the Free Software Foundation. # # Lustre is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Lustre; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # # lconf - lustre configuration tool # # lconf is the main driver script for starting and stopping # lustre filesystem services. # # Based in part on the XML obdctl modifications done by Brian Behlendorf import sys, getopt import string, os, stat, popen2, socket, time import re, exceptions import xml.dom.minidom # Global parameters TCP_ACCEPTOR = '' MAXTCPBUF = 1048576 # # Maximum number of devices to search for. # (the /dev/loop* nodes need to be created beforehand) MAX_LOOP_DEVICES = 256 def usage(): print """usage: lconf config.xml config.xml Lustre configuration in xml format. --get URL to fetch a config file --node Load config for -d | --cleanup Cleans up config. (Shutdown) -v | --verbose Print system commands as they are run -h | --help Print this help --gdb Prints message after creating gdb module script and sleeps for 5 seconds. -n | --noexec Prints the commands and steps that will be run for a config without executing them. This can used to check if a config file is doing what it should be doing. (Implies -v) --nomod Skip load/unload module step. --nosetup Skip device setup/cleanup step. 
--reformat Reformat all devices (without question) """ TODO = """ --ldap server LDAP server with lustre config database --makeldiff Translate xml source to LDIFF This are perhaps not needed: --lustre="src dir" Base directory of lustre sources. Used to search for modules. --portals=src Portals source """ sys.exit() # ============================================================ # Config parameters, encapsulated in a class class Config: def __init__(self): # flags self._noexec = 0 self._verbose = 0 self._reformat = 0 self._cleanup = 0 self._gdb = 0 self._nomod = 0 self._nosetup = 0 # parameters self._modules = None self._node = None self._url = None self._gdb_script = '/tmp/ogdb' self._debug_path = '/tmp/lustre-log' self._src_dir = None def verbose(self, flag = None): if flag: self._verbose = flag return self._verbose def noexec(self, flag = None): if flag: self._noexec = flag return self._noexec def reformat(self, flag = None): if flag: self._reformat = flag return self._reformat def cleanup(self, flag = None): if flag: self._cleanup = flag return self._cleanup def gdb(self, flag = None): if flag: self._gdb = flag return self._gdb def nomod(self, flag = None): if flag: self._nomod = flag return self._nomod def nosetup(self, flag = None): if flag: self._nosetup = flag return self._nosetup def node(self, val = None): if val: self._node = val return self._node def url(self, val = None): if val: self._url = val return self._url def gdb_script(self): if os.path.isdir('/r'): return '/r' + self._gdb_script else: return self._gdb_script def debug_path(self): if os.path.isdir('/r'): return '/r' + self._debug_path else: return self._debug_path def src_dir(self, val = None): if val: self._url = val return self._url config = Config() # ============================================================ # debugging and error funcs def fixme(msg = "this feature"): raise LconfError, msg + ' not implmemented yet.' 
def panic(*args):
    """Report a fatal condition.

    The arguments are stringified and joined with spaces.  Normally this
    raises LconfError; in noexec mode it only prints the message (prefixed
    with "! ") and returns, so callers must cope with panic() returning.
    """
    msg = ' '.join(map(str, args))
    if not config.noexec():
        raise LconfError(msg)
    else:
        print("! " + msg)


def log(*args):
    """Print the arguments, stringified and joined with spaces."""
    msg = ' '.join(map(str, args))
    print(msg)


def logall(msgs):
    """Print every string in MSGS with surrounding whitespace stripped."""
    for s in msgs:
        print(s.strip())


def debug(*args):
    """Like log(), but only when verbose mode is enabled."""
    if config.verbose():
        msg = ' '.join(map(str, args))
        print(msg)


# ============================================================
# locally defined exceptions
class CommandError (Exception):
    """Failure of an external command.

    cmd_name -- name of the command that failed
    cmd_err  -- error text, either one string or a list of output lines
    rc       -- exit status, or None when not known
    """

    def __init__(self, cmd_name, cmd_err, rc=None):
        self.cmd_name = cmd_name
        self.cmd_err = cmd_err
        self.rc = rc

    def dump(self):
        """Print the error in a readable form."""
        if isinstance(self.cmd_err, str):
            if self.rc:
                print("! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err))
            else:
                print("! %s: %s" % (self.cmd_name, self.cmd_err))
        elif isinstance(self.cmd_err, list):
            if self.rc:
                print("! %s (error %d):" % (self.cmd_name, self.rc))
            else:
                print("! %s:" % (self.cmd_name))
            for s in self.cmd_err:
                print("> %s" % (s.strip()))
        else:
            print(self.cmd_err)


class LconfError (Exception):
    """Fatal lconf error carrying a single message."""

    def __init__(self, args):
        # Let the base class store the message.  Assigning the string
        # straight to self.args makes str(e) come out as a tuple of
        # characters on interpreters that coerce args to a tuple.
        Exception.__init__(self, args)


# ============================================================
# handle lctl interface
class LCTLInterface:
    """
    Manage communication with lctl
    """

    def __init__(self, cmd):
        """
        Initialize the interface by finding the lctl binary.

        Raises CommandError when the binary cannot be found, except in
        noexec mode where the bare name 'lctl' is used instead.
        """
        self.lctl = find_prog(cmd)
        if not self.lctl:
            if config.noexec():
                debug('! lctl not found')
                self.lctl = 'lctl'
            else:
                raise CommandError('lctl', "unable to find lctl binary.")

    def run(self, cmds):
        """
        run lctl
        the cmds are written to stdin of lctl
        lctl doesn't return errors when run in script mode, so
        stderr is checked
        should modify command line to accept multiple commands, or
        create complex command line options

        Returns (status, stdout lines); raises CommandError when the exit
        status is non-zero or anything was written to stderr.  In noexec
        mode nothing is run and (0, []) is returned.
        """
        debug("+", self.lctl, cmds)
        if config.noexec(): return (0, [])
        p = popen2.Popen3(self.lctl, 1)
        p.tochild.write(cmds + "\n")
        p.tochild.close()
        # NOTE(review): stdout is drained completely before stderr is read;
        # presumably lctl never writes enough to stderr to block -- confirm.
        out = p.fromchild.readlines()
        err = p.childerr.readlines()
        ret = p.wait()
        if ret or len(err):
            raise CommandError(self.lctl, err, ret)
        return ret, out

    def network(self, net, nid):
        """ initialized network and add "self" """
        # Idea: "mynid" could be used for all network types to add "self," and then
        # this special case would be gone and the "self" hack would be hidden.
        if net == 'tcp':
            cmds = """
  network %s
  mynid %s
  add_uuid self %s
  quit""" % (net, nid, nid)
        else:
            cmds = """
  network %s
  add_uuid self %s
  quit""" % (net, nid)
        self.run(cmds)

    # create a new connection
    def connect(self, net, nid, port, servuuid, send_mem, recv_mem):
        """Register SERVUUID for NID and connect to NID:PORT.

        The send_mem/recv_mem values are only sent for tcp networks.
        """
        if net == 'tcp':
            cmds = """
  network %s
  add_uuid %s %s
  send_mem %d
  recv_mem %d
  connect %s %d
  quit""" % (net, servuuid, nid, send_mem, recv_mem, nid, port, )
        else:
            cmds = """
  network %s
  add_uuid %s %s
  connect %s %d
  quit""" % (net, servuuid, nid, nid, port, )
        self.run(cmds)

    # add a route to a range
    def add_route(self, net, gw, lo, hi):
        """Add a route through gateway GW for the nid range LO..HI."""
        cmds = """
  network %s
  add_route %s %s %s
  quit  """ % (net, gw, lo, hi)
        self.run(cmds)

    # delete a route
    def del_route(self, net, gw, lo, hi):
        """Delete the route for LO; GW and HI are accepted but not sent."""
        cmds = """
  network %s
  del_route %s
  quit  """ % (net, lo)
        self.run(cmds)

    # add a route to a host
    def add_route_host(self, net, uuid, gw, tgt):
        """Register UUID for TGT and route to TGT through gateway GW."""
        cmds = """
  network %s
  add_uuid %s %s
  add_route %s %s
  quit  """ % (net, uuid, tgt, gw, tgt)
        self.run(cmds)

    # disconnect one connection
    def disconnect(self, net, nid, port, servuuid):
        """Disconnect from NID and forget SERVUUID; PORT is not sent."""
        cmds = """
  network %s
  disconnect %s
  del_uuid %s
  quit""" % (net, nid, servuuid)
        self.run(cmds)

    # disconnect all connections
    def disconnectAll(self, net):
        """Drop every connection on NET and remove the "self" uuid."""
        cmds = """
  network %s
  disconnect
  del_uuid self
  quit""" % (net)
        self.run(cmds)

    # create a new device with lctl
    def newdev(self, attach, setup = ""):
        """Create a device: ATTACH and SETUP are the lctl argument strings."""
        cmds = """
  newdev
  attach %s
  setup %s
  quit""" % (attach, setup)
        self.run(cmds)

    # cleanup a device
    def cleanup(self, name, uuid):
        """Clean up and detach the device NAME; UUID is not sent."""
        cmds = """
  device $%s
  cleanup
  detach
  quit""" % (name)
        self.run(cmds)

    # create an lov
    def lovconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist):
        """Send the lovconfig command on the device selected by MDSUUID."""
        cmds = """
  device $%s
  probe
  lovconfig %s %d %d %d %s %s
  quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
        self.run(cmds)


# ============================================================
# Various system-level functions
# (ideally moved to their own module)

# Run a command and return the output and status.
# stderr is merged into the returned output (2>&1); could use popen3 to
# keep it separate if necessary
def run(*args):
    """Run a shell command built by joining ARGS with spaces.

    Returns (exit status, list of output lines).  In noexec mode the
    command is only logged and (0, []) is returned.
    """
    cmd = ' '.join(map(str, args))
    debug ("+", cmd)
    if config.noexec(): return (0, [])
    f = os.popen(cmd + ' 2>&1')
    out = f.readlines()
    ret = f.close()
    # close() yields None on success, otherwise the wait() status word;
    # the exit code lives in the high byte.
    if ret:
        ret = ret >> 8
    else:
        ret = 0
    return (ret, out)

# Run a command in the background.
def run_daemon(*args):
    """Start a shell command built by joining ARGS with spaces.

    The command's output is not read; the pipe is closed straight away and
    the resulting exit status is returned (0 on success).  In noexec mode
    the command is only logged and 0 is returned.
    """
    cmd = ' '.join(map(str, args))
    debug ("+", cmd)
    if config.noexec(): return 0
    f = os.popen(cmd + ' 2>&1')
    ret = f.close()
    # close() yields None on success, otherwise the wait() status word;
    # the exit code lives in the high byte.
    if ret:
        ret = ret >> 8
    else:
        ret = 0
    return ret


# Determine full path to use for an external command
# searches dirname(argv[0])/../../portals/linux/utils/ first, then
# dirname(argv[0]), then PATH
def find_prog(cmd):
    """Return the path of the executable CMD, or '' when it is not found."""
    # PATH may be missing from the environment; treat that as empty.
    syspath = os.environ.get('PATH', '').split(':')
    cmdpath = os.path.dirname(sys.argv[0])
    syspath.insert(0, cmdpath)
    syspath.insert(0, os.path.join(cmdpath, '../../portals/linux/utils/'))
    for d in syspath:
        prog = os.path.join(d, cmd)
        # X_OK alone is also true for searchable directories.
        if os.path.isfile(prog) and os.access(prog, os.X_OK):
            return prog
    return ''


# Recursively look for file starting at base dir
def do_find_file(base, mod):
    """Return the first readable path named MOD under BASE, else None."""
    fullname = os.path.join(base, mod)
    if os.access(fullname, os.R_OK):
        return fullname
    try:
        entries = os.listdir(base)
    except OSError:
        # base is missing, unreadable or not a directory
        return None
    for d in entries:
        subdir = os.path.join(base, d)
        if os.path.isdir(subdir):
            module = do_find_file(subdir, mod)
            if module:
                return module
    return None


def find_module(src_dir, dev_dir, modname):
    """Return SRC_DIR/DEV_DIR/MODNAME.o when that file is readable, else None."""
    mod = '%s.o' % (modname)
    module = src_dir + '/' + dev_dir + '/' + mod
    if os.access(module, os.R_OK):
        return module
    return None

# is the path a block device?
def is_block(path): s = () try: s = os.stat(path) except OSError: return 0 return stat.S_ISBLK(s[stat.ST_MODE]) # build fs according to type # fixme: dangerous def mkfs(fstype, dev): if(fstype in ('ext3', 'extN')): mkfs = 'mkfs.ext2 -j -b 4096' else: print 'unsupported fs type: ', fstype if not is_block(dev): force = '-F' else: force = '' (ret, out) = run (mkfs, force, dev) if ret: panic("Unable to build fs:", dev) # enable hash tree indexing on fs if fstype == 'extN': htree = 'echo "feature FEATURE_C5" | debugfs -w' (ret, out) = run (htree, dev) if ret: panic("Unable to enable htree:", dev) # some systems use /dev/loopN, some /dev/loop/N def loop_base(): import re loop = '/dev/loop' if not os.access(loop + str(0), os.R_OK): loop = loop + '/' if not os.access(loop + str(0), os.R_OK): panic ("can't access loop devices") return loop # find loop device assigned to thefile def find_loop(file): loop = loop_base() for n in xrange(0, MAX_LOOP_DEVICES): dev = loop + str(n) if os.access(dev, os.R_OK): (stat, out) = run('losetup', dev) if (out and stat == 0): m = re.search(r'\((.*)\)', out[0]) if m and file == m.group(1): return dev else: break return '' # create file if necessary and assign the first free loop device def init_loop(file, size, fstype): dev = find_loop(file) if dev: print 'WARNING file:', file, 'already mapped to', dev return dev if not os.access(file, os.R_OK | os.W_OK): run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, file)) loop = loop_base() # find next free loop for n in xrange(0, MAX_LOOP_DEVICES): dev = loop + str(n) if os.access(dev, os.R_OK): (stat, out) = run('losetup', dev) if (stat): run('losetup', dev, file) return dev else: print "out of loop devices" return '' print "out of loop devices" return '' # undo loop assignment def clean_loop(file): dev = find_loop(file) if dev: ret, out = run('losetup -d', dev) if ret: log('unable to clean loop device:', dev, 'for file:', file) logall(out) # determine if dev is formatted as a filesystem def 
need_format(fstype, dev): # FIXME don't know how to implement this return 0 # initialize a block device if needed def block_dev(dev, size, fstype, format): if config.noexec(): return dev if not is_block(dev): dev = init_loop(dev, size, fstype) if config.reformat() or (need_format(fstype, dev) and format == 'yes'): mkfs(fstype, dev) # else: # panic("device:", dev, # "not prepared, and autoformat is not set.\n", # "Rerun with --reformat option to format ALL filesystems") return dev def get_local_address(net_type): """Return the local address for the network type.""" local = "" if net_type == 'tcp': # host `hostname` host = socket.gethostname() local = socket.gethostbyname(host) elif net_type == 'elan': # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position' try: fp = open('/proc/elan/device0/position', 'r') lines = fp.readlines() fp.close() for l in lines: a = string.split(l) if a[0] == 'NodeId': local = a[1] break except IOError, e: log(e) elif net_type == 'gm': fixme("automatic local address for GM") return local # ============================================================ # Classes to prepare and cleanup the various objects # class Module: """ Base class for the rest of the modules. The default cleanup method is defined here, as well as some utilitiy funcs. 
""" def __init__(self, module_name, dom_node): self.dom_node = dom_node self.module_name = module_name self.name = get_attr(dom_node, 'name') self.uuid = get_attr(dom_node, 'uuid') self.kmodule_list = [] self._server = None self._connected = 0 def info(self, *args): msg = string.join(map(str,args)) print self.module_name + ":", self.name, self.uuid, msg def lookup_server(self, srv_uuid): """ Lookup a server's network information """ net = get_ost_net(self.dom_node.parentNode, srv_uuid) self._server = Network(net) def get_server(self): return self._server def cleanup(self): """ default cleanup, used for most modules """ self.info() srv = self.get_server() if srv and local_net(srv): try: lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid) except CommandError, e: log(self.module_name, "disconnect failed: ", self.name) e.dump() try: lctl.cleanup(self.name, self.uuid) except CommandError, e: log(self.module_name, "cleanup failed: ", self.name) e.dump() def add_module(self, dev_dir, modname): """Append a module to list of modules to load.""" self.kmodule_list.append((dev_dir, modname)) def mod_loaded(self, modname): """Check if a module is already loaded. 
Look in /proc/modules for it.""" fp = open('/proc/modules') lines = fp.readlines() fp.close() # please forgive my tired fingers for this one ret = filter(lambda word, mod=modname: word == mod, map(lambda line: string.split(line)[0], lines)) return ret def load_module(self): """Load all the modules in the list in the order they appear.""" for dev_dir, mod in self.kmodule_list: # (rc, out) = run ('/sbin/lsmod | grep -s', mod) if self.mod_loaded(mod) and not config.noexec(): continue log ('loading module:', mod) if config.src_dir(): module = find_module(config.src_dir(),dev_dir, mod) if not module: panic('module not found:', mod) (rc, out) = run('/sbin/insmod', module) if rc: raise CommandError('insmod', out, rc) else: (rc, out) = run('/sbin/modprobe', mod) if rc: raise CommandError('modprobe', out, rc) def cleanup_module(self): """Unload the modules in the list in reverse order.""" rev = self.kmodule_list rev.reverse() for dev_dir, mod in rev: if not self.mod_loaded(mod): continue log('unloading module:', mod) if config.noexec(): continue (rc, out) = run('/sbin/rmmod', mod) if rc: log('! 
unable to unload module:', mod) logall(out) class Network(Module): def __init__(self,dom_node): Module.__init__(self, 'NETWORK', dom_node) self.net_type = get_attr(dom_node,'type') self.nid = get_text(dom_node, 'server', '*') self.port = get_text_int(dom_node, 'port', 0) self.send_mem = get_text_int(dom_node, 'send_mem', 65536) self.recv_mem = get_text_int(dom_node, 'recv_mem', 65536) if self.nid == '*': self.nid = get_local_address(self.net_type) if not self.nid: panic("unable to set nid for", self.net_type) self.add_module('portals/linux/oslib/', 'portals') if node_needs_router(): self.add_module('portals/linux/router', 'kptlrouter') if self.net_type == 'tcp': self.add_module('portals/linux/socknal', 'ksocknal') if self.net_type == 'elan': self.add_module('portals/linux/rqswnal', 'kqswnal') if self.net_type == 'gm': self.add_module('portals/linux/gmnal', 'kgmnal') self.add_module('lustre/obdclass', 'obdclass') self.add_module('lustre/ptlrpc', 'ptlrpc') def prepare(self): self.info(self.net_type, self.nid, self.port) if self.net_type == 'tcp': ret = run_daemon(TCP_ACCEPTOR, '-s', self.send_mem, '-r', self.recv_mem, self.port) if ret: raise CommandError(TCP_ACCEPTOR, 'failed', ret) ret = self.dom_node.getElementsByTagName('route_tbl') for a in ret: for r in a.getElementsByTagName('route'): net_type = get_attr(r, 'type') gw = get_attr(r, 'gw') lo = get_attr(r, 'lo') hi = get_attr(r,'hi', '') lctl.add_route(net_type, gw, lo, hi) if self.net_type == 'tcp' and hi == '': srv = nid2server(self.dom_node.parentNode.parentNode, lo) if not srv: panic("no server for nid", lo) else: lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem) lctl.network(self.net_type, self.nid) lctl.newdev(attach = "ptlrpc RPCDEV") def cleanup(self): self.info(self.net_type, self.nid, self.port) ret = self.dom_node.getElementsByTagName('route_tbl') for a in ret: for r in a.getElementsByTagName('route'): lo = get_attr(r, 'lo') hi = get_attr(r,'hi', '') if self.net_type 
== 'tcp' and hi == '': srv = nid2server(self.dom_node.parentNode.parentNode, lo) if not srv: panic("no server for nid", lo) else: try: lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid) except CommandError, e: print "disconnect failed: ", self.name e.dump() try: lctl.del_route(self.net_type, self.nid, lo, hi) except CommandError, e: print "del_route failed: ", self.name e.dump() try: lctl.cleanup("RPCDEV", "") except CommandError, e: print "cleanup failed: ", self.name e.dump() try: lctl.disconnectAll(self.net_type) except CommandError, e: print "disconnectAll failed: ", self.name e.dump() if self.net_type == 'tcp': # yikes, this ugly! need to save pid in /var/something run("killall acceptor") class LDLM(Module): def __init__(self,dom_node): Module.__init__(self, 'LDLM', dom_node) self.add_module('lustre/ldlm', 'ldlm') def prepare(self): self.info() lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid), setup ="") class LOV(Module): def __init__(self,dom_node): Module.__init__(self, 'LOV', dom_node) self.mdsuuid = get_first_ref(dom_node, 'mds') mds= lookup(dom_node.parentNode, self.mdsuuid) self.mdsname = getName(mds) devs = dom_node.getElementsByTagName('devices') if len(devs) > 0: dev_node = devs[0] self.stripe_sz = get_attr_int(dev_node, 'stripesize', 65536) self.stripe_off = get_attr_int(dev_node, 'stripeoffset', 0) self.pattern = get_attr_int(dev_node, 'pattern', 0) self.devlist = get_all_refs(dev_node, 'osc') self.stripe_cnt = len(self.devlist) def prepare(self): self.info(self.mdsuuid, self.stripe_cnt, self.stripe_sz, self.stripe_off, self.pattern, self.devlist, self.mdsname) lctl.lovconfig(self.uuid, self.mdsname, self.stripe_cnt, self.stripe_sz, self.stripe_off, self.pattern, string.join(self.devlist)) class MDS(Module): def __init__(self,dom_node): Module.__init__(self, 'MDS', dom_node) self.devname, self.size = get_device(dom_node) self.fstype = get_text(dom_node, 'fstype') self.format = get_text(dom_node, 'autoformat', "no") if self.fstype == 
'extN': self.add_module('lustre/extN', 'extN') self.add_module('lustre/mds', 'mds') self.add_module('lustre/mds', 'mds_%s' % (self.fstype)) def prepare(self): self.info(self.devname, self.fstype, self.format) blkdev = block_dev(self.devname, self.size, self.fstype, self.format) lctl.newdev(attach="mds %s %s" % (self.name, self.uuid), setup ="%s %s" %(blkdev, self.fstype)) def cleanup(self): Module.cleanup(self) clean_loop(self.devname) class MDC(Module): def __init__(self,dom_node): Module.__init__(self, 'MDC', dom_node) self.mds_uuid = get_first_ref(dom_node, 'mds') self.lookup_server(self.mds_uuid) self.add_module('lustre/mdc', 'mdc') def prepare(self): self.info(self.mds_uuid) srv = self.get_server() lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem) lctl.newdev(attach="mdc %s %s" % (self.name, self.uuid), setup ="%s %s" %(self.mds_uuid, srv.uuid)) class OBD(Module): def __init__(self, dom_node): Module.__init__(self, 'OBD', dom_node) self.obdtype = get_attr(dom_node, 'type') self.devname, self.size = get_device(dom_node) self.fstype = get_text(dom_node, 'fstype') self.format = get_text(dom_node, 'autoformat', 'yes') if self.fstype == 'extN': self.add_module('lustre/extN', 'extN') self.add_module('lustre/' + self.obdtype, self.obdtype) # need to check /proc/mounts and /etc/mtab before # formatting anything. # FIXME: check if device is already formatted. 
def prepare(self): self.info(self.obdtype, self.devname, self.size, self.fstype, self.format) if self.obdtype == 'obdecho': blkdev = '' else: blkdev = block_dev(self.devname, self.size, self.fstype, self.format) lctl.newdev(attach="%s %s %s" % (self.obdtype, self.name, self.uuid), setup ="%s %s" %(blkdev, self.fstype)) def cleanup(self): Module.cleanup(self) if not self.obdtype == 'obdecho': clean_loop(self.devname) class OST(Module): def __init__(self,dom_node): Module.__init__(self, 'OST', dom_node) self.obd_uuid = get_first_ref(dom_node, 'obd') self.add_module('lustre/ost', 'ost') def prepare(self): self.info(self.obd_uuid) lctl.newdev(attach="ost %s %s" % (self.name, self.uuid), setup ="%s" % (self.obd_uuid)) class OSC(Module): def __init__(self,dom_node): Module.__init__(self, 'OSC', dom_node) self.obd_uuid = get_first_ref(dom_node, 'obd') self.ost_uuid = get_first_ref(dom_node, 'ost') self.lookup_server(self.ost_uuid) self.add_module('lustre/osc', 'osc') def prepare(self): self.info(self.obd_uuid, self.ost_uuid) srv = self.get_server() if local_net(srv): lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem) else: r = find_route(srv) if r: lctl.add_route_host(r[0], srv.uuid, r[1], r[2]) else: panic ("no route to", srv.nid) lctl.newdev(attach="osc %s %s" % (self.name, self.uuid), setup ="%s %s" %(self.obd_uuid, srv.uuid)) class Mountpoint(Module): def __init__(self,dom_node): Module.__init__(self, 'MTPT', dom_node) self.path = get_text(dom_node, 'path') self.mdc_uuid = get_first_ref(dom_node, 'mdc') self.lov_uuid = get_first_ref(dom_node, 'osc') self.add_module('lustre/osc', 'osc') # should add lov only if needed self.add_module('lustre/lov', 'lov') self.add_module('lustre/llite', 'llite') def prepare(self): l = lookup(self.dom_node.parentNode, self.lov_uuid) if l.nodeName == 'lov': lov = LOV(l) for osc_uuid in lov.devlist: osc = lookup(self.dom_node.parentNode, osc_uuid) if osc: n = OSC(osc) n.prepare() else: panic('osc not 
found:', osc_uuid) lctl.newdev(attach="lov %s %s" % (lov.name, lov.uuid), setup ="%s" % (self.mdc_uuid)) else: osc = OSC(l) osc.prepare() self.info(self.path, self.mdc_uuid,self.lov_uuid) cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \ (self.lov_uuid, self.mdc_uuid, self.path) run("mkdir", self.path) ret, val = run(cmd) if ret: panic("mount failed:", self.path) def cleanup(self): self.info(self.path, self.mdc_uuid,self.lov_uuid) (rc, out) = run("umount", self.path) if rc: log("umount failed, cleanup will most likely not work.") l = lookup(self.dom_node.parentNode, self.lov_uuid) if l.nodeName == 'lov': lov = LOV(l) for osc_uuid in lov.devlist: osc = lookup(self.dom_node.parentNode, osc_uuid) if osc: n = OSC(osc) n.cleanup() else: panic('osc not found:', osc_uuid) else: osc = OSC(l) osc.cleanup() # ============================================================ # XML processing and query # TODO: Change query funcs to use XPath, which is muc cleaner def get_device(obd): list = obd.getElementsByTagName('device') if len(list) > 0: dev = list[0] dev.normalize(); size = get_attr_int(dev, 'size', 0) return dev.firstChild.data, size return '', 0 # Get the text content from the first matching child # If there is no content (or it is all whitespace), return # the default def get_text(dom_node, tag, default=""): list = dom_node.getElementsByTagName(tag) if len(list) > 0: dom_node = list[0] dom_node.normalize() if dom_node.firstChild: txt = string.strip(dom_node.firstChild.data) if txt: return txt return default def get_text_int(dom_node, tag, default=0): list = dom_node.getElementsByTagName(tag) n = default if len(list) > 0: dom_node = list[0] dom_node.normalize() if dom_node.firstChild: txt = string.strip(dom_node.firstChild.data) if txt: try: n = int(txt) except ValueError: panic("text value is not integer:", txt) return n def get_attr(dom_node, attr, default=""): v = dom_node.getAttribute(attr) if v: return v return default def get_attr_int(dom_node, attr, 
default=0): n = default v = dom_node.getAttribute(attr) if v: try: n = int(v) except ValueError: panic("attr value is not integer", v) return n def get_first_ref(dom_node, tag): """ Get the first uuidref of the type TAG. Used one only one is expected. Returns the uuid.""" uuid = None refname = '%s_ref' % tag list = dom_node.getElementsByTagName(refname) if len(list) > 0: uuid = getRef(list[0]) return uuid def get_all_refs(dom_node, tag): """ Get all the refs of type TAG. Returns list of uuids. """ uuids = [] refname = '%s_ref' % tag list = dom_node.getElementsByTagName(refname) if len(list) > 0: for i in list: uuids.append(getRef(i)) return uuids def get_ost_net(dom_node, uuid): ost = lookup(dom_node, uuid) uuid = get_first_ref(ost, 'network') if not uuid: return None return lookup(dom_node, uuid) def nid2server(dom_node, nid): netlist = dom_node.getElementsByTagName('network') for net_node in netlist: if get_text(net_node, 'server') == nid: return Network(net_node) return None def lookup(dom_node, uuid): for n in dom_node.childNodes: if n.nodeType == n.ELEMENT_NODE: if getUUID(n) == uuid: return n else: n = lookup(n, uuid) if n: return n return None # Get name attribute of dom_node def getName(dom_node): return dom_node.getAttribute('name') def getRef(dom_node): return dom_node.getAttribute('uuidref') # Get name attribute of dom_node def getUUID(dom_node): return dom_node.getAttribute('uuid') # the tag name is the service type # fixme: this should do some checks to make sure the dom_node is a service def getServiceType(dom_node): return dom_node.nodeName # # determine what "level" a particular node is at. # the order of iniitailization is based on level. 
def getServiceLevel(dom_node):
    """Return the start-up level of a service node (0 for unknown types).

    Services are started in increasing level order and cleaned up in
    decreasing order.
    """
    svc_type = getServiceType(dom_node)
    if svc_type in ('network',): return 10
    elif svc_type in ('device', 'ldlm'): return 20
    elif svc_type in ('obd', 'mdd'): return 30
    elif svc_type in ('mds','ost'): return 40
    elif svc_type in ('mdc','osc'): return 50
    elif svc_type in ('lov',): return 60
    elif svc_type in ('mountpoint',): return 70
    return 0


#
# return list of services in a profile. list is a list of tuples
# [(level, dom_node),]
def getServices(lustreNode, profileNode):
    """Return the profile's services as [(level, dom_node), ...] by level.

    Services of equal level keep the order they have in the profile.  A
    reference that cannot be resolved is reported through panic() and,
    when panic() returns (noexec mode), skipped.
    """
    decorated = []
    for n in profileNode.childNodes:
        if n.nodeType == n.ELEMENT_NODE:
            servNode = lookup(lustreNode, getRef(n))
            if not servNode:
                print(n)
                panic('service not found: ' + getRef(n))
                continue
            level = getServiceLevel(servNode)
            # The running index breaks ties, so equal levels are never
            # ordered by comparing the DOM nodes themselves.
            decorated.append((level, len(decorated), servNode))
    decorated.sort()
    return [(level, node) for (level, idx, node) in decorated]


def getByName(lustreNode, name, tag):
    """Return the first TAG element whose name attribute is NAME, or None."""
    ndList = lustreNode.getElementsByTagName(tag)
    for nd in ndList:
        if getName(nd) == name:
            return nd
    return None


############################################################
# routing ("rooting")
#
routes = []          # (type, gw, lo, hi) tuples, see init_route_config()
local_node = []      # (type, nid) of each network on the local node
router_flag = 0      # set once the configuration involves a router

def init_node(dom_node):
    """Record the (type, server) of every network of the local node."""
    global local_node, router_flag
    netlist = dom_node.getElementsByTagName('network')
    for dom_net in netlist:
        net_type = get_attr(dom_net, 'type')
        gw = get_text(dom_net, 'server')
        local_node.append((net_type, gw))


def node_needs_router():
    """Return true when the router module has to be loaded."""
    return router_flag


def get_routes(type, gw, dom_net):
    """ Return the routes as a list of tuples of the form:
        [(type, gw, lo, hi),]"""
    res = []
    tbl = dom_net.getElementsByTagName('route_tbl')
    for t in tbl:
        for r in t.getElementsByTagName('route'):
            lo = get_attr(r, 'lo')
            hi = get_attr(r, 'hi', '')
            res.append((type, gw, lo, hi))
    return res


def init_route_config(lustre):
    """ Scan the lustre config looking for routers.  Build list of
    routes. """
    global routes, router_flag
    routes = []
    for node in lustre.getElementsByTagName('node'):
        if get_attr(node, 'router'):
            router_flag = 1
            for (local_type, local_nid) in local_node:
                # the gateway is the router's interface on our network type
                gw = None
                netlist = node.getElementsByTagName('network')
                for dom_net in netlist:
                    if local_type == get_attr(dom_net, 'type'):
                        gw = get_text(dom_net, 'server')
                        break
                if not gw:
                    continue
                # routes come from the router's other interfaces
                for dom_net in netlist:
                    if local_type != get_attr(dom_net, 'type'):
                        for route in get_routes(local_type, gw, dom_net):
                            routes.append(route)


def local_net(net):
    """Return 1 when the local node has a network of NET's type, else 0."""
    global local_node
    for iface in local_node:
        if net.net_type == iface[0]:
            return 1
    return 0


def find_route(net):
    """Return the first route whose 'lo' nid equals NET's nid, or None."""
    global local_node, routes
    # local_node is empty on a router node, so it must not be indexed here.
    to_type = net.net_type
    to = net.nid
    debug ('looking for route to', to_type,to)
    for r in routes:
        if r[2] == to:
            return r
    return None


############################################################
# lconf level logic
# Start a service.
def startService(dom_node, module_flag):
    """Run one step for the service described by DOM_NODE.

    With MODULE_FLAG set the step is loading (or, in cleanup mode,
    unloading) kernel modules; otherwise it is preparing (or cleaning up)
    the service itself.  --nomod / --nosetup skip the respective step.
    """
    svc_type = getServiceType(dom_node)
    debug('Service:', svc_type, getName(dom_node), getUUID(dom_node))
    # service type -> class implementing it
    handlers = {
        'ldlm': LDLM,
        'lov': LOV,
        'network': Network,
        'obd': OBD,
        'ost': OST,
        'mds': MDS,
        'osc': OSC,
        'mdc': MDC,
        'mountpoint': Mountpoint,
    }
    if not handlers.has_key(svc_type) if hasattr(handlers, 'has_key') else svc_type not in handlers:
        panic ("unknown service type:", svc_type)
        # panic() returns in noexec mode; there is nothing to run
        return
    n = handlers[svc_type](dom_node)

    if module_flag:
        if config.nomod():
            return
        if config.cleanup():
            n.cleanup_module()
        else:
            n.load_module()
    else:
        if config.nosetup():
            return
        if config.cleanup():
            n.cleanup()
        else:
            n.prepare()

#
# Prepare the system to run lustre using a particular profile
# in the configuration.
# * load & the modules # * setup networking for the current node # * make sure partitions are in place and prepared # * initialize devices with lctl # Levels is important, and needs to be enforced. def startProfile(lustreNode, profileNode, module_flag): if not profileNode: panic("profile:", profile, "not found.") services = getServices(lustreNode, profileNode) if config.cleanup(): services.reverse() for s in services: startService(s[1], module_flag) # # Load profile for def doHost(lustreNode, hosts): global routes dom_node = None for h in hosts: dom_node = getByName(lustreNode, h, 'node') if dom_node: break if not dom_node: print 'No host entry found.' return if not get_attr(dom_node, 'router'): init_node(dom_node) init_route_config(lustreNode) else: global router_flag router_flag = 1 # Two step process: (1) load modules, (2) setup lustre # if not cleaning, load modules first. module_flag = not config.cleanup() reflist = dom_node.getElementsByTagName('profile') for profile in reflist: startProfile(lustreNode, profile, module_flag) if not config.cleanup(): sys_set_debug_path() script = config.gdb_script() run(lctl.lctl, ' modules >', script) if config.gdb(): # dump /tmp/ogdb and sleep/pause here log ("The GDB module script is in", script) time.sleep(5) module_flag = not module_flag for profile in reflist: startProfile(lustreNode, profile, module_flag) ############################################################ # Command line processing # def parse_cmdline(argv): short_opts = "hdnv" long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb", "portals=", "makeldiff", "cleanup", "noexec", "help", "node=", "get=", "nomod", "nosetup"] opts = [] args = [] try: opts, args = getopt.getopt(argv, short_opts, long_opts) except getopt.error: print "invalid opt" usage() for o, a in opts: if o in ("-h", "--help"): usage() if o in ("-d","--cleanup"): config.cleanup(1) if o in ("-v", "--verbose"): config.verbose(1) if o in ("-n", "--noexec"): config.noexec(1) config.verbose(1) 
if o == "--portals": config.portals = a if o == "--lustre": config.lustre = a if o == "--reformat": config.reformat(1) if o == "--node": config.node(a) if o == "--get": config.url(a) if o == "--gdb": config.gdb(1) if o == "--nomod": config.nomod(1) if o == "--nosetup": config.nosetup(1) return args def fetch(url): import urllib data = "" try: s = urllib.urlopen(url) data = s.read() except: usage() return data def setupModulePath(cmd): base = os.path.dirname(cmd) if os.access(base+"/Makefile", os.R_OK): config.src_dir(base + "/../../") def sys_set_debug_path(): debug("debug path: ", config.debug_path()) if config.noexec(): return try: fp = open('/proc/sys/portals/debug_path', 'w') fp.write(config.debug_path()) fp.close() except IOError, e: print e #/proc/sys/net/core/rmem_max #/proc/sys/net/core/wmem_max def sys_set_netmem_max(path, max): debug("setting", path, "to at least", max) if config.noexec(): return fp = open(path) str = fp.readline() fp.close cur = int(str) if max > cur: fp = open(path, 'w') fp.write('%d\n' %(max)) fp.close() def sys_make_devices(): if not os.access('/dev/portals', os.R_OK): run('mknod /dev/portals c 10 240') if not os.access('/dev/obd', os.R_OK): run('mknod /dev/obd c 10 241') # Initialize or shutdown lustre according to a configuration file # * prepare the system for lustre # * configure devices with lctl # Shutdown does steps in reverse # def main(): global TCP_ACCEPTOR, lctl, MAXTCPBUF host = socket.gethostname() args = parse_cmdline(sys.argv[1:]) if len(args) > 0: if not os.access(args[0], os.R_OK | os.W_OK): print 'File not found:', args[0] sys.exit(1) dom = xml.dom.minidom.parse(args[0]) elif config.url(): xmldata = fetch(config.url()) dom = xml.dom.minidom.parseString(xmldata) else: usage() node_list = [] if config.node(): node_list.append(config.node()) else: if len(host) > 0: node_list.append(host) node_list.append('localhost') debug("configuring for host: ", node_list) if len(host) > 0: config._debug_path = '/tmp/lustre-log-' + 
host TCP_ACCEPTOR = find_prog('acceptor') if not TCP_ACCEPTOR: if config.noexec(): TCP_ACCEPTOR = 'acceptor' debug('! acceptor not found') else: panic('acceptor not found') lctl = LCTLInterface('lctl') setupModulePath(sys.argv[0]) sys_make_devices() sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF) sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF) doHost(dom.documentElement, node_list) if __name__ == "__main__": try: main() except LconfError, e: print e except CommandError, e: e.dump()