#!/usr/bin/env python # # Copyright (C) 2002 Cluster File Systems, Inc. # Author: Robert Read # This file is part of Lustre, http://www.lustre.org. # # Lustre is free software; you can redistribute it and/or # modify it under the terms of version 2 of the GNU General Public # License as published by the Free Software Foundation. # # Lustre is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Lustre; if not, write to the Free Software # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. # # lconf - lustre configuration tool # # lconf is the main driver script for starting and stopping # lustre filesystem services. # # Based in part on the XML obdctl modifications done by Brian Behlendorf import sys, getopt, types import string, os, stat, popen2, socket, time, random, fcntl, select import re, exceptions import xml.dom.minidom if sys.version[0] == '1': from FCNTL import F_GETFL, F_SETFL else: from fcntl import F_GETFL, F_SETFL # Global parameters MAXTCPBUF = 1048576 DEFAULT_TCPBUF = 1048576 # # Maximum number of devices to search for. # (the /dev/loop* nodes need to be created beforehand) MAX_LOOP_DEVICES = 256 PORTALS_DIR = '@PORTALSLOC@' first_cleanup_error = 0 def cleanup_error(rc): global first_cleanup_error if not first_cleanup_error: first_cleanup_error = rc def usage(): print """usage: lconf config.xml config.xml Lustre configuration in xml format. --ldapurl LDAP server URL, eg. ldap://localhost --config Cluster config name used for LDAP query --node Load config for --select service=nodeA,service2=nodeB U -d | --cleanup Cleans up config. 
(Shutdown) -f | --force Forced unmounting and/or obd detach during cleanup -v | --verbose Print system commands as they are run -h | --help Print this help --gdb Prints message after creating gdb module script and sleeps for 5 seconds. -n | --noexec Prints the commands and steps that will be run for a config without executing them. This can used to check if a config file is doing what it should be doing. (Implies -v) --nomod Skip load/unload module step. --nosetup Skip device setup/cleanup step. --reformat Reformat all devices (without question) --dump Dump the kernel debug log before portals is unloaded --minlevel Specify the minimum level of services to configure/cleanup (default 0) --maxlevel Specify the maximum level of services to configure/cleanup (default 100) Levels are aproximatly like: 10 - network 20 - device, ldlm 30 - osd, mdd 40 - mds, ost 50 - mdc, osc 60 - lov 70 - mountpoint, echo_client --lustre=src_dir Base directory of lustre sources. This parameter will cause lconf to load modules from a source tree. --portals=src_dir Portals source directory. If this is a relative path, then it is assumed to be relative to lustre. 
""" TODO = """ --ldap server LDAP server with lustre config database --makeldiff Translate xml source to LDIFF This are perhaps not needed: """ sys.exit() # ============================================================ # Config parameters, encapsulated in a class class Config: def __init__(self): # flags self._noexec = 0 self._verbose = 0 self._reformat = 0 self._cleanup = 0 self._gdb = 0 self._nomod = 0 self._nosetup = 0 self._force = 0 # parameters self._modules = None self._node = None self._url = None self._gdb_script = '/tmp/ogdb' self._debug_path = '/tmp/lustre-log' self._dump_file = None self._lustre_dir = '' self._portals_dir = '' self._minlevel = 0 self._maxlevel = 100 self._timeout = 0 self._recovery_upcall = '' self._ldapurl = '' self._config_name = '' self._select = {} self._lctl_dump = '' def verbose(self, flag = None): if flag: self._verbose = flag return self._verbose def noexec(self, flag = None): if flag: self._noexec = flag return self._noexec def reformat(self, flag = None): if flag: self._reformat = flag return self._reformat def cleanup(self, flag = None): if flag: self._cleanup = flag return self._cleanup def gdb(self, flag = None): if flag: self._gdb = flag return self._gdb def nomod(self, flag = None): if flag: self._nomod = flag return self._nomod def nosetup(self, flag = None): if flag: self._nosetup = flag return self._nosetup def force(self, flag = None): if flag: self._force = flag return self._force def node(self, val = None): if val: self._node = val return self._node def gdb_script(self): if os.path.isdir('/r'): return '/r' + self._gdb_script else: return self._gdb_script def debug_path(self): if os.path.isdir('/r'): return '/r' + self._debug_path else: return self._debug_path def dump_file(self, val = None): if val: self._dump_file = val return self._dump_file def minlevel(self, val = None): if val: self._minlevel = int(val) return self._minlevel def maxlevel(self, val = None): if val: self._maxlevel = int(val) return self._maxlevel 
def portals_dir(self, val = None): if val: self._portals_dir = val return self._portals_dir def lustre_dir(self, val = None): if val: self._lustre_dir = val return self._lustre_dir def timeout(self, val = None): if val: self._timeout = val return self._timeout def recovery_upcall(self, val = None): if val: self._recovery_upcall = val return self._recovery_upcall def ldapurl(self, val = None): if val: self._ldapurl = val return self._ldapurl def config_name(self, val = None): if val: self._config_name = val return self._config_name def init_select(self, arg): # arg = "service=nodeA,service2=nodeB" list = string.split(arg, ',') for entry in list: srv, node = string.split(entry, '=') self._select[srv] = node def select(self, srv): if self._select.has_key(srv): return self._select[srv] return None def lctl_dump(self, val = None): if val: self._lctl_dump = val return self._lctl_dump config = Config() # ============================================================ # debugging and error funcs def fixme(msg = "this feature"): raise LconfError, msg + ' not implmemented yet.' def panic(*args): msg = string.join(map(str,args)) if not config.noexec(): raise LconfError(msg) else: print "! " + msg def log(*args): msg = string.join(map(str,args)) print msg def logall(msgs): for s in msgs: print string.strip(s) def debug(*args): if config.verbose(): msg = string.join(map(str,args)) print msg # ============================================================ # locally defined exceptions class CommandError (exceptions.Exception): def __init__(self, cmd_name, cmd_err, rc=None): self.cmd_name = cmd_name self.cmd_err = cmd_err self.rc = rc def dump(self): import types if type(self.cmd_err) == types.StringType: if self.rc: print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err) else: print "! %s: %s" % (self.cmd_name, self.cmd_err) elif type(self.cmd_err) == types.ListType: if self.rc: print "! %s (error %d):" % (self.cmd_name, self.rc) else: print "! 
%s:" % (self.cmd_name) for s in self.cmd_err: print "> %s" %(string.strip(s)) else: print self.cmd_err class LconfError (exceptions.Exception): def __init__(self, args): self.args = args # ============================================================ # handle daemons, like the acceptor class DaemonHandler: """ Manage starting and stopping a daemon. Assumes daemon manages it's own pid file. """ def __init__(self, cmd): self.command = cmd self.path ="" def start(self): if self.running(): log(self.command, "already running.") if not self.path: self.path = find_prog(self.command) if not self.path: panic(self.command, "not found.") ret, out = runcmd(self.path +' '+ self.command_line()) if ret: raise CommandError(self.path, out, ret) def stop(self): if self.running(): pid = self.read_pidfile() try: log ("killing process", pid) os.kill(pid, 15) #time.sleep(1) # let daemon die except OSError, e: log("unable to kill", self.command, e) if self.running(): log("unable to kill", self.command) def running(self): pid = self.read_pidfile() if pid: try: os.kill(pid, 0) except OSError: self.clean_pidfile() else: return 1 return 0 def read_pidfile(self): try: fp = open(self.pidfile(), 'r') pid = int(fp.read()) fp.close() return pid except IOError: return 0 def clean_pidfile(self): """ Remove a stale pidfile """ log("removing stale pidfile:", self.pidfile()) try: os.unlink(self.pidfile()) except OSError, e: log(self.pidfile(), e) class AcceptorHandler(DaemonHandler): def __init__(self, port, net_type, send_mem, recv_mem, irq_aff, nid_xchg): DaemonHandler.__init__(self, "acceptor") self.port = port self.flags = '' self.send_mem = send_mem self.recv_mem = recv_mem if net_type == 'toe': self.flags = self.flags + ' -N 4' if irq_aff: self.flags = self.flags + ' -i' if nid_xchg: self.flags = self.flags + ' -x' def pidfile(self): return "/var/run/%s-%d.pid" % (self.command, self.port) def command_line(self): return string.join(map(str,('-s', self.send_mem, '-r', self.recv_mem, self.flags, 
self.port))) acceptors = {} # start the acceptors def run_acceptors(): for port in acceptors.keys(): daemon = acceptors[port] if not daemon.running(): daemon.start() def stop_acceptor(port): if acceptors.has_key(port): daemon = acceptors[port] if daemon.running(): daemon.stop() # ============================================================ # handle lctl interface class LCTLInterface: """ Manage communication with lctl """ def __init__(self, cmd): """ Initialize close by finding the lctl binary. """ self.lctl = find_prog(cmd) self.save_file = '' if not self.lctl: if config.noexec(): debug('! lctl not found') self.lctl = 'lctl' else: raise CommandError('lctl', "unable to find lctl binary.") def use_save_file(self, file): self.save_file = file def set_nonblock(self, fd): fl = fcntl.fcntl(fd, F_GETFL) fcntl.fcntl(fd, F_SETFL, fl | os.O_NDELAY) def run(self, cmds): """ run lctl the cmds are written to stdin of lctl lctl doesn't return errors when run in script mode, so stderr is checked should modify command line to accept multiple commands, or create complex command line options """ cmd_line = self.lctl if self.save_file: cmds = '\n dump ' + self.save_file + cmds debug("+", cmd_line, cmds) if config.noexec(): return (0, []) child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command child.tochild.write(cmds + "\n") child.tochild.close() # From "Python Cookbook" from O'Reilly outfile = child.fromchild outfd = outfile.fileno() self.set_nonblock(outfd) errfile = child.childerr errfd = errfile.fileno() self.set_nonblock(errfd) outdata = errdata = '' outeof = erreof = 0 while 1: ready = select.select([outfd,errfd],[],[]) # Wait for input if outfd in ready[0]: outchunk = outfile.read() if outchunk == '': outeof = 1 outdata = outdata + outchunk if errfd in ready[0]: errchunk = errfile.read() if errchunk == '': erreof = 1 errdata = errdata + errchunk if outeof and erreof: break # end of "borrowed" code ret = child.wait() if os.WIFEXITED(ret): rc = 
os.WEXITSTATUS(ret) else: rc = 0 if rc or len(errdata): raise CommandError(self.lctl, errdata, rc) return rc, outdata def runcmd(self, *args): """ run lctl using the command line """ cmd = string.join(map(str,args)) debug("+", self.lctl, cmd) rc, out = run(self.lctl, cmd) if rc: raise CommandError(self.lctl, out, rc) return rc, out def network(self, net, nid): """ initialized network and add "self" """ # Idea: "mynid" could be used for all network types to add "self," and then # this special case would be gone and the "self" hack would be hidden. if net in ('tcp', 'toe'): cmds = """ network %s mynid %s quit """ % (net, nid) self.run(cmds) # create a new connection def connect(self, srv): cmds = "\n add_uuid %s %s %s" % (srv.uuid, srv.nid, srv.net_type) if srv.net_type in ('tcp', 'toe') and not config.lctl_dump(): flags = '' if srv.irq_affinity: flags = flags + 'i' if srv.nid_exchange: flags = flags + 'x' cmds = """%s network %s send_mem %d recv_mem %d connect %s %d %s""" % (cmds, srv.net_type, srv.send_mem, srv.recv_mem, srv.hostaddr, srv.port, flags ) cmds = cmds + "\n quit" self.run(cmds) # add a route to a range def add_route(self, net, gw, lo, hi): cmds = """ network %s add_route %s %s %s quit """ % (net, gw, lo, hi) self.run(cmds) def del_route(self, net, gw, lo, hi): cmds = """ ignore_errors network %s del_route %s quit """ % (net, lo) self.run(cmds) # add a route to a host def add_route_host(self, net, uuid, gw, tgt): cmds = """ network %s add_uuid %s %s %s add_route %s %s quit """ % (net, uuid, tgt, net, gw, tgt) self.run(cmds) # add a route to a range def del_route_host(self, net, uuid, gw, tgt): cmds = """ ignore_errors network %s del_uuid %s del_route %s quit """ % (net, uuid, tgt) self.run(cmds) # disconnect one connection def disconnect(self, net, nid, port, servuuid): cmds = """ ignore_errors network %s disconnect %s del_uuid %s quit""" % (net, nid, servuuid) self.run(cmds) # disconnect all def disconnectAll(self, net): cmds = """ ignore_errors 
network %s disconnect quit""" % (net) self.run(cmds) # create a new device with lctl def newdev(self, attach, setup = ""): cmds = """ newdev attach %s setup %s quit""" % (attach, setup) self.run(cmds) # cleanup a device def cleanup(self, name, uuid): cmds = """ ignore_errors device $%s cleanup %s detach quit""" % (name, ('', 'force')[config.force()]) self.run(cmds) # create an lov def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist): cmds = """ device $%s probe lov_setconfig %s %d %d %d %s %s quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist) self.run(cmds) # dump the log file def dump(self, dump_file): cmds = """ debug_kernel %s 1 quit""" % (dump_file) self.run(cmds) # get list of devices def device_list(self): rc, out = self.runcmd('device_list') return out # get lustre version def lustre_version(self): rc, out = self.runcmd('version') return out # ============================================================ # Various system-level functions # (ideally moved to their own module) # Run a command and return the output and status. # stderr is sent to /dev/null, could use popen3 to # save it if necessary def runcmd(cmd): debug ("+", cmd) if config.noexec(): return (0, []) f = os.popen(cmd + ' 2>&1') out = f.readlines() ret = f.close() if ret: ret = ret >> 8 else: ret = 0 return (ret, out) def run(*args): cmd = string.join(map(str,args)) return runcmd(cmd) # Run a command in the background. 
def run_daemon(*args):
    """Run a command line built by joining args with spaces.

    Returns the command's exit status (0 on success).  Under --noexec
    the command is only logged and 0 is returned.
    """
    # NOTE(review): closing the pipe appears to wait for the command to
    # exit, so this only runs "in the background" if the command
    # daemonizes itself -- confirm.
    cmd = string.join(map(str,args))
    debug ("+", cmd)
    if config.noexec():
        return 0
    f = os.popen(cmd + ' 2>&1')
    ret = f.close()
    if ret:
        # close() gives a wait()-style status; the exit code is the high byte.
        ret = ret >> 8
    else:
        ret = 0
    return ret

# Determine full path to use for an external command
# searches dirname(argv[0]) first, then PATH
def find_prog(cmd):
    """Return the full path of the executable cmd, or '' if not found.

    Search order: the portals utils directory (when a portals dir is
    configured), the directory of argv[0], then each entry of PATH.
    """
    # A missing PATH variable must not abort the search.
    syspath = string.split(os.environ.get('PATH', ''), ':')
    cmdpath = os.path.dirname(sys.argv[0])
    syspath.insert(0, cmdpath)
    if config.portals_dir():
        syspath.insert(0, config.portals_dir() + '/linux/utils/')
    for d in syspath:
        prog = os.path.join(d, cmd)
        # Directories also pass the X_OK test, so require a regular file.
        if os.path.isfile(prog) and os.access(prog, os.X_OK):
            return prog
    return ''

# Recursively look for file starting at base dir
def do_find_file(base, mod):
    """Return the first readable path named mod under base, or None.

    base itself is checked before its subdirectories.  Directories that
    cannot be listed are skipped instead of raising.
    """
    fullname = os.path.join(base, mod)
    if os.access(fullname, os.R_OK):
        return fullname
    try:
        entries = os.listdir(base)
    except OSError:
        # base is missing, unreadable, or not a directory
        return None
    for d in entries:
        subdir = os.path.join(base, d)
        if os.path.isdir(subdir):
            module = do_find_file(subdir, mod)
            if module:
                return module
    return None

def find_module(src_dir, dev_dir, modname):
    """Return src_dir/dev_dir/<modname>.o if it is readable, else None."""
    mod = '%s.o' % (modname)
    # Plain concatenation on purpose: some callers pass a dev_dir that
    # starts with '/', and os.path.join would then discard src_dir.
    module = src_dir +'/'+ dev_dir +'/'+ mod
    if os.access(module, os.R_OK):
        return module
    return None

# is the path a block device?
def is_block(path): s = () try: s = os.stat(path) except OSError: return 0 return stat.S_ISBLK(s[stat.ST_MODE]) # build fs according to type # fixme: dangerous def mkfs(dev, devsize, fstype): block_cnt = '' if devsize: # devsize is in 1k, and fs block count is in 4k block_cnt = devsize/4 if(fstype in ('ext3', 'extN')): mkfs = 'mkfs.ext2 -j -b 4096 -F ' elif (fstype == 'reiserfs'): mkfs = 'mkreiserfs -ff' else: print 'unsupported fs type: ', fstype (ret, out) = run (mkfs, dev, block_cnt) if ret: panic("Unable to build fs:", dev) # enable hash tree indexing on fsswe # FIXME: this check can probably go away on 2.5 if fstype == 'extN': htree = 'echo "feature FEATURE_C5" | debugfs -w' (ret, out) = run (htree, dev) if ret: panic("Unable to enable htree:", dev) # some systems use /dev/loopN, some /dev/loop/N def loop_base(): import re loop = '/dev/loop' if not os.access(loop + str(0), os.R_OK): loop = loop + '/' if not os.access(loop + str(0), os.R_OK): panic ("can't access loop devices") return loop # find loop device assigned to thefile def find_loop(file): loop = loop_base() for n in xrange(0, MAX_LOOP_DEVICES): dev = loop + str(n) if os.access(dev, os.R_OK): (stat, out) = run('losetup', dev) if (out and stat == 0): m = re.search(r'\((.*)\)', out[0]) if m and file == m.group(1): return dev else: break return '' # create file if necessary and assign the first free loop device def init_loop(file, size, fstype): dev = find_loop(file) if dev: print 'WARNING file:', file, 'already mapped to', dev return dev if config.reformat() or not os.access(file, os.R_OK | os.W_OK): if size < 8000: panic(file, "size must be larger than 8MB, currently set to:", size) (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, file)) if ret: panic("Unable to create backing store:", file) loop = loop_base() # find next free loop for n in xrange(0, MAX_LOOP_DEVICES): dev = loop + str(n) if os.access(dev, os.R_OK): (stat, out) = run('losetup', dev) if (stat): run('losetup', dev, 
file) return dev else: print "out of loop devices" return '' print "out of loop devices" return '' # undo loop assignment def clean_loop(file): dev = find_loop(file) if dev: ret, out = run('losetup -d', dev) if ret: log('unable to clean loop device:', dev, 'for file:', file) logall(out) # determine if dev is formatted as a filesystem def need_format(fstype, dev): # FIXME don't know how to implement this return 0 # initialize a block device if needed def block_dev(dev, size, fstype, format): if config.noexec(): return dev if not is_block(dev): dev = init_loop(dev, size, fstype) if config.reformat() or (need_format(fstype, dev) and format == 'yes'): mkfs(dev, size, fstype) # else: # panic("device:", dev, # "not prepared, and autoformat is not set.\n", # "Rerun with --reformat option to format ALL filesystems") return dev def if2addr(iface): """lookup IP address for an interface""" rc, out = run("/sbin/ifconfig", iface) if rc or not out: return None addr = string.split(out[1])[1] ip = string.split(addr, ':')[1] return ip def get_local_nid(net_type, wildcard): """Return the local nid. First look for an elan interface, then use the local address. 
""" local = "" if os.access('/proc/elan/device0/position', os.R_OK): local = get_local_address('elan', '*') else: local = get_local_address(net_type, wildcard) return local def get_local_address(net_type, wildcard): """Return the local address for the network type.""" local = "" if net_type in ('tcp', 'toe'): if ':' in wildcard: iface, star = string.split(wildcard, ':') local = if2addr(iface) if not local: panic ("unable to determine ip for:", wildcard) else: host = socket.gethostname() local = socket.gethostbyname(host) elif net_type == 'elan': # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position' try: fp = open('/proc/elan/device0/position', 'r') lines = fp.readlines() fp.close() for l in lines: a = string.split(l) if a[0] == 'NodeId': local = a[1] break except IOError, e: log(e) elif net_type == 'gm': fixme("automatic local address for GM") return local def is_prepared(uuid): """Return true if a device exists for the uuid""" # expect this format: # 1 UP ldlm ldlm ldlm_UUID 2 if config.lctl_dump(): return 0 try: out = lctl.device_list() for s in out: if uuid == string.split(s)[4]: return 1 except CommandError, e: e.dump() return 0 def is_network_prepared(): """If the PTLRPC device exists, then assumet that all networking has been configured""" if config.lctl_dump(): return 0 try: out = lctl.device_list() for s in out: if 'RPCDEV_UUID' == string.split(s)[4]: return 1 except CommandError, e: e.dump() return 0 def fs_is_mounted(path): """Return true if path is a mounted lustre filesystem""" try: fp = open('/proc/mounts') lines = fp.readlines() fp.close() for l in lines: a = string.split(l) if a[1] == path and a[2] == 'lustre_lite': return 1 except IOError, e: log(e) return 0 # ============================================================ # Classes to prepare and cleanup the various objects # class Module: """ Base class for the rest of the modules. The default cleanup method is defined here, as well as some utilitiy funcs. 
""" def __init__(self, module_name, db): self.db = db self.module_name = module_name self.name = self.db.getName() self.uuid = self.db.getUUID() self.kmodule_list = [] self._server = None self._connected = 0 def info(self, *args): msg = string.join(map(str,args)) print self.module_name + ":", self.name, self.uuid, msg def cleanup(self): """ default cleanup, used for most modules """ self.info() try: lctl.cleanup(self.name, self.uuid) except CommandError, e: log(self.module_name, "cleanup failed: ", self.name) e.dump() cleanup_error(e.rc) def add_portals_module(self, dev_dir, modname): """Append a module to list of modules to load.""" self.kmodule_list.append((config.portals_dir(), dev_dir, modname)) def add_lustre_module(self, dev_dir, modname): """Append a module to list of modules to load.""" self.kmodule_list.append((config.lustre_dir(), dev_dir, modname)) def mod_loaded(self, modname): """Check if a module is already loaded. Look in /proc/modules for it.""" fp = open('/proc/modules') lines = fp.readlines() fp.close() # please forgive my tired fingers for this one ret = filter(lambda word, mod=modname: word == mod, map(lambda line: string.split(line)[0], lines)) return ret def load_module(self): """Load all the modules in the list in the order they appear.""" for src_dir, dev_dir, mod in self.kmodule_list: # (rc, out) = run ('/sbin/lsmod | grep -s', mod) if self.mod_loaded(mod) and not config.noexec(): continue log ('loading module:', mod) if src_dir: module = find_module(src_dir, dev_dir, mod) if not module: panic('module not found:', mod) (rc, out) = run('/sbin/insmod', module) if rc: raise CommandError('insmod', out, rc) else: (rc, out) = run('/sbin/modprobe', mod) if rc: raise CommandError('modprobe', out, rc) def cleanup_module(self): """Unload the modules in the list in reverse order.""" rev = self.kmodule_list rev.reverse() for src_dir, dev_dir, mod in rev: if not self.mod_loaded(mod): continue # debug hack if mod == 'portals' and config.dump_file(): 
lctl.dump(config.dump_file()) log('unloading module:', mod) if config.noexec(): continue (rc, out) = run('/sbin/rmmod', mod) if rc: log('! unable to unload module:', mod) logall(out) class Network(Module): def __init__(self,db): Module.__init__(self, 'NETWORK', db) self.net_type = self.db.get_val('nettype') self.nid = self.db.get_val('nid', '*') self.port = self.db.get_val_int('port', 0) self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF) self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF) self.irq_affinity = self.db.get_val_int('irqaffinity', 0) self.nid_exchange = self.db.get_val_int('nidexchange', 0) if '*' in self.nid: self.nid = get_local_nid(self.net_type, self.nid) if not self.nid: panic("unable to set nid for", self.net_type, self.nid) debug("nid:", self.nid) self.hostaddr = self.db.get_val('hostaddr', self.nid) if '*' in self.hostaddr: self.hostaddr = get_local_address(self.net_type, self.hostaddr) if not self.nid: panic("unable to set nid for", self.net_type, self.hostaddr) debug("hostaddr:", self.hostaddr) # debug ( "hostaddr ", self.hostaddr, "net_type", self.net_type) self.add_portals_module("linux/oslib", 'portals') if node_needs_router(): self.add_portals_module("linux/router", 'kptlrouter') if self.net_type == 'tcp': self.add_portals_module("linux/socknal", 'ksocknal') if self.net_type == 'toe': self.add_portals_module("/linux/toenal", 'ktoenal') if self.net_type == 'elan': self.add_portals_module("/linux/rqswnal", 'kqswnal') if self.net_type == 'gm': self.add_portals_module("/linux/gmnal", 'kgmnal') self.add_lustre_module('obdclass', 'obdclass') def prepare(self): if is_network_prepared(): return self.info(self.net_type, self.nid, self.port) lctl.network(self.net_type, self.nid) def cleanup(self): self.info(self.net_type, self.nid, self.port) if self.net_type in ('tcp', 'toe'): stop_acceptor(self.port) try: lctl.disconnectAll(self.net_type) except CommandError, e: print "disconnectAll failed: ", self.name e.dump() 
cleanup_error(e.rc) class Router(Module): def __init__(self,db): Module.__init__(self, 'ROUTER', db) def prepare(self): if is_network_prepared(): return self.info() for net_type, gw, lo, hi in self.db.get_route_tbl(): lctl.add_route(net_type, gw, lo, hi) if net_type in ('tcp', 'toe') and local_net_type(net_type) and hi == '': srvdb = self.db.nid2server(lo, net_type) if not srvdb: panic("no server for nid", lo) else: srv = Network(srvdb) lctl.connect(srv) def cleanup(self): for net_type, gw, lo, hi in self.db.get_route_tbl(): if net_type in ('tcp', 'toe') and local_net_type(net_type) and hi == '': srvdb = self.db.nid2server(lo, net_type) if not srvdb: panic("no server for nid", lo) else: srv = Network(srvdb) try: lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid) except CommandError, e: print "disconnect failed: ", self.name e.dump() cleanup_error(e.rc) try: lctl.del_route(net_type, gw, lo, hi) except CommandError, e: print "del_route failed: ", self.name e.dump() cleanup_error(e.rc) class LDLM(Module): def __init__(self,db): Module.__init__(self, 'LDLM', db) self.add_lustre_module('ldlm', 'ldlm') def prepare(self): if is_prepared(self.uuid): return self.info() lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid)) def cleanup(self): if is_prepared(self.uuid): Module.cleanup(self) class PTLRPC(Module): def __init__(self,db): Module.__init__(self, 'PTLRPC', db) self.add_lustre_module('ptlrpc', 'ptlrpc') def prepare(self): if is_prepared(self.uuid): return self.info() lctl.newdev(attach="ptlrpc %s %s" % (self.name, self.uuid)) def cleanup(self): if is_prepared(self.uuid): Module.cleanup(self) class LOV(Module): def __init__(self,db): Module.__init__(self, 'LOV', db) self.add_lustre_module('mdc', 'mdc') self.add_lustre_module('lov', 'lov') self.mds_uuid = self.db.get_first_ref('mds') mds= self.db.lookup(self.mds_uuid) self.mds_name = mds.getName() self.stripe_sz = self.db.get_val_int('stripesize', 65536) self.stripe_off = self.db.get_val_int('stripeoffset', 
0) self.pattern = self.db.get_val_int('stripepattern', 0) self.devlist = self.db.get_refs('obd') self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist)) self.osclist = [] self.mdc_uudi = '' for obd_uuid in self.devlist: obd = self.db.lookup(obd_uuid) osc = get_osc(obd, self.name) if osc: self.osclist.append(osc) else: panic('osc not found:', obd_uuid) def prepare(self): if is_prepared(self.uuid): return for osc in self.osclist: try: # Ignore connection failures, because the LOV will DTRT with # an unconnected OSC. osc.prepare(ignore_connect_failure=1) except CommandError: print "Error preparing OSC %s (inactive)\n" % osc.uuid self.mdc_uuid = prepare_mdc(self.db, self.name, self.mds_uuid) self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz, self.stripe_off, self.pattern, self.devlist, self.mds_name) lctl.newdev(attach="lov %s %s" % (self.name, self.uuid), setup ="%s" % (self.mdc_uuid)) def cleanup(self): if is_prepared(self.uuid): Module.cleanup(self) for osc in self.osclist: osc.cleanup() cleanup_mdc(self.db, self.name, self.mds_uuid) def load_module(self): for osc in self.osclist: osc.load_module() break Module.load_module(self) def cleanup_module(self): Module.cleanup_module(self) for osc in self.osclist: osc.cleanup_module() break class LOVConfig(Module): def __init__(self,db): Module.__init__(self, 'LOVConfig', db) self.lov_uuid = self.db.get_first_ref('lov') l = self.db.lookup(self.lov_uuid) self.lov = LOV(l) def prepare(self): lov = self.lov self.info(lov.mds_uuid, lov.stripe_cnt, lov.stripe_sz, lov.stripe_off, lov.pattern, lov.devlist, lov.mds_name) lctl.lov_setconfig(lov.uuid, lov.mds_name, lov.stripe_cnt, lov.stripe_sz, lov.stripe_off, lov.pattern, string.join(lov.devlist)) def cleanup(self): #nothing to do here pass class MDSDEV(Module): def __init__(self,db): Module.__init__(self, 'MDSDEV', db) self.devpath = self.db.get_val('devpath','') self.size = self.db.get_val_int('devsize', 0) self.fstype = self.db.get_val('fstype', '') # 
overwrite the orignal MDSDEV name and uuid with the MDS name and uuid target_uuid = self.db.get_first_ref('target') mds = self.db.lookup(target_uuid) self.name = mds.getName() self.lovconfig_uuids = mds.get_refs('lovconfig') # FIXME: if fstype not set, then determine based on kernel version self.format = self.db.get_val('autoformat', "no") active_uuid = mds.get_active_target() if not active_uuid: panic("No target device found:", target_uuid) if active_uuid == self.uuid: self.active = 1 else: self.active = 0 self.target_dev_uuid = self.uuid self.uuid = target_uuid # modules if self.fstype == 'extN': self.add_lustre_module('extN', 'extN') self.add_lustre_module('mds', 'mds') if self.fstype: self.add_lustre_module('obdclass', 'fsfilt_%s' % (self.fstype)) def load_module(self): if self.active: Module.load_module(self) def prepare(self): if is_prepared(self.uuid): return if not self.active: debug(self.uuid, "not active") return self.info(self.devpath, self.fstype, self.format) run_acceptors() blkdev = block_dev(self.devpath, self.size, self.fstype, self.format) if not is_prepared('MDT_UUID'): lctl.newdev(attach="mdt %s %s" % ('MDT', 'MDT_UUID'), setup ="") lctl.newdev(attach="mds %s %s" % (self.name, self.uuid), setup ="%s %s" %(blkdev, self.fstype)) for uuid in self.lovconfig_uuids: db = self.db.lookup(uuid) lovconfig = LOVConfig(db) lovconfig.prepare() def cleanup(self): if is_prepared('MDT_UUID'): try: lctl.cleanup("MDT", "MDT_UUID") except CommandError, e: print "cleanup failed: ", self.name e.dump() cleanup_error(e.rc) if is_prepared(self.uuid): Module.cleanup(self) clean_loop(self.devpath) class OSD(Module): def __init__(self, db): Module.__init__(self, 'OSD', db) self.osdtype = self.db.get_val('osdtype') self.devpath = self.db.get_val('devpath', '') self.size = self.db.get_val_int('devsize', 0) self.fstype = self.db.get_val('fstype', '') target_uuid = self.db.get_first_ref('target') ost = self.db.lookup(target_uuid) self.name = ost.getName() # FIXME: if fstype 
not set, then determine based on kernel version self.format = self.db.get_val('autoformat', 'yes') if self.fstype == 'extN': self.add_lustre_module('extN', 'extN') active_uuid = ost.get_active_target() if not active_uuid: panic("No target device found:", target_uuid) if active_uuid == self.uuid: self.active = 1 else: self.active = 0 self.target_dev_uuid = self.uuid self.uuid = target_uuid # modules self.add_lustre_module('ost', 'ost') self.add_lustre_module(self.osdtype, self.osdtype) if self.fstype: self.add_lustre_module('obdclass' , 'fsfilt_%s' % (self.fstype)) def load_module(self): if self.active: Module.load_module(self) # need to check /proc/mounts and /etc/mtab before # formatting anything. # FIXME: check if device is already formatted. def prepare(self): if is_prepared(self.uuid): return if not self.active: debug(self.uuid, "not active") return self.info(self.osdtype, self.devpath, self.size, self.fstype, self.format) run_acceptors() if self.osdtype == 'obdecho': blkdev = '' else: blkdev = block_dev(self.devpath, self.size, self.fstype, self.format) lctl.newdev(attach="%s %s %s" % (self.osdtype, self.name, self.uuid), setup ="%s %s" %(blkdev, self.fstype)) if not is_prepared('OSS_UUID'): lctl.newdev(attach="ost %s %s" % ('OSS', 'OSS_UUID'), setup ="") def cleanup(self): if is_prepared('OSS_UUID'): try: lctl.cleanup("OSS", "OSS_UUID") except CommandError, e: print "cleanup failed: ", self.name e.dump() cleanup_error(e.rc) if is_prepared(self.uuid): Module.cleanup(self) if not self.osdtype == 'obdecho': clean_loop(self.devpath) # Generic client module, used by OSC and MDC class Client(Module): def __init__(self, tgtdb, module, owner): self.target_name = tgtdb.getName() self.target_uuid = tgtdb.getUUID() self.db = tgtdb self.tgt_dev_uuid = tgtdb.get_active_target() if not self.tgt_dev_uuid: panic("No target device found for target:", self.target_name) self.kmodule_list = [] self._server = None self._connected = 0 self.module = module self.module_name = 
string.upper(module) self.name = '%s_%s_%s' % (self.module_name, owner, self.target_name) self.uuid = '%05x%05x_%.14s_%05x%05x' % (int(random.random() * 1048576), int(random.random() * 1048576),self.name, int(random.random() * 1048576), int(random.random() * 1048576)) self.uuid = self.uuid[0:36] self.lookup_server(self.tgt_dev_uuid) self.add_lustre_module(module, module) def lookup_server(self, srv_uuid): """ Lookup a server's network information """ self._server_nets = self.db.get_ost_net(srv_uuid) if len(self._server_nets) == 0: panic ("Unable to find a server for:", srv_uuid) def get_servers(self): return self._server_nets def prepare(self, ignore_connect_failure = 0): if is_prepared(self.uuid): return self.info(self.target_uuid) try: srv = local_net(self.get_servers()) if srv: lctl.connect(srv) else: srv, r = find_route(self.get_servers()) if srv: lctl.add_route_host(r[0], srv.uuid, r[1], r[2]) else: panic ("no route to", self.target_uuid) except CommandError: if (ignore_connect_failure == 0): pass if srv: lctl.newdev(attach="%s %s %s" % (self.module, self.name, self.uuid), setup ="%s %s" %(self.target_uuid, srv.uuid)) def cleanup(self): Module.cleanup(self) srv = local_net(self.get_servers()) if srv: try: lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid) except CommandError, e: log(self.module_name, "disconnect failed: ", self.name) e.dump() cleanup_error(e.rc) else: self.info(self.target_uuid) srv, r = find_route(self.get_servers()) if srv: try: lctl.del_route_host(r[0], srv.uuid, r[1], r[2]) except CommandError, e: print "del_route failed: ", self.name e.dump() cleanup_error(e.rc) class MDC(Client): def __init__(self, db, owner): Client.__init__(self, db, 'mdc', owner) class OSC(Client): def __init__(self, db, owner): Client.__init__(self, db, 'osc', owner) class COBD(Module): def __init__(self, db): Module.__init__(self, 'COBD', db) self.real_uuid = self.db.get_first_ref('realobd') self.cache_uuid = self.db.get_first_ref('cacheobd') 
self.add_lustre_module('cobd' , 'cobd') # need to check /proc/mounts and /etc/mtab before # formatting anything. # FIXME: check if device is already formatted. def prepare(self): if is_prepared(self.uuid): return self.info(self.real_uuid, self.cache_uuid) lctl.newdev(attach="cobd %s %s" % (self.name, self.uuid), setup ="%s %s" %(self.real_uuid, self.cache_uuid)) # virtual interface for OSC and LOV class VOSC(Module): def __init__(self,db, owner): Module.__init__(self, 'VOSC', db) if db.get_class() == 'lov': self.osc = LOV(db) else: self.osc = get_osc(db, owner) def get_uuid(self): return self.osc.uuid def prepare(self): self.osc.prepare() def cleanup(self): self.osc.cleanup() def load_module(self): self.osc.load_module() def cleanup_module(self): self.osc.cleanup_module() def need_mdc(self): return self.db.get_class() != 'lov' def get_mdc_uuid(self): if self.db.get_class() == 'lov': return self.osc.mdc_uuid return '' class ECHO_CLIENT(Module): def __init__(self,db): Module.__init__(self, 'ECHO_CLIENT', db) self.add_lustre_module('obdecho', 'obdecho') self.obd_uuid = self.db.get_first_ref('obd') obd = self.db.lookup(self.obd_uuid) self.osc = VOSC(obd, self.name) def prepare(self): if is_prepared(self.uuid): return self.osc.prepare() # XXX This is so cheating. 
-p self.info(self.obd_uuid) lctl.newdev(attach="echo_client %s %s" % (self.name, self.uuid), setup = self.osc.get_uuid()) def cleanup(self): if is_prepared(self.uuid): Module.cleanup(self) self.osc.cleanup() def load_module(self): self.osc.load_module() Module.load_module(self) def cleanup_module(self): Module.cleanup_module(self) self.osc.cleanup_module() class Mountpoint(Module): def __init__(self,db): Module.__init__(self, 'MTPT', db) self.path = self.db.get_val('path') self.mds_uuid = self.db.get_first_ref('mds') self.obd_uuid = self.db.get_first_ref('obd') obd = self.db.lookup(self.obd_uuid) self.vosc = VOSC(obd, self.name) if self.vosc.need_mdc(): self.add_lustre_module('mdc', 'mdc') self.add_lustre_module('llite', 'llite') def prepare(self): self.vosc.prepare() if self.vosc.need_mdc(): mdc_uuid = prepare_mdc(self.db, self.name, self.mds_uuid) else: mdc_uuid = self.vosc.get_mdc_uuid() if not mdc_uuid: panic("Unable to determine MDC UUID. Probably need to cleanup before re-mounting.") self.info(self.path, self.mds_uuid, self.obd_uuid) cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \ (self.vosc.get_uuid(), mdc_uuid, self.path) run("mkdir", self.path) ret, val = run(cmd) if ret: panic("mount failed:", self.path) def cleanup(self): self.info(self.path, self.mds_uuid,self.obd_uuid) if fs_is_mounted(self.path): if config.force(): (rc, out) = run("umount", "-f", self.path) else: (rc, out) = run("umount", self.path) if rc: raise CommandError('umount', out, rc) if fs_is_mounted(self.path): panic("fs is still mounted:", self.path) self.vosc.cleanup() if self.vosc.need_mdc(): cleanup_mdc(self.db, self.name, self.mds_uuid) def load_module(self): self.vosc.load_module() Module.load_module(self) def cleanup_module(self): Module.cleanup_module(self) self.vosc.cleanup_module() # ============================================================ # XML processing and query class LustreDB: def lookup(self, uuid): """ lookup returns a new LustreDB instance""" return 
self._lookup_by_uuid(uuid) def lookup_name(self, name, class_name = ""): """ lookup returns a new LustreDB instance""" return self._lookup_by_name(name, class_name) def lookup_class(self, class_name): """ lookup returns a new LustreDB instance""" return self._lookup_by_class(class_name) def get_val(self, tag, default=None): v = self._get_val(tag) if v: return v if default != None: return default debug("LustreDB", self.getName(), " no value for:", tag) return None def get_class(self): return self._get_class() def get_val_int(self, tag, default=0): str = self._get_val(tag) try: if str: return int(str) return default except ValueError: panic("text value is not integer:", str) def get_first_ref(self, tag): """ Get the first uuidref of the type TAG. Only one is expected. Returns the uuid.""" uuids = self._get_refs(tag) if len(uuids) > 0: return uuids[0] return None def get_refs(self, tag): """ Get all the refs of type TAG. Returns list of uuids. """ uuids = self._get_refs(tag) return uuids def get_all_refs(self): """ Get all the refs. Returns list of uuids. """ uuids = self._get_all_refs() return uuids def get_ost_net(self, osd_uuid): srv_list = [] if not osd_uuid: return srv_list osd = self.lookup(osd_uuid) node_uuid = osd.get_first_ref('node') node = self.lookup(node_uuid) if not node: panic("unable to find node for osd_uuid:", osd_uuid, " node_ref:", node_uuid) for net_uuid in node.get_networks(): db = node.lookup(net_uuid) srv_list.append(Network(db)) return srv_list def nid2server(self, nid, net_type): netlist = self.lookup_class('network') for net_db in netlist: if net_db.get_val('nid') == nid and net_db.get_val('nettype') == net_type: return net_db return None # the tag name is the service type # fixme: this should do some checks to make sure the dom_node is a service # # determine what "level" a particular node is at. # the order of iniitailization is based on level. 
def getServiceLevel(self): type = self.get_class() ret=0; if type in ('network',): ret = 5 elif type in ('routetbl',): ret = 6 elif type in ('ptlrpc',): ret = 7 elif type in ('device', 'ldlm'): ret = 20 elif type in ('osd', 'mdd', 'cobd'): ret = 30 elif type in ('mdsdev','ost'): ret = 40 elif type in ('mdc','osc'): ret = 50 elif type in ('lov',): ret = 60 elif type in ('mountpoint', 'echoclient'): ret = 70 if ret < config.minlevel() or ret > config.maxlevel(): ret = 0 return ret # # return list of services in a profile. list is a list of tuples # [(level, db_object),] def getServices(self): list = [] for ref_class, ref_uuid in self.get_all_refs(): servdb = self.lookup(ref_uuid) if servdb: level = servdb.getServiceLevel() if level > 0: list.append((level, servdb)) else: panic('service not found: ' + ref_uuid) list.sort() return list # Find the target_device for target on a node # node->profiles->device_refs->target def get_target_device(self, target_uuid, node_name): node_db = self.lookup_name(node_name) if not node_db: return None prof_list = node_db.get_refs('profile') for prof_uuid in prof_list: prof_db = node_db.lookup(prof_uuid) ref_list = prof_db.get_all_refs() for ref in ref_list: dev = self.lookup(ref[1]) if dev and dev.get_first_ref('target') == target_uuid: return ref[1] return None def get_active_target(self): target_uuid = self.getUUID() target_name = self.getName() node_name = config.select(target_name) if node_name: tgt_dev_uuid = self.get_target_device(target_uuid, node_name) else: tgt_dev_uuid = self.get_first_ref('active') return tgt_dev_uuid # get all network uuids for this node def get_networks(self): ret = [] prof_list = self.get_refs('profile') for prof_uuid in prof_list: prof_db = self.lookup(prof_uuid) net_list = prof_db.get_refs('network') #debug("get_networks():", prof_uuid, net_list) for net_uuid in net_list: ret.append(net_uuid) return ret class LustreDB_XML(LustreDB): def __init__(self, dom, root_node): # init xmlfile self.dom_node = dom 
self.root_node = root_node def xmltext(self, dom_node, tag): list = dom_node.getElementsByTagName(tag) if len(list) > 0: dom_node = list[0] dom_node.normalize() if dom_node.firstChild: txt = string.strip(dom_node.firstChild.data) if txt: return txt def xmlattr(self, dom_node, attr): return dom_node.getAttribute(attr) def _get_val(self, tag): """a value could be an attribute of the current node or the text value in a child node""" ret = self.xmlattr(self.dom_node, tag) if not ret: ret = self.xmltext(self.dom_node, tag) return ret def _get_class(self): return self.dom_node.nodeName # # [(ref_class, ref_uuid),] def _get_all_refs(self): list = [] for n in self.dom_node.childNodes: if n.nodeType == n.ELEMENT_NODE: ref_uuid = self.xml_get_ref(n) ref_class = n.nodeName list.append((ref_class, ref_uuid)) list.sort() return list def _get_refs(self, tag): """ Get all the refs of type TAG. Returns list of uuids. """ uuids = [] refname = '%s_ref' % tag reflist = self.dom_node.getElementsByTagName(refname) for r in reflist: uuids.append(self.xml_get_ref(r)) return uuids def xmllookup_by_uuid(self, dom_node, uuid): for n in dom_node.childNodes: if n.nodeType == n.ELEMENT_NODE: if self.xml_get_uuid(n) == uuid: return n else: n = self.xmllookup_by_uuid(n, uuid) if n: return n return None def _lookup_by_uuid(self, uuid): dom = self. 
xmllookup_by_uuid(self.root_node, uuid) if dom: return LustreDB_XML(dom, self.root_node) def xmllookup_by_name(self, dom_node, name): for n in dom_node.childNodes: if n.nodeType == n.ELEMENT_NODE: if self.xml_get_name(n) == name: return n else: n = self.xmllookup_by_name(n, name) if n: return n return None def _lookup_by_name(self, name, class_name): dom = self.xmllookup_by_name(self.root_node, name) if dom: return LustreDB_XML(dom, self.root_node) def xmllookup_by_class(self, dom_node, class_name): return dom_node.getElementsByTagName(class_name) def _lookup_by_class(self, class_name): ret = [] domlist = self.xmllookup_by_class(self.root_node, class_name) for node in domlist: ret.append(LustreDB_XML(node, self.root_node)) return ret def xml_get_name(self, n): return n.getAttribute('name') def getName(self): return self.xml_get_name(self.dom_node) def xml_get_ref(self, n): return n.getAttribute('uuidref') def xml_get_uuid(self, dom_node): return dom_node.getAttribute('uuid') def getUUID(self): return self.xml_get_uuid(self.dom_node) def get_routes(self, type, gw): """ Return the routes as a list of tuples of the form: [(type, gw, lo, hi),]""" res = [] tbl = self.dom_node.getElementsByTagName('routetbl') for t in tbl: routes = t.getElementsByTagName('route') for r in routes: net_type = self.xmlattr(r, 'type') if type != net_type: lo = self.xmlattr(r, 'lo') hi = self.xmlattr(r, 'hi') res.append((type, gw, lo, hi)) return res def get_route_tbl(self): ret = [] for r in self.dom_node.getElementsByTagName('route'): net_type = self.xmlattr(r, 'type') gw = self.xmlattr(r, 'gw') lo = self.xmlattr(r, 'lo') hi = self.xmlattr(r, 'hi') ret.append((net_type, gw, lo, hi)) return ret # ================================================================ # LDAP Support class LustreDB_LDAP(LustreDB): def __init__(self, name, attrs, base = "fs=lustre", parent = None, url = "ldap://localhost", user = "cn=Manager, fs=lustre", pw = "secret" ): self._name = name self._attrs = attrs 
self._base = base self._parent = parent self._url = url self._user = user self._pw = pw if parent: self.l = parent.l self._base = parent._base else: self.open() def open(self): import ldap try: self.l = ldap.initialize(self._url) # Set LDAP protocol version used self.l.protocol_version=ldap.VERSION3 # user and pw only needed if modifying db self.l.bind_s("", "", ldap.AUTH_SIMPLE); except ldap.LDAPError, e: panic(e) # FIXME, do something useful here def close(self): self.l.unbind_s() def ldap_search(self, filter): """Return list of uuids matching the filter.""" import ldap dn = self._base ret = [] uuids = [] try: for name, attrs in self.l.search_s(dn, ldap.SCOPE_ONELEVEL, filter, ["uuid"]): for v in attrs['uuid']: uuids.append(v) except ldap.NO_SUCH_OBJECT, e: pass except ldap.LDAPError, e: print e # FIXME: die here? if len(uuids) > 0: for uuid in uuids: ret.append(self._lookup_by_uuid(uuid)) return ret def _lookup_by_name(self, name, class_name): list = self.ldap_search("lustreName=%s" %(name)) if len(list) == 1: return list[0] return [] def _lookup_by_class(self, class_name): return self.ldap_search("objectclass=%s" %(string.upper(class_name))) def _lookup_by_uuid(self, uuid): import ldap dn = "uuid=%s,%s" % (uuid, self._base) ret = None try: for name, attrs in self.l.search_s(dn, ldap.SCOPE_BASE, "objectclass=*"): ret = LustreDB_LDAP(name, attrs, parent = self) except ldap.NO_SUCH_OBJECT, e: debug("NO_SUCH_OBJECT:", uuid) pass # just return empty list except ldap.LDAPError, e: print e # FIXME: die here? 
return ret def _get_val(self, k): ret = None if self._attrs.has_key(k): v = self._attrs[k] if type(v) == types.ListType: ret = str(v[0]) else: ret = str(v) return ret def _get_class(self): return string.lower(self._attrs['objectClass'][0]) # # [(ref_class, ref_uuid),] def _get_all_refs(self): list = [] for k in self._attrs.keys(): if re.search('.*Ref', k): for uuid in self._attrs[k]: list.append((k, uuid)) return list def _get_refs(self, tag): """ Get all the refs of type TAG. Returns list of uuids. """ uuids = [] refname = '%sRef' % tag if self._attrs.has_key(refname): return self._attrs[refname] return [] def getName(self): return self._get_val('lustreName') def getUUID(self): return self._get_val('uuid') def get_route_tbl(self): return [] ############################################################ # MDC UUID hack - # FIXME: clean this mess up! # # OSC is no longer in the xml, so we have to fake it. # this is getting ugly and begging for another refactoring def get_osc(ost_db, owner): osc = OSC(ost_db, owner) return osc def get_mdc(db, owner, mds_uuid): mds_db = db.lookup(mds_uuid); if not mds_db: panic("no mds:", mds_uuid) mdc = MDC(mds_db, owner) return mdc def prepare_mdc(db, owner, mds_uuid): mdc = get_mdc(db, owner, mds_uuid) mdc.prepare() return mdc.uuid def cleanup_mdc(db, owner, mds_uuid): mdc = get_mdc(db, owner, mds_uuid) mdc.cleanup() ############################################################ # routing ("rooting") # routes = [] local_node = [] router_flag = 0 def add_local_interfaces(node_db): global local_node for netuuid in node_db.get_networks(): net = node_db.lookup(netuuid) srv = Network(net) debug("add_local", netuuid) local_node.append((srv.net_type, srv.nid)) if acceptors.has_key(srv.port): panic("duplicate port:", srv.port) if srv.net_type in ('tcp', 'toe'): acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type, srv.send_mem, srv.recv_mem, srv.irq_affinity, srv.nid_exchange) def node_needs_router(): return router_flag def 
init_route_config(lustre): """ Scan the lustre config looking for routers. Build list of routes. """ global routes, router_flag routes = [] list = lustre.lookup_class('node') for node_db in list: if node_db.get_val_int('router', 0): router_flag = 1 #debug("init_route_config: found router", node_db.getName()) for (local_type, local_nid) in local_node: #debug("init_route_config:", local_type, local_nid) gw = None for netuuid in node_db.get_networks(): db = node_db.lookup(netuuid) if local_type == db.get_val('nettype'): gw = db.get_val('nid') break #debug("init_route_config: gw is", gw) if not gw: continue for route in node_db.get_routes(local_type, gw): routes.append(route) debug("init_route_config routes:", routes) def local_net(srv_list): global local_node for iface in local_node: for srv in srv_list: #debug("local_net a:", srv.net_type, "b:", iface[0]) if srv.net_type == iface[0]: return srv return None def local_net_type(net_type): global local_node for iface in local_node: if net_type == iface[0]: return 1 return 0 def find_route(srv_list): global local_node, routes frm_type = local_node[0][0] for srv in srv_list: #debug("find_route: srv:", srv.hostaddr, "type: ", srv.net_type) to_type = srv.net_type to = srv.hostaddr #debug ('looking for route to', to_type, to) for r in routes: #debug("find_route: ", r) if r[2] == to: return srv, r return None,None ############################################################ # lconf level logic # Start a service. 
def newService(db): type = db.get_class() debug('Service:', type, db.getName(), db.getUUID()) n = None if type == 'ldlm': n = LDLM(db) elif type == 'ptlrpc': n = PTLRPC(db) elif type == 'lov': n = LOV(db) elif type == 'network': n = Network(db) elif type == 'routetbl': n = Router(db) elif type == 'osd': n = OSD(db) elif type == 'cobd': n = COBD(db) elif type == 'mdsdev': n = MDSDEV(db) elif type == 'mountpoint': n = Mountpoint(db) elif type == 'echoclient': n = ECHO_CLIENT(db) else: panic ("unknown service type:", type) return n # # Prepare the system to run lustre using a particular profile # in a the configuration. # * load & the modules # * setup networking for the current node # * make sure partitions are in place and prepared # * initialize devices with lctl # Levels is important, and needs to be enforced. def for_each_profile(db, prof_list, operation): for prof_uuid in prof_list: prof_db = db.lookup(prof_uuid) if not prof_db: panic("profile:", profile, "not found.") services = prof_db.getServices() operation(services) def doSetup(services): if config.nosetup(): return for s in services: n = newService(s[1]) n.prepare() def doModules(services): if config.nomod(): return for s in services: n = newService(s[1]) n.load_module() def doCleanup(services): if config.nosetup(): return services.reverse() for s in services: n = newService(s[1]) n.cleanup() def doUnloadModules(services): if config.nomod(): return services.reverse() for s in services: n = newService(s[1]) n.cleanup_module() # # Load profile for def doHost(lustreDB, hosts): global routes global router_flag node_db = None for h in hosts: node_db = lustreDB.lookup_name(h, 'node') if node_db: break if not node_db: print 'No host entry found.' 
return router_flag = node_db.get_val_int('router', 0) recovery_upcall = node_db.get_val('recovery_upcall', '') timeout = node_db.get_val_int('timeout', 0) add_local_interfaces(node_db) if not router_flag: init_route_config(lustreDB) # Two step process: (1) load modules, (2) setup lustre # if not cleaning, load modules first. prof_list = node_db.get_refs('profile') if config.cleanup(): if config.force(): # the command line can override this value timeout = 5 # ugly hack, only need to run lctl commands for --dump if config.lctl_dump(): for_each_profile(node_db, prof_list, doCleanup) return sys_set_timeout(timeout) sys_set_recovery_upcall(recovery_upcall) for_each_profile(node_db, prof_list, doCleanup) for_each_profile(node_db, prof_list, doUnloadModules) else: # ugly hack, only need to run lctl commands for --dump if config.lctl_dump(): for_each_profile(node_db, prof_list, doSetup) return for_each_profile(node_db, prof_list, doModules) sys_set_debug_path() script = config.gdb_script() run(lctl.lctl, ' modules >', script) if config.gdb(): log ("The GDB module script is in", script) # pause, so user has time to break and # load the script time.sleep(5) sys_set_timeout(timeout) sys_set_recovery_upcall(recovery_upcall) for_each_profile(node_db, prof_list, doSetup) ############################################################ # Command line processing # def parse_cmdline(argv): short_opts = "hdnvf" long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb", "portals=", "makeldiff", "cleanup", "noexec", "help", "node=", "nomod", "nosetup", "dump=", "force", "minlevel=", "maxlevel=", "timeout=", "recovery_upcall=", "ldapurl=", "config=", "select=", "lctl_dump="] opts = [] args = [] try: opts, args = getopt.getopt(argv, short_opts, long_opts) except getopt.error: print "invalid opt" usage() for o, a in opts: if o in ("-h", "--help"): usage() if o in ("-d","--cleanup"): config.cleanup(1) if o in ("-v", "--verbose"): config.verbose(1) if o in ("-n", "--noexec"): 
config.noexec(1) if o == "--portals": config.portals_dir(a) if o == "--lustre": config.lustre_dir(a) if o == "--reformat": config.reformat(1) if o == "--node": config.node(a) if o == "--gdb": config.gdb(1) if o == "--nomod": config.nomod(1) if o == "--nosetup": config.nosetup(1) if o == "--dump": config.dump_file(a) if o in ("-f", "--force"): config.force(1) if o == "--minlevel": config.minlevel(a) if o == "--maxlevel": config.maxlevel(a) if o == "--timeout": config.timeout(a) if o == "--recovery_upcall": config.recovery_upcall(a) if o == "--ldapurl": config.ldapurl(a) if o == "--config": config.config_name(a) if o == "--select": config.init_select(a) if o == "--lctl_dump": config.lctl_dump(a) return args def fetch(url): import urllib data = "" try: s = urllib.urlopen(url) data = s.read() except: usage() return data def setupModulePath(cmd, portals_dir = PORTALS_DIR): base = os.path.dirname(cmd) if os.access(base+"/Makefile", os.R_OK): if not config.lustre_dir(): config.lustre_dir(os.path.join(base, "..")) # normalize the portals dir, using command line arg if set if config.portals_dir(): portals_dir = config.portals_dir() dir = os.path.join(config.lustre_dir(), portals_dir) config.portals_dir(dir) elif config.lustre_dir() and config.portals_dir(): # production mode # if --lustre and --portals, normalize portals # can ignore POTRALS_DIR here, since it is probly useless here dir = config.portals_dir() dir = os.path.join(config.lustre_dir(), dir) config.portals_dir(dir) def sysctl(path, val): if config.noexec(): return try: fp = open(os.path.join('/proc/sys', path), 'w') fp.write(str(val)) fp.close() except IOError, e: print e def sys_set_debug_path(): debug("debug path: ", config.debug_path()) sysctl('portals/debug_path', config.debug_path()) def sys_set_recovery_upcall(upcall): # the command overrides the value in the node config if config.recovery_upcall(): upcall = config.recovery_upcall() if upcall: debug("setting recovery_upcall:", upcall) 
sysctl('lustre/recovery_upcall', upcall) def sys_set_timeout(timeout): # the command overrides the value in the node config if config.timeout() > 0: timeout = config.timeout() if timeout > 0: debug("setting timeout:", timeout) sysctl('lustre/timeout', timeout) def sys_set_ptldebug(ptldebug): # the command overrides the value in the node config if config.ptldebug(): ptldebug = config.ptldebug() sysctl('portals/debug', ptldebug) def sys_set_netmem_max(path, max): debug("setting", path, "to at least", max) if config.noexec(): return fp = open(path) str = fp.readline() fp.close cur = int(str) if max > cur: fp = open(path, 'w') fp.write('%d\n' %(max)) fp.close() def sys_make_devices(): if not os.access('/dev/portals', os.R_OK): run('mknod /dev/portals c 10 240') if not os.access('/dev/obd', os.R_OK): run('mknod /dev/obd c 10 241') # Add dir to the global PATH, if not already there. def add_to_path(new_dir): syspath = string.split(os.environ['PATH'], ':') if new_dir in syspath: return os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin') # ensure basic elements are in the system path def sanitise_path(): for dir in DEFAULT_PATH: add_to_path(dir) # Initialize or shutdown lustre according to a configuration file # * prepare the system for lustre # * configure devices with lctl # Shutdown does steps in reverse # def main(): global lctl, MAXTCPBUF host = socket.gethostname() # the PRNG is normally seeded with time(), which is not so good for starting # time-synchronized clusters input = open('/dev/urandom', 'r') if not input: print 'Unable to open /dev/urandom!' sys.exit(1) seed = input.read(32) input.close() random.seed(seed) sanitise_path() args = parse_cmdline(sys.argv[1:]) if len(args) > 0: if not os.access(args[0], os.R_OK): print 'File not found or readable:', args[0] sys.exit(1) try: dom = xml.dom.minidom.parse(args[0]) except Exception: panic("%s does not appear to be a config file." 
% (args[0])) sys.exit(1) # make sure to die here, even in debug mode. db = LustreDB_XML(dom.documentElement, dom.documentElement) elif config.ldapurl(): if not config.config_name(): panic("--ldapurl requires --config name") dn = "config=%s,fs=lustre" % (config.config_name()) db = LustreDB_LDAP('', {}, base=dn, url = config.ldapurl()) else: usage() node_list = [] if config.node(): node_list.append(config.node()) else: if len(host) > 0: node_list.append(host) node_list.append('localhost') debug("configuring for host: ", node_list) if len(host) > 0: config._debug_path = config._debug_path + '-' + host config._gdb_script = config._gdb_script + '-' + host setupModulePath(sys.argv[0]) lctl = LCTLInterface('lctl') if config.lctl_dump(): lctl.use_save_file(config.lctl_dump()) else: sys_make_devices() sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF) sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF) doHost(db, node_list) if __name__ == "__main__": try: main() except LconfError, e: print e except CommandError, e: e.dump() sys.exit(e.rc) if first_cleanup_error: sys.exit(first_cleanup_error)