#!/usr/bin/env python
#
# Copyright (C) 2002-2003 Cluster File Systems, Inc.
# Authors: Robert Read
#          Mike Shaver
# This file is part of Lustre, http://www.lustre.org.
#
# Lustre is free software; you can redistribute it and/or
# modify it under the terms of version 2 of the GNU General Public
# License as published by the Free Software Foundation.
#
# Lustre is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Lustre; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# lconf - lustre configuration tool
#
# lconf is the main driver script for starting and stopping
# lustre filesystem services.
#
# Based in part on the XML obdctl modifications done by Brian Behlendorf

import sys, getopt, types
import string, os, stat, popen2, socket, time, random, fcntl, select
import re, exceptions, signal, traceback
import xml.dom.minidom

if sys.version[0] == '1':
    from FCNTL import F_GETFL, F_SETFL
else:
    from fcntl import F_GETFL, F_SETFL

PYMOD_DIR = "/usr/lib/lustre/python"

def development_mode():
    base = os.path.dirname(sys.argv[0])
    if os.access(base + "/Makefile", os.R_OK):
        return 1
    return 0

if development_mode():
    sys.path.append('../utils')
else:
    sys.path.append(PYMOD_DIR)

import Lustre

# Global parameters
MAXTCPBUF = 16777216
DEFAULT_TCPBUF = 8388608
DEFAULT_PORT = 988
#
# Maximum number of devices to search for.
# (the /dev/loop* nodes need to be created beforehand)
MAX_LOOP_DEVICES = 256
PORTALS_DIR = '../portals'

# Needed to call lconf --record
CONFIG_FILE = ""

# Please keep these in sync with the values in portals/kp30.h
ptldebug_names = {
    "trace"    : (1 << 0),
    "inode"    : (1 << 1),
    "super"    : (1 << 2),
    "ext2"     : (1 << 3),
    "malloc"   : (1 << 4),
    "cache"    : (1 << 5),
    "info"     : (1 << 6),
    "ioctl"    : (1 << 7),
    "blocks"   : (1 << 8),
    "net"      : (1 << 9),
    "warning"  : (1 << 10),
    "buffs"    : (1 << 11),
    "other"    : (1 << 12),
    "dentry"   : (1 << 13),
    "portals"  : (1 << 14),
    "page"     : (1 << 15),
    "dlmtrace" : (1 << 16),
    "error"    : (1 << 17),
    "emerg"    : (1 << 18),
    "ha"       : (1 << 19),
    "rpctrace" : (1 << 20),
    "vfstrace" : (1 << 21),
    "reada"    : (1 << 22),
    "mmap"     : (1 << 23),
    "config"   : (1 << 24),
    "console"  : (1 << 25),
    "quota"    : (1 << 26),
    "sec"      : (1 << 27),
}

subsystem_names = {
    "undefined" : (1 << 0),
    "mdc"       : (1 << 1),
    "mds"       : (1 << 2),
    "osc"       : (1 << 3),
    "ost"       : (1 << 4),
    "class"     : (1 << 5),
    "log"       : (1 << 6),
    "llite"     : (1 << 7),
    "rpc"       : (1 << 8),
    "mgmt"      : (1 << 9),
    "portals"   : (1 << 10),
    "nal"       : (1 << 11),
    "pinger"    : (1 << 12),
    "filter"    : (1 << 13),
    "ptlbd"     : (1 << 14),
    "echo"      : (1 << 15),
    "ldlm"      : (1 << 16),
    "lov"       : (1 << 17),
    "ptlrouter" : (1 << 18),
    "cobd"      : (1 << 19),
    "sm"        : (1 << 20),
    "asobd"     : (1 << 21),
    "confobd"   : (1 << 22),
    "lmv"       : (1 << 23),
    "cmobd"     : (1 << 24),
    "sec"       : (1 << 25),
}

first_cleanup_error = 0
def cleanup_error(rc):
    global first_cleanup_error
    if not first_cleanup_error:
        first_cleanup_error = rc

# ============================================================
# debugging and error funcs
def fixme(msg = "this feature"):
    raise Lustre.LconfError, msg + ' not implemented yet.'

def panic(*args):
    msg = string.join(map(str, args))
    if not config.noexec:
        raise Lustre.LconfError(msg)
    else:
        print "! " + msg
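# Illustrative sketch (hypothetical helper, not called anywhere in this
# script): each name in the ptldebug_names/subsystem_names tables above
# contributes one bit to a mask, so a set of names folds into a single
# integer like this:
def example_debug_mask(names):
    """Fold a list of debug flag names into one bitmask."""
    mask = 0
    for name in names:
        mask = mask | ptldebug_names[name]
    return mask
# e.g. example_debug_mask(["trace", "error"]) == (1 << 0) | (1 << 17)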
" + msg def log(*args): msg = string.join(map(str,args)) print msg def logall(msgs): for s in msgs: print string.strip(s) def debug(*args): if config.verbose: msg = string.join(map(str,args)) print msg # ack, python's builtin int() does not support '0x123' syntax. # eval can do it, although what a hack! def my_int(s): try: if s[0:2] == '0x': return eval(s, {}, {}) else: return int(s) except SyntaxError, e: raise ValueError("not a number") except NameError, e: raise ValueError("not a number") # ============================================================ # locally defined exceptions class CommandError (exceptions.Exception): def __init__(self, cmd_name, cmd_err, rc=None): self.cmd_name = cmd_name self.cmd_err = cmd_err self.rc = rc def dump(self): import types if type(self.cmd_err) == types.StringType: if self.rc: print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err) else: print "! %s: %s" % (self.cmd_name, self.cmd_err) elif type(self.cmd_err) == types.ListType: if self.rc: print "! %s (error %d):" % (self.cmd_name, self.rc) else: print "! %s:" % (self.cmd_name) for s in self.cmd_err: print "> %s" %(string.strip(s)) else: print self.cmd_err # ============================================================ # handle daemons, like the acceptor class DaemonHandler: """ Manage starting and stopping a daemon. Assumes daemon manages it's own pid file. """ def __init__(self, cmd): self.command = cmd self.path ="" def start(self): if self.running(): log(self.command, "already running.") if not self.path: self.path = find_prog(self.command) if not self.path: panic(self.command, "not found.") ret, out = runcmd(self.path +' '+ self.command_line()) if ret: raise CommandError(self.path, out, ret) def stop(self): if self.running(): pid = self.read_pidfile() try: if pid != 1: log ("killing process", pid) os.kill(pid, 15) else: log("was unable to find pid of " + self.command) #time.sleep(1) # let daemon die except OSError, e: log("unable to kill", self.command, e) if self.running(): log("unable to kill", self.command) def running(self): pid = self.read_pidfile() if pid: try: if pid != 1: os.kill(pid, 0) else: log("was unable to find pid of " + self.command) except OSError: self.clean_pidfile() else: return 1 return 0 def read_pidfile(self): try: fp = open(self.pidfile(), 'r') val = fp.read() if val == '': val = '1' pid = int(val) fp.close() return pid except IOError: return 0 def clean_pidfile(self): """ Remove a stale pidfile """ log("removing stale pidfile:", self.pidfile()) try: os.unlink(self.pidfile()) except OSError, e: log(self.pidfile(), e) class AcceptorHandler(DaemonHandler): def __init__(self, port, net_type): DaemonHandler.__init__(self, "acceptor") self.port = port self.flags = '' def pidfile(self): return "/var/run/%s-%d.pid" % (self.command, self.port) def command_line(self): return string.join(map(str,(self.flags, self.port))) acceptors = {} # start the acceptors def run_acceptors(): if config.lctl_dump or config.record: return for port in acceptors.keys(): daemon = acceptors[port] if not daemon.running(): daemon.start() def run_one_acceptor(port): if config.lctl_dump or config.record: return if acceptors.has_key(port): daemon = acceptors[port] if not daemon.running(): daemon.start() else: panic("run_one_acceptor: No acceptor defined for port:", port) def stop_acceptor(port): if acceptors.has_key(port): daemon = acceptors[port] if daemon.running(): daemon.stop() # ============================================================ # handle lctl interface class LCTLInterface: """ Manage 
# ============================================================
# handle lctl interface
class LCTLInterface:
    """ Manage communication with lctl """

    def __init__(self, cmd):
        """ Initialize the interface by finding the lctl binary. """
        self.lctl = find_prog(cmd)
        self.save_file = ''
        self.record_device = ''
        if not self.lctl:
            if config.noexec:
                debug('! lctl not found')
                self.lctl = 'lctl'
            else:
                raise CommandError('lctl', "unable to find lctl binary.")

    def use_save_file(self, file):
        self.save_file = file

    def record(self, dev_name, logname):
        log("Recording log", logname, "on", dev_name)
        self.record_device = dev_name
        self.record_log = logname

    def end_record(self):
        log("End recording log", self.record_log, "on", self.record_device)
        self.record_device = None
        self.record_log = None

    def set_nonblock(self, fd):
        fl = fcntl.fcntl(fd, F_GETFL)
        fcntl.fcntl(fd, F_SETFL, fl | os.O_NDELAY)

    def run(self, cmds):
        """ run lctl
        the cmds are written to stdin of lctl
        lctl doesn't return errors when run in script mode, so
        stderr is checked
        should modify command line to accept multiple commands, or
        create complex command line options
        """
        cmd_line = self.lctl
        if self.save_file:
            cmds = '\n  dump ' + self.save_file + '\n' + cmds
        elif self.record_device:
            cmds = """
    device $%s
    record %s
    %s""" % (self.record_device, self.record_log, cmds)

        debug("+", cmd_line, cmds)
        if config.noexec:
            return (0, [])

        child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
        child.tochild.write(cmds + "\n")
        child.tochild.close()
        # print "LCTL:", cmds

        # From "Python Cookbook" from O'Reilly
        outfile = child.fromchild
        outfd = outfile.fileno()
        self.set_nonblock(outfd)
        errfile = child.childerr
        errfd = errfile.fileno()
        self.set_nonblock(errfd)

        outdata = errdata = ''
        outeof = erreof = 0
        while 1:
            ready = select.select([outfd, errfd], [], []) # Wait for input
            if outfd in ready[0]:
                outchunk = outfile.read()
                if outchunk == '':
                    outeof = 1
                outdata = outdata + outchunk
            if errfd in ready[0]:
                errchunk = errfile.read()
                if errchunk == '':
                    erreof = 1
                errdata = errdata + errchunk
            if outeof and erreof:
                break
        # end of "borrowed" code

        ret = child.wait()
        if os.WIFEXITED(ret):
            rc = os.WEXITSTATUS(ret)
        else:
            rc = 0
        if rc or len(errdata):
            raise CommandError(self.lctl, errdata, rc)
        return rc, outdata

    def runcmd(self, *args):
        """ run lctl using the command line """
        cmd = string.join(map(str, args))
        debug("+", self.lctl, cmd)
        rc, out = run(self.lctl, cmd)
        if rc:
            raise CommandError(self.lctl, out, rc)
        return rc, out

    def clear_log(self, dev, log):
        """ clear an existing log """
        cmds = """
  device $%s
  probe
  clear_log %s
  quit """ % (dev, log)
        self.run(cmds)

    def root_squash(self, name, uuid, nid):
        cmds = """
  device $%s
  root_squash %s %s
  quit""" % (name, uuid, nid)
        self.run(cmds)

    def network(self, net, nid):
        """ set mynid """
        cmds = """
  network %s
  mynid %s
  quit """ % (net, nid)
        self.run(cmds)

    # add an interface
    def add_interface(self, net, ip, netmask = ""):
        """ add an interface """
        cmds = """
  network %s
  add_interface %s %s
  quit """ % (net, ip, netmask)
        self.run(cmds)

    # delete an interface
    def del_interface(self, net, ip):
        """ delete an interface """
        cmds = """
  network %s
  del_interface %s
  quit """ % (net, ip)
        self.run(cmds)

    # create a new connection
    def add_uuid(self, net_type, uuid, nid):
        cmds = "\n  add_uuid %s %s %s" % (uuid, nid, net_type)
        self.run(cmds)
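    # Illustrative note (hypothetical values): every helper in this class
    # ultimately feeds a small script to lctl on stdin; add_uuid, for
    # example, produces a stream like
    #
    #     add_uuid NID_192.168.0.10_UUID 192.168.0.10 tcp
    #
    # and run() raises CommandError whenever lctl writes to stderr, since
    # lctl in script mode does not report errors via its exit status.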
    def add_peer(self, net_type, nid, hostaddr, port):
        if net_type in ('tcp', 'openib', 'ra') and not config.lctl_dump:
            cmds = """
  network %s
  add_peer %s %s %d
  quit""" % (net_type, nid, hostaddr, port)
            self.run(cmds)
        elif net_type in ('iib',) and not config.lctl_dump:
            cmds = """
  network %s
  add_peer %s
  quit""" % (net_type, nid)
            self.run(cmds)
        elif net_type in ('vib',) and not config.lctl_dump:
            cmds = """
  network %s
  add_peer %s %s
  quit""" % (net_type, nid, hostaddr)
            self.run(cmds)

    def connect(self, srv):
        self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
        if srv.net_type in ('tcp', 'openib', 'iib', 'vib', 'ra') and not config.lctl_dump:
            if srv.hostaddr[0]:
                hostaddr = string.split(srv.hostaddr[0], '/')[0]
                self.add_peer(srv.net_type, srv.nid, hostaddr, srv.port)

    # Recover a device
    def recover(self, dev_name, new_conn):
        cmds = """
  device $%s
  recover %s""" % (dev_name, new_conn)
        self.run(cmds)

    # add a route to a range
    def add_route(self, net, gw, lo, hi):
        cmds = """
  network %s
  add_route %s %s %s
  quit """ % (net, gw, lo, hi)
        try:
            self.run(cmds)
        except CommandError, e:
            log("ignore: ")
            e.dump()

    def del_route(self, net, gw, lo, hi):
        cmds = """
  ignore_errors
  network %s
  del_route %s %s %s
  quit """ % (net, gw, lo, hi)
        self.run(cmds)

    # add a route to a host
    def add_route_host(self, net, uuid, gw, tgt):
        self.add_uuid(net, uuid, tgt)
        cmds = """
  network %s
  add_route %s %s
  quit """ % (net, gw, tgt)
        try:
            self.run(cmds)
        except CommandError, e:
            log("ignore: ")
            e.dump()

    # delete a route to a host
    def del_route_host(self, net, uuid, gw, tgt):
        self.del_uuid(uuid)
        cmds = """
  ignore_errors
  network %s
  del_route %s %s
  quit """ % (net, gw, tgt)
        self.run(cmds)

    def del_peer(self, net_type, nid, hostaddr):
        if net_type in ('tcp',) and not config.lctl_dump:
            cmds = """
  ignore_errors
  network %s
  del_peer %s %s single_share
  quit""" % (net_type, nid, hostaddr)
            self.run(cmds)
        elif net_type in ('openib', 'iib', 'vib', 'ra') and not config.lctl_dump:
            cmds = """
  ignore_errors
  network %s
  del_peer %s single_share
  quit""" % (net_type, nid)
            self.run(cmds)

    # disconnect one connection
    def disconnect(self, srv):
        self.del_uuid(srv.nid_uuid)
        if srv.net_type in ('tcp', 'openib', 'iib', 'vib', 'ra') and not config.lctl_dump:
            if srv.hostaddr[0]:
                hostaddr = string.split(srv.hostaddr[0], '/')[0]
                self.del_peer(srv.net_type, srv.nid, hostaddr)

    def del_uuid(self, uuid):
        cmds = """
  ignore_errors
  del_uuid %s
  quit""" % (uuid,)
        self.run(cmds)

    # disconnect all
    def disconnectAll(self, net):
        cmds = """
  ignore_errors
  network %s
  disconnect
  quit""" % (net)
        self.run(cmds)

    def attach(self, type, name, uuid):
        cmds = """
  attach %s %s %s
  quit""" % (type, name, uuid)
        self.run(cmds)

    def detach(self, name):
        cmds = """
  cfg_device %s
  detach
  quit""" % (name)
        self.run(cmds)

    def set_security(self, name, key, value):
        cmds = """
  cfg_device %s
  set_security %s %s
  quit""" % (name, key, value)
        self.run(cmds)

    def setup(self, name, setup = ""):
        cmds = """
  cfg_device %s
  setup %s
  quit""" % (name, setup)
        self.run(cmds)

    def add_conn(self, name, conn_uuid):
        cmds = """
  cfg_device %s
  add_conn %s
  quit""" % (name, conn_uuid)
        self.run(cmds)

    def start(self, name, conf_name):
        cmds = """
  device $%s
  start %s
  quit""" % (name, conf_name)
        self.run(cmds)

    # create a new device with lctl
    def newdev(self, type, name, uuid, setup = ""):
        self.attach(type, name, uuid)
        try:
            self.setup(name, setup)
        except CommandError, e:
            self.cleanup(name, uuid, 0)
            raise e

    # cleanup a device
    def cleanup(self, name, uuid, force, failover = 0):
        if failover:
            force = 1
        cmds = """
  ignore_errors
  cfg_device $%s
  cleanup %s %s
  detach
  quit""" % (name, ('', 'force')[force],
             ('', 'failover')[failover])
        self.run(cmds)

    # create an lov
    def lov_setup(self, name, uuid, desc_uuid, stripe_cnt,
                  stripe_sz, stripe_off, pattern, devlist = None):
        cmds = """
  attach lov %s %s
  lov_setup %s %d %d %d %s %s
  quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off,
             pattern, devlist)
        self.run(cmds)
    # add an OBD to a LOV
    def lov_add_obd(self, name, uuid, obd_uuid, index, gen):
        cmds = """
  lov_modify_tgts add %s %s %s %s
  quit""" % (name, obd_uuid, index, gen)
        self.run(cmds)

    # create an lmv
    def lmv_setup(self, name, uuid, desc_uuid, devlist):
        cmds = """
  attach lmv %s %s
  lmv_setup %s %s
  quit""" % (name, uuid, desc_uuid, devlist)
        self.run(cmds)

    # delete an OBD from a LOV
    def lov_del_obd(self, name, uuid, obd_uuid, index, gen):
        cmds = """
  lov_modify_tgts del %s %s %s %s
  quit""" % (name, obd_uuid, index, gen)
        self.run(cmds)

    # deactivate an OBD
    def deactivate(self, name):
        cmds = """
  device $%s
  deactivate
  quit""" % (name)
        self.run(cmds)

    # dump the log file
    def dump(self, dump_file):
        cmds = """
  debug_kernel %s 1
  quit""" % (dump_file)
        self.run(cmds)

    # get list of devices
    def device_list(self):
        devices = '/proc/fs/lustre/devices'
        ret = []
        if os.access(devices, os.R_OK):
            try:
                fp = open(devices, 'r')
                ret = fp.readlines()
                fp.close()
            except IOError, e:
                log(e)
        return ret

    # get lustre version
    def lustre_version(self):
        rc, out = self.runcmd('version')
        return out

    # dump mount options
    def mount_option(self, profile, osc, mdc):
        cmds = """
  mount_option %s %s %s
  quit""" % (profile, osc, mdc)
        self.run(cmds)

    # delete mount options
    def del_mount_option(self, profile):
        cmds = """
  del_mount_option %s
  quit""" % (profile,)
        self.run(cmds)

    def set_timeout(self, timeout):
        cmds = """
  set_timeout %s
  quit""" % (timeout,)
        self.run(cmds)

    def set_lustre_upcall(self, upcall):
        cmds = """
  set_lustre_upcall %s
  quit""" % (upcall,)
        self.run(cmds)

# ============================================================
# Various system-level functions
# (ideally moved to their own module)

# Run a command and return the output and status.
# stderr is redirected into stdout via 2>&1, so both are captured;
# popen3 could be used instead to keep them separate.
def runcmd(cmd):
    debug("+", cmd)
    if config.noexec:
        return (0, [])
    f = os.popen(cmd + ' 2>&1')
    out = f.readlines()
    ret = f.close()
    if ret:
        ret = ret >> 8
    else:
        ret = 0
    return (ret, out)

def run(*args):
    cmd = string.join(map(str, args))
    return runcmd(cmd)

# Run a command in the background.
def run_daemon(*args):
    cmd = string.join(map(str, args))
    debug("+", cmd)
    if config.noexec:
        return 0
    f = os.popen(cmd + ' 2>&1')
    ret = f.close()
    if ret:
        ret = ret >> 8
    else:
        ret = 0
    return ret

# Determine full path to use for an external command
# searches dirname(argv[0]) first, then PATH
def find_prog(cmd):
    syspath = string.split(os.environ['PATH'], ':')
    cmdpath = os.path.dirname(sys.argv[0])
    syspath.insert(0, cmdpath)
    if config.portals:
        syspath.insert(0, os.path.join(config.portals, 'utils/'))
    for d in syspath:
        prog = os.path.join(d, cmd)
        if os.access(prog, os.X_OK):
            return prog
    return ''

# Recursively look for file starting at base dir
def do_find_file(base, mod):
    fullname = os.path.join(base, mod)
    if os.access(fullname, os.R_OK):
        return fullname
    for d in os.listdir(base):
        dir = os.path.join(base, d)
        if os.path.isdir(dir):
            module = do_find_file(dir, mod)
            if module:
                return module
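# Illustrative sketch (hypothetical values, not executed): find_prog() and
# runcmd() combine into the pattern used throughout this script --
# resolve a binary, run it, and get (exit-status, output-lines) back:
#
#     path = find_prog('lctl')       # '' if not near argv[0] or on PATH
#     if path:
#         rc, out = runcmd(path + ' version')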
# is the path a block device?
def is_block(path):
    s = ()
    try:
        s = os.stat(path)
    except OSError:
        return 0
    return stat.S_ISBLK(s[stat.ST_MODE])

# find the journal device from mkfs options
def jdev(opts):
    if opts == None:
        return ''
    x = string.split(opts)
    i = 0
    while i < len(x) - 1:
        if x[i] == '-J' and x[i+1].startswith('device='):
            str = x[i+1]
            return str[7:]
        i = i + 1
    return ''

# build fs according to type
# fixme: dangerous
def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
    block_cnt = ''
    jopt = ''
    iopt = ''
    if devsize:
        if devsize < 8000:
            panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s" %
                  (dev, devsize))
        # devsize is in 1k, and fs block count is in 4k
        block_cnt = devsize / 4

    if fstype in ('ext3', 'extN', 'ldiskfs'):
        # ext3 journal size is in megabytes
        # but don't set jsize if mkfsoptions indicates a separate journal device
        if jsize == 0 and jdev(mkfsoptions) == '':
            if devsize == 0:
                if not is_block(dev):
                    ret, out = runcmd("ls -l %s" % dev)
                    devsize = int(string.split(out[0])[4]) / 1024
                else:
                    # sfdisk works for symlink, hardlink, and realdev
                    ret, out = runcmd("sfdisk -s %s" % dev)
                    if not ret:
                        devsize = int(out[0])
                    else:
                        # sfdisk -s will fail for too large block devices;
                        # then read the size of the partition from /proc/partitions

                        # get the realpath of the device
                        # it may be the real device, such as /dev/hda7
                        # or the hardlink created via mknod for a device
                        if 'realpath' in dir(os.path):
                            real_dev = os.path.realpath(dev)
                        else:
                            real_dev = dev
                            link_count = 0
                            while os.path.islink(real_dev) and (link_count < 20):
                                link_count = link_count + 1
                                dev_link = os.readlink(real_dev)
                                if os.path.isabs(dev_link):
                                    real_dev = dev_link
                                else:
                                    real_dev = os.path.join(os.path.dirname(real_dev),
                                                            dev_link)
                            if link_count > 19:
                                panic("Encountered too many symbolic links resolving block device:", dev)

                        # get the major and minor number of the realpath via ls
                        # it seems python(os.stat) does not return
                        # the st_rdev member of the stat structure
                        ret, out = runcmd("ls -l %s" % real_dev)
                        major = string.split(string.split(out[0])[4], ",")[0]
                        minor = string.split(out[0])[5]

                        # get the devsize from /proc/partitions with the major and minor number
                        ret, out = runcmd("cat /proc/partitions")
                        for line in out:
                            if len(line) > 1:
                                if string.split(line)[0] == major and \
                                   string.split(line)[1] == minor:
                                    devsize = int(string.split(line)[2])
                                    break

            if devsize > 1024 * 1024:
                jsize = ((devsize / 102400) * 4)
            if jsize > 400:
                jsize = 400
        if jsize:
            jopt = "-J size=%d" % (jsize,)
        if isize:
            iopt = "-I %d" % (isize,)
        mkfs = 'mkfs.ext2 -j -b 4096 '
        if not isblock or config.force:
            mkfs = mkfs + ' -F '
        if jdev(mkfsoptions) != '':
            jmkfs = 'mkfs.ext2 -b 4096 -O journal_dev '
            if config.force:
                jmkfs = jmkfs + '-F '
            jmkfs = jmkfs + jdev(mkfsoptions)
            (ret, out) = run(jmkfs)
            if ret:
                panic("Unable to format journal device:", jdev(mkfsoptions),
                      string.join(out))
    elif fstype == 'reiserfs':
        # reiserfs journal size is in blocks
        if jsize:
            jopt = "--journal_size %d" % (jsize,)
        mkfs = 'mkreiserfs -ff'
    else:
        panic('unsupported fs type: ', fstype)

    if config.mkfsoptions != None:
        mkfs = mkfs + ' ' + config.mkfsoptions
    if mkfsoptions != None:
        mkfs = mkfs + ' ' + mkfsoptions
    (ret, out) = run(mkfs, jopt, iopt, dev, block_cnt)
    if ret:
        panic("Unable to build fs:", dev, string.join(out))
    # enable hash tree indexing on the fs
    if fstype in ('ext3', 'extN', 'ldiskfs'):
        htree = 'echo "feature FEATURE_C5" | debugfs -w'
        (ret, out) = run(htree, dev)
        if ret:
            panic("Unable to enable htree:", dev)
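# Illustrative sketch (hypothetical inputs, not called anywhere): jdev()
# pulls the journal device out of an mkfs option string, e.g.
#
#     jdev('-J device=/dev/sdb1')   # returns '/dev/sdb1'
#     jdev('-i 4096')               # returns ''
#
# which is why mkfs() above skips computing a journal size whenever a
# separate journal device is configured.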
# some systems use /dev/loopN, some /dev/loop/N
def loop_base():
    import re
    loop = '/dev/loop'
    if not os.access(loop + str(0), os.R_OK):
        loop = loop + '/'
        if not os.access(loop + str(0), os.R_OK):
            panic("can't access loop devices")
    return loop

# find loop device assigned to the file
def find_assigned_loop(file):
    loop = loop_base()
    for n in xrange(0, MAX_LOOP_DEVICES):
        dev = loop + str(n)
        if os.access(dev, os.R_OK):
            (stat, out) = run('losetup', dev)
            if out and stat == 0:
                m = re.search(r'\((.*)\)', out[0])
                if m and file == m.group(1):
                    return dev
    return ''

# find free loop device
def find_free_loop(file):
    loop = loop_base()
    # find next free loop
    for n in xrange(0, MAX_LOOP_DEVICES):
        dev = loop + str(n)
        if os.access(dev, os.R_OK):
            (stat, out) = run('losetup', dev)
            if stat:
                return dev
    return ''

# create file if necessary and assign the first free loop device
def init_loop(file, size, fstype, journal_size, inode_size,
              mkfsoptions, reformat, autoformat, backfstype, backfile):
    if fstype == 'smfs':
        realfile = backfile
        realfstype = backfstype
        if is_block(backfile):
            if reformat or (need_format(realfstype, backfile) and
                            autoformat == 'yes'):
                mkfs(realfile, size, realfstype, journal_size, inode_size,
                     mkfsoptions, isblock=0)
            return realfile
    else:
        realfile = file
        realfstype = fstype

    dev = find_assigned_loop(realfile)
    if dev:
        print 'WARNING: file', realfile, 'already mapped to', dev
        return dev

    if reformat or not os.access(realfile, os.R_OK | os.W_OK):
        (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %
                         (size, realfile))
        if ret:
            panic("Unable to create backing store:", realfile)
        mkfs(realfile, size, realfstype, journal_size, inode_size,
             mkfsoptions, isblock=0)

    dev = find_free_loop(realfile)
    if dev:
        print "attach " + realfile + " <-> " + dev
        run('losetup', dev, realfile)
        return dev
    print "out of loop devices"
    return ''

# undo loop assignment
def clean_loop(dev, fstype, backfstype, backdev):
    if fstype == 'smfs':
        realfile = backdev
    else:
        realfile = dev
    if not is_block(realfile):
        dev = find_assigned_loop(realfile)
        if dev:
            print "detach " + dev + " <-> " + realfile
            ret, out = run('losetup -d', dev)
            if ret:
                log('unable to clean loop device', dev, 'for file', realfile)
                logall(out)

# finalizes passed device
def clean_dev(dev, fstype, backfstype, backdev):
    if fstype == 'smfs' or not is_block(dev):
        clean_loop(dev, fstype, backfstype, backdev)

# determine if dev is formatted as a filesystem
def need_format(fstype, dev):
    # FIXME don't know how to implement this
    return 0

# initialize a block device if needed
def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
              inode_size, mkfsoptions, backfstype, backdev):
    if config.noexec:
        return dev
    if fstype == 'smfs' or not is_block(dev):
        dev = init_loop(dev, size, fstype, journal_size, inode_size,
                        mkfsoptions, reformat, autoformat, backfstype, backdev)
    elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
        mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
             isblock=0)
#    else:
#        panic("device:", dev,
#              "not prepared, and autoformat is not set.\n",
#              "Rerun with --reformat option to format ALL filesystems")
    return dev

def if2addr(iface):
    """lookup IP address for an interface"""
    rc, out = run("/sbin/ifconfig", iface)
    if rc or not out:
        return None
    addr = string.split(out[1])[1]
    ip = string.split(addr, ':')[1]
    return ip

def def_mount_options(fstype, target):
    """returns default mount options for passed fstype and target (mds, ost)"""
    if fstype == 'ext3' or fstype == 'ldiskfs':
        mountfsoptions = "errors=remount-ro"
        if target == 'ost' and sys_get_branch() == '2.4':
            mountfsoptions = "%s,asyncdel" % (mountfsoptions)
        return mountfsoptions
    return ""
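# Illustrative sketch (hypothetical values, not executed): the loop-device
# helpers above pair up as create/destroy --
#
#     dev = init_loop('/tmp/ost1', 400000, 'ext3', 0, 0, '', 1, 'yes',
#                     '', '')                 # e.g. returns '/dev/loop0'
#     ...
#     clean_loop('/tmp/ost1', 'ext3', '', '') # finds and detaches the loop dev
#
# sizes are in 1k blocks, matching the dd and mkfs calls in init_loop().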
return "" def sys_get_elan_position_file(): procfiles = ["/proc/elan/device0/position", "/proc/qsnet/elan4/device0/position", "/proc/qsnet/elan3/device0/position"] for p in procfiles: if os.access(p, os.R_OK): return p return "" def sys_get_local_nid(net_type, wildcard, cluster_id): """Return the local nid.""" local = "" if sys_get_elan_position_file(): local = sys_get_local_address('elan', '*', cluster_id) else: local = sys_get_local_address(net_type, wildcard, cluster_id) return local def sys_get_local_address(net_type, wildcard, cluster_id): """Return the local address for the network type.""" local = "" if net_type in ('tcp','openib','iib','vib','ra'): if ':' in wildcard: iface, star = string.split(wildcard, ':') local = if2addr(iface) if not local: panic ("unable to determine ip for:", wildcard) else: host = socket.gethostname() local = socket.gethostbyname(host) elif net_type == 'elan': # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()' f = sys_get_elan_position_file() if not f: panic ("unable to determine local Elan ID") try: fp = open(f, 'r') lines = fp.readlines() fp.close() for l in lines: a = string.split(l) if a[0] == 'NodeId': elan_id = a[1] break try: nid = my_int(cluster_id) + my_int(elan_id) local = "%d" % (nid) except ValueError, e: local = elan_id except IOError, e: log(e) elif net_type == 'lo': fixme("automatic local address for loopback") elif net_type == 'gm': fixme("automatic local address for GM") return local def sys_get_branch(): """Returns kernel release""" try: fp = open('/proc/sys/kernel/osrelease') lines = fp.readlines() fp.close() for l in lines: version = string.split(l) a = string.split(version[0], '.') return a[0] + '.' + a[1] except IOError, e: log(e) return "" # XXX: instead of device_list, ask for $name and see what we get def is_prepared(name): """Return true if a device exists for the name""" if config.lctl_dump: return 0 if (config.noexec or config.record) and config.cleanup: return 1 try: # expect this format: # 1 UP ldlm ldlm ldlm_UUID 2 out = lctl.device_list() for s in out: if name == string.split(s)[3]: return 1 except CommandError, e: e.dump() return 0 def net_is_prepared(): """If the any device exists, then assume that all networking has been configured""" out = lctl.device_list() return len(out) > 0 def fs_is_mounted(path): """Return true if path is a mounted lustre filesystem""" try: fp = open('/proc/mounts') lines = fp.readlines() fp.close() for l in lines: a = string.split(l) if a[1] == path and a[2] == 'lustre_lite': return 1 except IOError, e: log(e) return 0 def kmod_find(src_dir, dev_dir, modname): modbase = src_dir +'/'+ dev_dir +'/'+ modname for modext in '.ko', '.o': module = modbase + modext try: if os.access(module, os.R_OK): return module except OSError: pass return None def kmod_info(modname): """Returns reference count for passed module name.""" try: fp = open('/proc/modules') lines = fp.readlines() fp.close() # please forgive my tired fingers for this one ret = filter(lambda word, mod = modname: word[0] == mod, map(lambda line: string.split(line), lines)) if not ret: return '' return ret[0] except Exception, e: return 0 class kmod: """Presents kernel module""" def __init__(self, src_dir, dev_dir, name): self.src_dir = src_dir self.dev_dir = dev_dir self.name = name # FIXME we ignore the failure of loading gss module, because we might # don't need it at all. 
class kmod:
    """Presents kernel module"""
    def __init__(self, src_dir, dev_dir, name):
        self.src_dir = src_dir
        self.dev_dir = dev_dir
        self.name = name

    # FIXME we ignore the failure of loading the gss module, because we
    # might not need it at all.
    def load(self):
        """Load module"""
        log('loading module:', self.name, 'srcdir',
            self.src_dir, 'devdir', self.dev_dir)
        if self.src_dir:
            module = kmod_find(self.src_dir, self.dev_dir, self.name)
            if not module and self.name != 'ptlrpcs_gss':
                panic('module not found:', self.name)
            (rc, out) = run('/sbin/insmod', module)
            if rc:
                if self.name == 'ptlrpcs_gss':
                    print "Warning: gss security is not supported!"
                else:
                    raise CommandError('insmod', out, rc)
        else:
            (rc, out) = run('/sbin/modprobe', self.name)
            if rc:
                if self.name == 'ptlrpcs_gss':
                    print "Warning: gss security is not supported!"
                else:
                    raise CommandError('modprobe', out, rc)

    def cleanup(self):
        """Unload module"""
        log('unloading module:', self.name)
        (rc, out) = run('/sbin/rmmod', self.name)
        if rc:
            log('unable to unload module:', self.name +
                "(" + self.refcount() + ")")
            logall(out)

    def info(self):
        """Returns module info if any."""
        return kmod_info(self.name)

    def loaded(self):
        """Returns 1 if module is loaded. Otherwise 0 is returned."""
        if self.info():
            return 1
        else:
            return 0

    def refcount(self):
        """Returns module refcount."""
        info = self.info()
        if not info:
            return ''
        return info[2]

    def used(self):
        """Returns 1 if module is used, otherwise 0 is returned."""
        info = self.info()
        if not info:
            return 0
        if len(info) > 3:
            users = info[3]
            if users and users != '(unused)' and users != '-':
                return 1
            else:
                return 0
        else:
            return 0

    def busy(self):
        """Returns 1 if module is busy, otherwise 0 is returned."""
        if self.loaded() and (self.used() or self.refcount() != '0'):
            return 1
        else:
            return 0

class kmod_manager:
    """Manage kernel modules"""
    def __init__(self, lustre_dir, portals_dir):
        self.lustre_dir = lustre_dir
        self.portals_dir = portals_dir
        self.kmodule_list = []

    def find_module(self, modname):
        """Find module by module name"""
        for mod in self.kmodule_list:
            if mod.name == modname:
                return mod
        return ''

    def add_portals_module(self, dev_dir, modname):
        """Append a module to list of modules to load."""
        mod = self.find_module(modname)
        if not mod:
            mod = kmod(self.portals_dir, dev_dir, modname)
            self.kmodule_list.append(mod)

    def add_lustre_module(self, dev_dir, modname):
        """Append a module to list of modules to load."""
        mod = self.find_module(modname)
        if not mod:
            mod = kmod(self.lustre_dir, dev_dir, modname)
            self.kmodule_list.append(mod)

    def load_modules(self):
        """Load all the modules in the list in the order they appear."""
        for mod in self.kmodule_list:
            if mod.loaded() and not config.noexec:
                continue
            mod.load()

    def cleanup_modules(self):
        """Unload the modules in the list in reverse order."""
        rev = self.kmodule_list
        rev.reverse()
        for mod in rev:
            if (not mod.loaded() or mod.busy()) and not config.noexec:
                continue
            # debug hack
            if mod.name == 'portals' and config.dump:
                lctl.dump(config.dump)
            mod.cleanup()

# ============================================================
# Classes to prepare and cleanup the various objects
#
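# Illustrative sketch (hypothetical paths, not executed): modules are
# registered once with kmod_manager and loaded in order, e.g.
#
#     mgr = kmod_manager('/usr/src/lustre', '/usr/src/portals')
#     mgr.add_portals_module("libcfs", 'libcfs')
#     mgr.add_lustre_module('obdclass', 'obdclass')
#     mgr.load_modules()      # insmod/modprobe each, skipping loaded ones
#     ...
#     mgr.cleanup_modules()   # rmmod in reverse order, skipping busy ones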
""" def __init__(self, module_name, db): self.db = db self.module_name = module_name self.name = self.db.getName() self.uuid = self.db.getUUID() self._server = None self._connected = 0 def info(self, *args): msg = string.join(map(str,args)) print self.module_name + ":", self.name, self.uuid, msg def cleanup(self): """ default cleanup, used for most modules """ self.info() try: lctl.cleanup(self.name, self.uuid, config.force) except CommandError, e: log(self.module_name, "cleanup failed: ", self.name) e.dump() cleanup_error(e.rc) def add_module(self, manager): """Adds all needed modules in the order they appear.""" return def safe_to_clean(self): return 1 def safe_to_clean_modules(self): return self.safe_to_clean() class Network(Module): def __init__(self,db): Module.__init__(self, 'NETWORK', db) self.net_type = self.db.get_val('nettype') self.nid = self.db.get_val('nid', '*') self.cluster_id = self.db.get_val('clusterid', "0") self.port = self.db.get_val_int('port', 0) if '*' in self.nid: self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id) if not self.nid: panic("unable to set nid for", self.net_type, self.nid, cluster_id) self.generic_nid = 1 debug("nid:", self.nid) else: self.generic_nid = 0 self.nid_uuid = self.nid_to_uuid(self.nid) self.hostaddr = self.db.get_hostaddr() if len(self.hostaddr) == 0: self.hostaddr.append(self.nid) if '*' in self.hostaddr[0]: self.hostaddr[0] = sys_get_local_address(self.net_type, self.hostaddr[0], self.cluster_id) if not self.hostaddr[0]: panic("unable to set hostaddr for", self.net_type, self.hostaddr[0], self.cluster_id) debug("hostaddr:", self.hostaddr[0]) def add_module(self, manager): manager.add_portals_module("libcfs", 'libcfs') manager.add_portals_module("portals", 'portals') if node_needs_router(): manager.add_portals_module("router", 'kptlrouter') if self.net_type == 'tcp': manager.add_portals_module("knals/socknal", 'ksocknal') if self.net_type == 'elan': manager.add_portals_module("knals/qswnal", 'kqswnal') if self.net_type == 'gm': manager.add_portals_module("knals/gmnal", 'kgmnal') if self.net_type == 'openib': manager.add_portals_module("knals/openibnal", 'kopenibnal') if self.net_type == 'iib': manager.add_portals_module("knals/iibnal", 'kiibnal') if self.net_type == 'vib': self.add_portals_module("knals/vibnal", 'kvibnal') if self.net_type == 'lo': manager.add_portals_module("knals/lonal", 'klonal') if self.net_type == 'ra': manager.add_portals_module("knals/ranal", 'kranal') def nid_to_uuid(self, nid): return "NID_%s_UUID" %(nid,) def prepare(self): if not config.record and net_is_prepared(): return self.info(self.net_type, self.nid, self.port) if not (config.record and self.generic_nid): lctl.network(self.net_type, self.nid) if self.net_type == 'tcp': sys_tweak_socknal() for hostaddr in self.db.get_hostaddr(): ip = string.split(hostaddr, '/')[0] if len(string.split(hostaddr, '/')) == 2: netmask = string.split(hostaddr, '/')[1] else: netmask = "" lctl.add_interface(self.net_type, ip, netmask) if self.net_type == 'elan': sys_optimize_elan() if self.port and node_is_router(): run_one_acceptor(self.port) self.connect_peer_gateways() def connect_peer_gateways(self): for router in self.db.lookup_class('node'): if router.get_val_int('router', 0): for netuuid in router.get_networks(): net = self.db.lookup(netuuid) gw = Network(net) if (gw.cluster_id == self.cluster_id and gw.net_type == self.net_type): if gw.nid != self.nid: lctl.connect(gw) def disconnect_peer_gateways(self): for router in self.db.lookup_class('node'): if 
    def disconnect_peer_gateways(self):
        for router in self.db.lookup_class('node'):
            if router.get_val_int('router', 0):
                for netuuid in router.get_networks():
                    net = self.db.lookup(netuuid)
                    gw = Network(net)
                    if (gw.cluster_id == self.cluster_id and
                        gw.net_type == self.net_type):
                        if gw.nid != self.nid:
                            try:
                                lctl.disconnect(gw)
                            except CommandError, e:
                                print "disconnect failed: ", self.name
                                e.dump()
                                cleanup_error(e.rc)

    def safe_to_clean(self):
        return not net_is_prepared()

    def cleanup(self):
        self.info(self.net_type, self.nid, self.port)
        if self.port:
            stop_acceptor(self.port)
        if node_is_router():
            self.disconnect_peer_gateways()
        if self.net_type == 'tcp':
            for hostaddr in self.db.get_hostaddr():
                ip = string.split(hostaddr, '/')[0]
                lctl.del_interface(self.net_type, ip)

    def correct_level(self, level, op=None):
        return level

class RouteTable(Module):
    def __init__(self, db):
        Module.__init__(self, 'ROUTES', db)

    def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
                         lo, hi):
        # only setup connections for tcp, openib, and iib NALs
        srvdb = None
        if not net_type in ('tcp', 'openib', 'iib', 'vib', 'ra'):
            return None

        # connect to target if route is to single node and this node is the gw
        if lo == hi and local_interface(net_type, gw_cluster_id, gw):
            if not local_cluster(net_type, tgt_cluster_id):
                panic("target", lo, " not on the local cluster")
            srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
        # connect to gateway if this node is not the gw
        elif (local_cluster(net_type, gw_cluster_id)
              and not local_interface(net_type, gw_cluster_id, gw)):
            srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
        else:
            return None

        if not srvdb:
            panic("no server for nid", lo)
            return None

        return Network(srvdb)

    def prepare(self):
        if not config.record and net_is_prepared():
            return
        self.info()
        for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in \
                self.db.get_route_tbl():
            lctl.add_route(net_type, gw, lo, hi)
            srv = self.server_for_route(net_type, gw, gw_cluster_id,
                                        tgt_cluster_id, lo, hi)
            if srv:
                lctl.connect(srv)

    def safe_to_clean(self):
        return not net_is_prepared()

    def cleanup(self):
        if net_is_prepared():
            # the network is still being used, don't clean it up
            return
        for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in \
                self.db.get_route_tbl():
            srv = self.server_for_route(net_type, gw, gw_cluster_id,
                                        tgt_cluster_id, lo, hi)
            if srv:
                try:
                    lctl.disconnect(srv)
                except CommandError, e:
                    print "disconnect failed: ", self.name
                    e.dump()
                    cleanup_error(e.rc)
            try:
                lctl.del_route(net_type, gw, lo, hi)
            except CommandError, e:
                print "del_route failed: ", self.name
                e.dump()
                cleanup_error(e.rc)
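# Illustrative note (hypothetical values): get_route_tbl() yields 6-tuples
# that RouteTable.prepare() and cleanup() unpack as
#
#     (net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
#     e.g. ('tcp', '192.168.0.1', '0', '1', '10.0.0.1', '10.0.0.8')
#
# where lo == hi means a route to a single nid rather than a range.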
class Management(Module):
    def __init__(self, db):
        Module.__init__(self, 'MGMT', db)

    def add_module(self, manager):
        manager.add_lustre_module('lvfs', 'lvfs')
        manager.add_lustre_module('obdclass', 'obdclass')
        manager.add_lustre_module('ptlrpc', 'ptlrpc')
        manager.add_lustre_module('mgmt', 'mgmt_svc')

    def prepare(self):
        if not config.record and is_prepared(self.name):
            return
        self.info()
        lctl.newdev("mgmt", self.name, self.uuid)

    def safe_to_clean(self):
        return 1

    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)

    def correct_level(self, level, op=None):
        return level

# This is only needed to load the modules; the LDLM device
# is now created automatically.
class LDLM(Module):
    def __init__(self, db):
        Module.__init__(self, 'LDLM', db)

    def add_module(self, manager):
        manager.add_lustre_module('lvfs', 'lvfs')
        manager.add_lustre_module('obdclass', 'obdclass')
        manager.add_lustre_module('sec', 'ptlrpcs')
        manager.add_lustre_module('ptlrpc', 'ptlrpc')
        manager.add_lustre_module('sec/gss', 'ptlrpcs_gss')

    def prepare(self):
        return

    def cleanup(self):
        return

    def correct_level(self, level, op=None):
        return level

class LOV(Module):
    def __init__(self, db, uuid, fs_name, name_override = None,
                 config_only = None):
        Module.__init__(self, 'LOV', db)
        if name_override != None:
            self.name = "lov_%s" % name_override
        self.mds_uuid = self.db.get_first_ref('mds')
        self.stripe_sz = self.db.get_val_int('stripesize', 1048576)
        self.stripe_off = self.db.get_val_int('stripeoffset', 0)
        self.pattern = self.db.get_val_int('stripepattern', 0)
        self.devlist = self.db.get_lov_tgts('lov_tgt')
        self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
        self.osclist = []
        self.obdlist = []
        self.desc_uuid = self.uuid
        self.uuid = generate_client_uuid(self.name)
        self.fs_name = fs_name
        if config_only:
            self.config_only = 1
            return
        self.config_only = None

        mds = self.db.lookup(self.mds_uuid)
        self.mds_name = mds.getName()
        for (obd_uuid, index, gen, active) in self.devlist:
            if obd_uuid == '':
                continue
            self.obdlist.append(obd_uuid)
            obd = self.db.lookup(obd_uuid)
            osc = get_osc(obd, self.uuid, fs_name)
            if osc:
                self.osclist.append((osc, index, gen, active))
            else:
                panic('osc not found:', obd_uuid)

    def get_uuid(self):
        return self.uuid

    def get_name(self):
        return self.name
    def prepare(self):
        if not config.record and is_prepared(self.name):
            return
        self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
                  self.stripe_off, self.pattern, self.devlist,
                  self.mds_name)
        lctl.lov_setup(self.name, self.uuid, self.desc_uuid, self.stripe_cnt,
                       self.stripe_sz, self.stripe_off, self.pattern,
                       string.join(self.obdlist))
        for (osc, index, gen, active) in self.osclist:
            target_uuid = osc.target_uuid
            try:
                # Only ignore connect failures with --force, which
                # isn't implemented here yet.
                osc.active = active
                osc.prepare(ignore_connect_failure=0)
            except CommandError, e:
                print "Error preparing OSC %s\n" % osc.uuid
                raise e
            lctl.lov_add_obd(self.name, self.uuid, target_uuid, index, gen)

    def cleanup(self):
        for (osc, index, gen, active) in self.osclist:
            target_uuid = osc.target_uuid
            osc.cleanup()
        if is_prepared(self.name):
            Module.cleanup(self)
        if self.config_only:
            panic("Can't clean up config_only LOV ", self.name)

    def add_module(self, manager):
        if self.config_only:
            panic("Can't load modules for config_only LOV ", self.name)
        for (osc, index, gen, active) in self.osclist:
            osc.add_module(manager)
            break
        manager.add_lustre_module('lov', 'lov')

    def correct_level(self, level, op=None):
        return level

class LMV(Module):
    def __init__(self, db, uuid, fs_name, name_override = None):
        Module.__init__(self, 'LMV', db)
        if name_override != None:
            self.name = "lmv_%s" % name_override

        self.devlist = self.db.get_lmv_tgts('lmv_tgt')
        if self.devlist == None:
            self.devlist = self.db.get_refs('mds')

        self.mdclist = []
        self.desc_uuid = self.uuid
        self.uuid = uuid
        self.fs_name = fs_name
        for mds_uuid in self.devlist:
            mds = self.db.lookup(mds_uuid)
            if not mds:
                panic("MDS not found!")
            mdc = MDC(mds, self.uuid, fs_name)
            if mdc:
                self.mdclist.append(mdc)
            else:
                panic('mdc not found:', mds_uuid)

    def prepare(self):
        if is_prepared(self.name):
            return

        self.info()
        for mdc in self.mdclist:
            try:
                # Only ignore connect failures with --force, which
                # isn't implemented here yet.
                mdc.prepare(ignore_connect_failure=0)
            except CommandError, e:
                print "Error preparing LMV %s\n" % mdc.uuid
                raise e

        lctl.lmv_setup(self.name, self.uuid, self.desc_uuid,
                       string.join(self.devlist))

    def cleanup(self):
        for mdc in self.mdclist:
            mdc.cleanup()
        if is_prepared(self.name):
            Module.cleanup(self)

    def add_module(self, manager):
        for mdc in self.mdclist:
            mdc.add_module(manager)
            break
        manager.add_lustre_module('lmv', 'lmv')

    def correct_level(self, level, op=None):
        return level

class CONFDEV(Module):
    def __init__(self, db, name, target_uuid, uuid):
        Module.__init__(self, 'CONFDEV', db)
        self.devpath = self.db.get_val('devpath', '')
        self.backdevpath = self.db.get_val('devpath', '')
        self.size = self.db.get_val_int('devsize', 0)
        self.journal_size = self.db.get_val_int('journalsize', 0)
        self.fstype = self.db.get_val('fstype', '')
        self.backfstype = self.db.get_val('backfstype', '')
        self.mkfsoptions = self.db.get_val('mkfsoptions', '')
        self.mountfsoptions = self.db.get_val('mountfsoptions', '')
        self.target = self.db.lookup(target_uuid)
        self.name = "conf_%s" % self.target.getName()
        self.client_uuids = self.target.get_refs('client')
        self.obdtype = self.db.get_val('obdtype', '')

        if self.obdtype == None:
            self.obdtype = 'dumb'

        self.conf_name = name
        self.conf_uuid = uuid
        self.realdev = self.devpath

        self.lmv = None
        self.master = None

        lmv_uuid = self.db.get_first_ref('lmv')
        if lmv_uuid != None:
            self.lmv = self.db.lookup(lmv_uuid)
            if self.lmv != None:
                self.client_uuids = self.lmv.get_refs('client')

        if self.target.get_class() == 'mds':
            if self.target.get_val('failover', 0):
                self.failover_mds = 'f'
            else:
                self.failover_mds = 'n'
            self.format = self.db.get_val('autoformat', "no")
        else:
            self.format = self.db.get_val('autoformat', "yes")
            self.osdtype = self.db.get_val('osdtype')
            ost = self.db.lookup(target_uuid)
            if ost.get_val('failover', 0):
                self.failover_ost = 'f'
            else:
                self.failover_ost = 'n'

        self.inode_size = self.get_inode_size()

        if self.lmv != None:
            client_uuid = self.name + "_lmv_UUID"
            self.master = LMV(self.lmv, client_uuid, self.conf_name,
                              self.conf_name)
    def get_inode_size(self):
        inode_size = self.db.get_val_int('inodesize', 0)
        if inode_size == 0 and self.target.get_class() == 'mds':
            # default inode size for case when neither LOV nor
            # LMV is accessible.
            inode_size = 256

            # find the LOV for this MDS
            lovconfig_uuid = self.target.get_first_ref('lovconfig')
            if lovconfig_uuid or self.lmv != None:
                if self.lmv != None:
                    lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
                    lovconfig = self.lmv.lookup(lovconfig_uuid)
                    lov_uuid = lovconfig.get_first_ref('lov')
                    if lov_uuid == None:
                        panic(self.target.getName() + ": No LOV found for lovconfig ",
                              lovconfig.name)
                else:
                    lovconfig = self.target.lookup(lovconfig_uuid)
                    lov_uuid = lovconfig.get_first_ref('lov')
                    if lov_uuid == None:
                        panic(self.target.getName() + ": No LOV found for lovconfig ",
                              lovconfig.name)

                    if self.lmv != None:
                        lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
                        lovconfig = self.lmv.lookup(lovconfig_uuid)
                        lov_uuid = lovconfig.get_first_ref('lov')

                lov = LOV(self.db.lookup(lov_uuid), lov_uuid, self.name,
                          config_only = 1)

                # default stripe count controls default inode_size
                if lov.stripe_cnt > 0:
                    stripe_count = lov.stripe_cnt
                else:
                    stripe_count = len(lov.devlist)
                if stripe_count > 77:
                    inode_size = 4096
                elif stripe_count > 35:
                    inode_size = 2048
                elif stripe_count > 13:
                    inode_size = 1024
                elif stripe_count > 3:
                    inode_size = 512
                else:
                    inode_size = 256

        return inode_size

    def get_mount_options(self, blkdev):
        options = def_mount_options(self.fstype,
                                    self.target.get_class())

        if config.mountfsoptions:
            if options:
                options = "%s,%s" % (options, config.mountfsoptions)
            else:
                options = config.mountfsoptions
            if self.mountfsoptions:
                options = "%s,%s" % (options, self.mountfsoptions)
        else:
            if self.mountfsoptions:
                if options:
                    options = "%s,%s" % (options, self.mountfsoptions)
                else:
                    options = self.mountfsoptions

        if self.fstype == 'smfs':
            if options:
                options = "%s,type=%s,dev=%s" % (options, self.backfstype,
                                                 blkdev)
            else:
                options = "type=%s,dev=%s" % (self.backfstype, blkdev)

        if self.target.get_class() == 'mds':
            if options:
                options = "%s,acl,user_xattr,iopen_nopriv" % (options)
            else:
                options = "iopen_nopriv"

        return options
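    # Illustrative note (hypothetical case): for an ext3 MDS with no extra
    # options configured, get_mount_options() composes roughly
    #
    #     'errors=remount-ro,acl,user_xattr,iopen_nopriv'
    #
    # i.e. def_mount_options() first, then config/db options, then the
    # smfs and mds-specific suffixes.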
    def prepare(self):
        if is_prepared(self.name):
            return

        blkdev = block_dev(self.devpath, self.size, self.fstype,
                           config.reformat, self.format, self.journal_size,
                           self.inode_size, self.mkfsoptions, self.backfstype,
                           self.backdevpath)

        # for both smfs and regular targets the confobd runs on the
        # device returned by block_dev()
        realdev = blkdev

        mountfsoptions = self.get_mount_options(blkdev)

        self.info(self.target.get_class(), realdev, mountfsoptions,
                  self.fstype, self.size, self.format)

        lctl.newdev("confobd", self.name, self.uuid,
                    setup = "%s %s %s" % (realdev, self.fstype,
                                          mountfsoptions))

        self.mountfsoptions = mountfsoptions
        self.realdev = realdev

    def add_module(self, manager):
        manager.add_lustre_module('obdclass', 'confobd')

    def write_conf(self):
        if self.target.get_class() == 'ost':
            config.record = 1
            lctl.clear_log(self.name, self.target.getName() + '-conf')
            lctl.record(self.name, self.target.getName() + '-conf')
            lctl.newdev(self.osdtype, self.conf_name, self.conf_uuid,
                        setup = "%s %s %s %s" % (self.realdev, self.fstype,
                                                 self.failover_ost,
                                                 self.mountfsoptions))
            lctl.end_record()

            lctl.clear_log(self.name, 'OSS-conf')
            lctl.record(self.name, 'OSS-conf')
            lctl.newdev("ost", 'OSS', 'OSS_UUID', setup = "")
            lctl.end_record()
            config.record = 0
            return

        if self.target.get_class() == 'mds':
            if self.master != None:
                master_name = self.master.name
            else:
                master_name = 'dumb'

            config.record = 1
            lctl.clear_log(self.name, self.target.getName() + '-conf')
            lctl.record(self.name, self.target.getName() + '-conf')
            lctl.newdev("mds", self.conf_name, self.conf_uuid,
                        setup = "%s %s %s %s %s %s" % (self.realdev, self.fstype,
                                                       self.conf_name,
                                                       self.mountfsoptions,
                                                       master_name, self.obdtype))
            lctl.end_record()
            config.record = 0

        if not self.client_uuids:
            return 0

        for uuid in self.client_uuids:
            log("recording client:", uuid)
            client_uuid = generate_client_uuid(self.name)
            client = VOSC(self.db.lookup(uuid), client_uuid,
                          self.target.getName(), self.name)
            config.record = 1
            lctl.clear_log(self.name, self.target.getName())
            lctl.record(self.name, self.target.getName())
            client.prepare()
            lctl.mount_option(self.target.getName(), client.get_name(), "")
            lctl.end_record()

            config.cleanup = 1
            lctl.clear_log(self.name, self.target.getName() + '-clean')
            lctl.record(self.name, self.target.getName() + '-clean')
            client.cleanup()
            lctl.del_mount_option(self.target.getName())
            lctl.end_record()
            config.cleanup = 0
            config.record = 0

        if config.record:
            return

        # record logs for each client
        if config.ldapurl:
            config_options = "--ldapurl " + config.ldapurl + \
                             " --config " + config.config
        else:
            config_options = CONFIG_FILE

        for node_db in self.db.lookup_class('node'):
            client_name = node_db.getName()
            for prof_uuid in node_db.get_refs('profile'):
                prof_db = node_db.lookup(prof_uuid)
                # refactor this into a function to test "clientness"
                # of a node.
                for ref_class, ref_uuid in prof_db.get_all_refs():
                    if ref_class in ('mountpoint', 'echoclient'):
                        debug("recording", client_name)
                        old_noexec = config.noexec
                        config.noexec = 0
                        noexec_opt = ('', '-n')
                        ret, out = run(sys.argv[0],
                                       noexec_opt[old_noexec == 1],
                                       " -v --record --nomod",
                                       "--record_log", client_name,
                                       "--record_device", self.name,
                                       "--node", client_name,
                                       config_options)
                        if config.verbose:
                            for s in out:
                                log("record> ", string.strip(s))
                        ret, out = run(sys.argv[0],
                                       noexec_opt[old_noexec == 1],
                                       "--cleanup -v --record --nomod",
                                       "--record_log", client_name + "-clean",
                                       "--record_device", self.name,
                                       "--node", client_name,
                                       config_options)
                        if config.verbose:
                            for s in out:
                                log("record> ", string.strip(s))
                        config.noexec = old_noexec

    def start(self):
        try:
            lctl.start(self.name, self.conf_name)
        except CommandError, e:
            raise e
        if self.target.get_class() == 'ost':
            if not is_prepared('OSS'):
                try:
                    lctl.start(self.name, 'OSS')
                except CommandError, e:
                    raise e

    def cleanup(self):
        if is_prepared(self.name):
            try:
                lctl.cleanup(self.name, self.uuid, 0, 0)
                clean_dev(self.devpath, self.fstype,
                          self.backfstype, self.backdevpath)
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
                e.dump()
                cleanup_error(e.rc)
            Module.cleanup(self)
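# Illustrative note: write_conf() drives the same record machinery the
# command line exposes; the nested invocations above are equivalent to
# running, with hypothetical names,
#
#     lconf -v --record --nomod --record_log client1 \
#           --record_device conf_mds1 --node client1 config.xml
#
# once for setup and once more with --cleanup for the "-clean" log.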
class MDSDEV(Module):
    def __init__(self, db):
        Module.__init__(self, 'MDSDEV', db)
        self.devpath = self.db.get_val('devpath', '')
        self.backdevpath = self.db.get_val('devpath', '')
        self.size = self.db.get_val_int('devsize', 0)
        self.journal_size = self.db.get_val_int('journalsize', 0)
        self.fstype = self.db.get_val('fstype', '')
        self.backfstype = self.db.get_val('backfstype', '')
        self.nspath = self.db.get_val('nspath', '')
        self.mkfsoptions = self.db.get_val('mkfsoptions', '')
        self.mountfsoptions = self.db.get_val('mountfsoptions', '')
        self.obdtype = self.db.get_val('obdtype', '')
        self.root_squash = self.db.get_val('root_squash', '')
        self.no_root_squash = self.db.get_val('no_root_squash', '')

        target_uuid = self.db.get_first_ref('target')
        self.target = self.db.lookup(target_uuid)
        self.name = self.target.getName()
        self.master = None
        self.lmv = None

        lmv_uuid = self.db.get_first_ref('lmv')
        if lmv_uuid != None:
            self.lmv = self.db.lookup(lmv_uuid)

        active_uuid = get_active_target(self.target)
        if not active_uuid:
            panic("No target device found:", target_uuid)
        if active_uuid == self.uuid:
            self.active = 1
            group = self.target.get_val('group')
            if config.group and config.group != group:
                self.active = 0
        else:
            self.active = 0

        self.uuid = target_uuid

        # setup LMV
        if self.lmv != None:
            client_uuid = self.name + "_lmv_UUID"
            self.master = LMV(self.lmv, client_uuid, self.name, self.name)

        self.confobd = CONFDEV(self.db, self.name, target_uuid, self.uuid)

    def add_module(self, manager):
        if self.active:
            manager.add_lustre_module('mdc', 'mdc')
            manager.add_lustre_module('osc', 'osc')
            manager.add_lustre_module('ost', 'ost')
            manager.add_lustre_module('lov', 'lov')
            manager.add_lustre_module('mds', 'mds')

            if self.fstype == 'smfs' or self.fstype == 'ldiskfs':
                manager.add_lustre_module(self.fstype, self.fstype)

            if self.fstype:
                manager.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))

            # if fstype is smfs, then we should also take care about the
            # backing store fs.
            if self.fstype == 'smfs':
                manager.add_lustre_module(self.backfstype, self.backfstype)
                manager.add_lustre_module('lvfs', 'fsfilt_%s' % (self.backfstype))

            for option in string.split(self.mountfsoptions, ','):
                if option == 'snap':
                    if not self.fstype == 'smfs':
                        panic("mountoptions has 'snap', but fstype is not smfs.")
                    manager.add_lustre_module('lvfs', 'fsfilt_snap_%s' %
                                              (self.fstype))
                    manager.add_lustre_module('lvfs', 'fsfilt_snap_%s' %
                                              (self.backfstype))

        # add LMV modules
        if self.master != None:
            self.master.add_module(manager)

        # add CONFOBD modules
        if self.confobd != None:
            self.confobd.add_module(manager)

    def write_conf(self):
        if is_prepared(self.name):
            return
        if not self.active:
            debug(self.uuid, "not active")
            return
        run_acceptors()
        self.confobd.prepare()
        self.confobd.write_conf()
        self.confobd.cleanup()

    def prepare(self):
        if is_prepared(self.name):
            return
        if not self.active:
            debug(self.uuid, "not active")
            return
        run_acceptors()

        self.confobd.prepare()
        if config.reformat:
            self.confobd.write_conf()

        # prepare LMV
        if self.master != None:
            self.master.prepare()

        lctl.attach("mds", self.name, self.uuid)
        if config.mds_mds_sec:
            lctl.set_security(self.name, "mds_mds_sec", config.mds_mds_sec)
        if config.mds_ost_sec:
            lctl.set_security(self.name, "mds_ost_sec", config.mds_ost_sec)
        lctl.detach(self.name)

        if not config.record:
            self.confobd.start()

        if not is_prepared('MDT'):
            lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup = "")

        if development_mode():
            procentry = "/proc/fs/lustre/mds/lsd_upcall"
            upcall = os.path.abspath(os.path.dirname(sys.argv[0]) +
                                     "/lsd_upcall")
            if not (os.access(procentry, os.R_OK) and
                    os.access(upcall, os.R_OK)):
                print "MDS Warning: failed to set lsd cache upcall"
            else:
                run("echo ", upcall, " > ", procentry)

        if config.root_squash == None:
            config.root_squash = self.root_squash
        if config.no_root_squash == None:
            config.no_root_squash = self.no_root_squash
        if config.root_squash:
            if config.no_root_squash:
                nsnid = config.no_root_squash
            else:
                nsnid = "0"
            lctl.root_squash(self.name, config.root_squash, nsnid)

    def mds_remaining(self):
        out = lctl.device_list()
        for s in out:
            if string.split(s)[2] in ('mds',):
                return 1

    def safe_to_clean(self):
        return self.active

    def safe_to_clean_modules(self):
        return not self.mds_remaining()
    def cleanup(self):
        if not self.active:
            debug(self.uuid, "not active")
            return
        self.info()
        if is_prepared(self.name):
            try:
                lctl.cleanup(self.name, self.uuid, config.force,
                             config.failover)
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
                e.dump()
                cleanup_error(e.rc)
            Module.cleanup(self)

        # cleanup LMV
        if self.master != None:
            self.master.cleanup()

        if not self.mds_remaining() and is_prepared('MDT'):
            try:
                lctl.cleanup("MDT", "MDT_UUID", config.force,
                             config.failover)
            except CommandError, e:
                print "cleanup failed: ", self.name
                e.dump()
                cleanup_error(e.rc)

        if self.confobd:
            self.confobd.cleanup()

    def correct_level(self, level, op=None):
        #if self.master != None:
        #    level = level + 2
        return level

class OSD(Module):
    def __init__(self, db):
        Module.__init__(self, 'OSD', db)
        self.osdtype = self.db.get_val('osdtype')
        self.devpath = self.db.get_val('devpath', '')
        self.backdevpath = self.db.get_val('devpath', '')
        self.size = self.db.get_val_int('devsize', 0)
        self.journal_size = self.db.get_val_int('journalsize', 0)
        self.inode_size = self.db.get_val_int('inodesize', 0)
        self.mkfsoptions = self.db.get_val('mkfsoptions', '')
        self.mountfsoptions = self.db.get_val('mountfsoptions', '')
        self.fstype = self.db.get_val('fstype', '')
        self.backfstype = self.db.get_val('backfstype', '')
        self.nspath = self.db.get_val('nspath', '')
        target_uuid = self.db.get_first_ref('target')
        ost = self.db.lookup(target_uuid)
        self.name = ost.getName()
        self.format = self.db.get_val('autoformat', 'yes')
        if ost.get_val('failover', 0):
            self.failover_ost = 'f'
        else:
            self.failover_ost = 'n'

        active_uuid = get_active_target(ost)
        if not active_uuid:
            panic("No target device found:", target_uuid)
        if active_uuid == self.uuid:
            self.active = 1
            group = ost.get_val('group')
            if config.group and config.group != group:
                self.active = 0
        else:
            self.active = 0

        self.uuid = target_uuid
        self.confobd = CONFDEV(self.db, self.name,
                               target_uuid, self.uuid)

    def add_module(self, manager):
        if not self.active:
            return
        manager.add_lustre_module('ost', 'ost')

        if self.fstype == 'smfs' or self.fstype == 'ldiskfs':
            manager.add_lustre_module(self.fstype, self.fstype)

        if self.fstype:
            manager.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))

        if self.fstype == 'smfs':
            manager.add_lustre_module(self.backfstype, self.backfstype)
            manager.add_lustre_module('lvfs', 'fsfilt_%s' % (self.backfstype))

        for option in string.split(self.mountfsoptions, ','):
            if option == 'snap':
                if not self.fstype == 'smfs':
                    panic("mountoptions with snap, but fstype is not smfs\n")
                manager.add_lustre_module('lvfs', 'fsfilt_snap_%s' %
                                          (self.fstype))
                manager.add_lustre_module('lvfs', 'fsfilt_snap_%s' %
                                          (self.backfstype))

        manager.add_lustre_module(self.osdtype, self.osdtype)

        # add CONFOBD modules
        if self.confobd != None:
            self.confobd.add_module(manager)

    def prepare(self):
        if is_prepared(self.name):
            return
        if not self.active:
            debug(self.uuid, "not active")
            return

        run_acceptors()

        if self.osdtype == 'obdecho':
            self.info(self.osdtype)
            lctl.newdev("obdecho", self.name, self.uuid)
            if not is_prepared('OSS'):
                lctl.newdev("ost", 'OSS', 'OSS_UUID', setup = "")
        else:
            self.confobd.prepare()
            if config.reformat:
                self.confobd.write_conf()
            if not config.record:
                self.confobd.start()

    def write_conf(self):
        if is_prepared(self.name):
            return
        if not self.active:
            debug(self.uuid, "not active")
            return

        run_acceptors()
        if self.osdtype != 'obdecho':
            self.confobd.prepare()
            self.confobd.write_conf()
            if not config.write_conf:
                self.confobd.start()
            self.confobd.cleanup()

    def osd_remaining(self):
        out = lctl.device_list()
        for s in out:
            if string.split(s)[2] in ('obdfilter', 'obdecho'):
                return 1

    def safe_to_clean(self):
        return self.active

    def safe_to_clean_modules(self):
        return not self.osd_remaining()
is_prepared(self.name): self.info() try: lctl.cleanup(self.name, self.uuid, config.force, config.failover) except CommandError, e: log(self.module_name, "cleanup failed: ", self.name) e.dump() cleanup_error(e.rc) if not self.osd_remaining() and is_prepared('OSS'): try: lctl.cleanup("OSS", "OSS_UUID", config.force, config.failover) except CommandError, e: print "cleanup failed: ", self.name e.dump() cleanup_error(e.rc) if self.osdtype != 'obdecho': if self.confobd: self.confobd.cleanup() def correct_level(self, level, op=None): return level # Generic client module, used by OSC and MDC class Client(Module): def __init__(self, tgtdb, uuid, module, fs_name, self_name=None, module_dir=None): self.target_name = tgtdb.getName() self.target_uuid = tgtdb.getUUID() self.module_dir = module_dir self.backup_targets = [] self.module = module self.db = tgtdb self.tgt_dev_uuid = get_active_target(tgtdb) if not self.tgt_dev_uuid: panic("No target device found for target(1):", self.target_name) self._server = None self._connected = 0 self.module_name = string.upper(module) if not self_name: self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(), self.target_name, fs_name) else: self.name = self_name self.uuid = uuid self.lookup_server(self.tgt_dev_uuid) self.lookup_backup_targets() self.fs_name = fs_name if not self.module_dir: self.module_dir = module def add_module(self, manager): manager.add_lustre_module(self.module_dir, self.module) def lookup_server(self, srv_uuid): """ Lookup a server's network information """ self._server_nets = get_ost_net(self.db, srv_uuid) if len(self._server_nets) == 0: panic ("Unable to find a server for:", srv_uuid) def get_name(self): return self.name def get_servers(self): return self._server_nets def lookup_backup_targets(self): """ Lookup alternative network information """ prof_list = toplustreDB.get_refs('profile') for prof_uuid in prof_list: prof_db = toplustreDB.lookup(prof_uuid) if not prof_db: panic("profile:", prof_uuid, "not found.") for ref_class, ref_uuid in prof_db.get_all_refs(): if ref_class in ('osd', 'mdsdev'): devdb = toplustreDB.lookup(ref_uuid) uuid = devdb.get_first_ref('target') if self.target_uuid == uuid and self.tgt_dev_uuid != ref_uuid: self.backup_targets.append(ref_uuid) def prepare(self, ignore_connect_failure = 0): self.info(self.target_uuid) if not config.record and is_prepared(self.name): self.cleanup() try: srv = choose_local_server(self.get_servers()) if srv: lctl.connect(srv) else: routes = find_route(self.get_servers()) if len(routes) == 0: panic ("no route to", self.target_uuid) for (srv, r) in routes: lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3]) except CommandError, e: if not ignore_connect_failure: raise e if srv: if self.target_uuid in config.inactive and self.permits_inactive(): debug("%s inactive" % self.target_uuid) inactive_p = "inactive" else: debug("%s active" % self.target_uuid) inactive_p = "" lctl.newdev(self.module, self.name, self.uuid, setup ="%s %s %s" % (self.target_uuid, srv.nid_uuid, inactive_p)) for tgt_dev_uuid in self.backup_targets: this_nets = get_ost_net(toplustreDB, tgt_dev_uuid) if len(this_nets) == 0: panic ("Unable to find a server for:", tgt_dev_uuid) srv = choose_local_server(this_nets) if srv: lctl.connect(srv) else: routes = find_route(this_nets); if len(routes) == 0: panic("no route to", tgt_dev_uuid) for (srv, r) in routes: lctl.add_route_host(r[0], 
srv.nid_uuid, r[1], r[3]) if srv: lctl.add_conn(self.name, srv.nid_uuid); def cleanup(self): if is_prepared(self.name): Module.cleanup(self) try: srv = choose_local_server(self.get_servers()) if srv: lctl.disconnect(srv) else: for (srv, r) in find_route(self.get_servers()): lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3]) except CommandError, e: log(self.module_name, "cleanup failed: ", self.name) e.dump() cleanup_error(e.rc) for tgt_dev_uuid in self.backup_targets: this_net = get_ost_net(toplustreDB, tgt_dev_uuid) srv = choose_local_server(this_net) if srv: lctl.disconnect(srv) else: for (srv, r) in find_route(this_net): lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3]) def correct_level(self, level, op=None): return level def deactivate(self): try: lctl.deactivate(self.name) except CommandError, e: log(self.module_name, "deactivate failed: ", self.name) e.dump() cleanup_error(e.rc) class MDC(Client): def __init__(self, db, uuid, fs_name): Client.__init__(self, db, uuid, 'mdc', fs_name) def permits_inactive(self): return 0 class OSC(Client): def __init__(self, db, uuid, fs_name): Client.__init__(self, db, uuid, 'osc', fs_name) def permits_inactive(self): return 1 class CMOBD(Module): def __init__(self, db): Module.__init__(self, 'CMOBD', db) self.name = self.db.getName(); self.uuid = generate_client_uuid(self.name) self.master_uuid = self.db.get_first_ref('masterobd') self.cache_uuid = self.db.get_first_ref('cacheobd') master_obd = self.db.lookup(self.master_uuid) if not master_obd: panic('master obd not found:', self.master_uuid) cache_obd = self.db.lookup(self.cache_uuid) if not cache_obd: panic('cache obd not found:', self.cache_uuid) self.master = None self.cache = None master_class = master_obd.get_class() cache_class = cache_obd.get_class() if master_class == 'ost' or master_class == 'lov': client_uuid = "%s_lov_master_UUID" % (self.name) self.master = LOV(master_obd, client_uuid, self.name); elif master_class == 'mds': self.master = get_mdc(db, self.name, self.master_uuid) elif master_class == 'lmv': client_uuid = "%s_lmv_master_UUID" % (self.name) self.master = LMV(master_obd, client_uuid, self.name); else: panic("unknown master obd class '%s'" %(master_class)) if cache_class == 'ost' or cache_class == 'lov': client_uuid = "%s_lov_cache_UUID" % (self.name) self.cache = LOV(cache_obd, client_uuid, self.name); elif cache_class == 'mds': self.cache = get_mdc(db, self.name, self.cache_uuid) elif cache_class == 'lmv': client_uuid = "%s_lmv_cache_UUID" % (self.name) self.cache = LMV(cache_obd, client_uuid, self.name); else: panic("unknown cache obd class '%s'" %(cache_class)) def prepare(self): self.master.prepare() if not config.record and is_prepared(self.name): return self.info(self.master_uuid, self.cache_uuid) lctl.newdev("cmobd", self.name, self.uuid, setup ="%s %s" %(self.master.uuid, self.cache.uuid)) def get_uuid(self): return self.uuid def get_name(self): return self.name def get_master_name(self): return self.master.name def get_cache_name(self): return self.cache.name def cleanup(self): if is_prepared(self.name): Module.cleanup(self) if self.master: self.master.cleanup() def add_module(self, manager): manager.add_lustre_module('cmobd', 'cmobd') self.master.add_module(manager) def correct_level(self, level, op=None): return level class COBD(Module): def __init__(self, db, uuid, name): Module.__init__(self, 'COBD', db) self.name = self.db.getName(); self.uuid = generate_client_uuid(self.name) self.master_uuid = self.db.get_first_ref('masterobd') self.cache_uuid = 
self.db.get_first_ref('cacheobd') master_obd = self.db.lookup(self.master_uuid) if not master_obd: panic('master obd not found:', self.master_uuid) cache_obd = self.db.lookup(self.cache_uuid) if not cache_obd: panic('cache obd not found:', self.cache_uuid) self.master = None self.cache = None master_class = master_obd.get_class() cache_class = cache_obd.get_class() if master_class == 'ost' or master_class == 'lov': client_uuid = "%s_lov_master_UUID" % (self.name) self.master = LOV(master_obd, client_uuid, name); elif master_class == 'mds': self.master = get_mdc(db, name, self.master_uuid) elif master_class == 'lmv': client_uuid = "%s_lmv_master_UUID" % (self.name) self.master = LMV(master_obd, client_uuid, self.name); else: panic("unknown master obd class '%s'" %(master_class)) if cache_class == 'ost' or cache_class == 'lov': client_uuid = "%s_lov_cache_UUID" % (self.name) self.cache = LOV(cache_obd, client_uuid, name); elif cache_class == 'mds': self.cache = get_mdc(db, name, self.cache_uuid) elif cache_class == 'lmv': client_uuid = "%s_lmv_cache_UUID" % (self.name) self.cache = LMV(cache_obd, client_uuid, self.name); else: panic("unknown cache obd class '%s'" %(cache_class)) def get_uuid(self): return self.uuid def get_name(self): return self.name def get_master_name(self): return self.master.name def get_cache_name(self): return self.cache.name def prepare(self): self.master.prepare() self.cache.prepare() if not config.record and is_prepared(self.name): return self.info(self.master_uuid, self.cache_uuid) lctl.newdev("cobd", self.name, self.uuid, setup ="%s %s" %(self.master.name, self.cache.name)) def cleanup(self): if is_prepared(self.name): Module.cleanup(self) self.master.cleanup() self.cache.cleanup() def add_module(self, manager): manager.add_lustre_module('cobd', 'cobd') self.master.add_module(manager) # virtual interface for OSC and LOV class VOSC(Module): def __init__(self, db, client_uuid, name, name_override = None): Module.__init__(self, 'VOSC', db) if db.get_class() == 'lov': self.osc = LOV(db, client_uuid, name, name_override) self.type = 'lov' elif db.get_class() == 'cobd': self.osc = COBD(db, client_uuid, name) self.type = 'cobd' else: self.osc = OSC(db, client_uuid, name) self.type = 'osc' def get_uuid(self): return self.osc.get_uuid() def get_name(self): return self.osc.get_name() def prepare(self): self.osc.prepare() def cleanup(self): self.osc.cleanup() def add_module(self, manager): self.osc.add_module(manager) def correct_level(self, level, op=None): return self.osc.correct_level(level, op) # virtual interface for MDC and LMV class VMDC(Module): def __init__(self, db, client_uuid, name, name_override = None): Module.__init__(self, 'VMDC', db) if db.get_class() == 'lmv': self.mdc = LMV(db, client_uuid, name, name_override) elif db.get_class() == 'cobd': self.mdc = COBD(db, client_uuid, name) else: self.mdc = MDC(db, client_uuid, name) def get_uuid(self): return self.mdc.uuid def get_name(self): return self.mdc.name def prepare(self): self.mdc.prepare() def cleanup(self): self.mdc.cleanup() def add_module(self, manager): self.mdc.add_module(manager) def correct_level(self, level, op=None): return self.mdc.correct_level(level, op) class ECHO_CLIENT(Module): def __init__(self,db): Module.__init__(self, 'ECHO_CLIENT', db) self.obd_uuid = self.db.get_first_ref('obd') obd = self.db.lookup(self.obd_uuid) self.uuid = generate_client_uuid(self.name) self.osc = VOSC(obd, self.uuid, self.name) def prepare(self): if not config.record and is_prepared(self.name): return 
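# The VOSC and VMDC wrappers above are thin dispatchers: they look at the
# class of the config record and hand every operation to the matching
# concrete client. A minimal standalone sketch of the same pattern (the
# names below are illustrative only, not part of lconf):
def _vclient_dispatch_demo(db_class):
    # mirrors VOSC's lov/cobd/else switch and VMDC's lmv/cobd/else switch
    table = {'lov': 'LOV', 'cobd': 'COBD', 'lmv': 'LMV'}
    return table.get(db_class, 'plain OSC/MDC client')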
run_acceptors() self.osc.prepare() # XXX This is so cheating. -p self.info(self.obd_uuid) lctl.newdev("echo_client", self.name, self.uuid, setup = self.osc.get_name()) def cleanup(self): if is_prepared(self.name): Module.cleanup(self) self.osc.cleanup() def add_module(self, manager): self.osc.add_module(manager) manager.add_lustre_module('obdecho', 'obdecho') def correct_level(self, level, op=None): return level def generate_client_uuid(name): client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576), name, int(random.random() * 1048576), int(random.random() * 1048576)) return client_uuid[:36] class Mountpoint(Module): def __init__(self,db): Module.__init__(self, 'MTPT', db) self.path = self.db.get_val('path') self.clientoptions = self.db.get_val('clientoptions', '') self.fs_uuid = self.db.get_first_ref('filesystem') fs = self.db.lookup(self.fs_uuid) self.mds_uuid = fs.get_first_ref('lmv') if not self.mds_uuid: self.mds_uuid = fs.get_first_ref('mds') self.obd_uuid = fs.get_first_ref('obd') client_uuid = generate_client_uuid(self.name) ost = self.db.lookup(self.obd_uuid) if not ost: panic("no ost: ", self.obd_uuid) mds = self.db.lookup(self.mds_uuid) if not mds: panic("no mds: ", self.mds_uuid) self.vosc = VOSC(ost, client_uuid, self.name, self.name) self.vmdc = VMDC(mds, client_uuid, self.name, self.name) def prepare(self): if not config.record and fs_is_mounted(self.path): log(self.path, "already mounted.") return run_acceptors() self.vosc.prepare() self.vmdc.prepare() self.info(self.path, self.mds_uuid, self.obd_uuid) if config.record or config.lctl_dump: lctl.mount_option(local_node_name, self.vosc.get_name(), self.vmdc.get_name()) return if config.clientoptions: if self.clientoptions: self.clientoptions = self.clientoptions + ',' + config.clientoptions else: self.clientoptions = config.clientoptions if self.clientoptions: self.clientoptions = ',' + self.clientoptions # Linux kernel will deal with async and not pass it to ll_fill_super, # so replace it with Lustre async self.clientoptions = string.replace(self.clientoptions, "async", "lasync") if not config.sec: config.sec = "null" cmd = "mount -t lustre_lite -o osc=%s,mdc=%s,sec=%s%s %s %s" % \ (self.vosc.get_name(), self.vmdc.get_name(), config.sec, self.clientoptions, config.config, self.path) run("mkdir", self.path) ret, val = run(cmd) if ret: self.vmdc.cleanup() self.vosc.cleanup() panic("mount failed:", self.path, ":", string.join(val)) def cleanup(self): self.info(self.path, self.mds_uuid, self.obd_uuid) if config.record or config.lctl_dump: lctl.del_mount_option(local_node_name) else: if fs_is_mounted(self.path): if config.force: (rc, out) = run("umount", "-f", self.path) else: (rc, out) = run("umount", self.path) if rc: raise CommandError('umount', out, rc) if fs_is_mounted(self.path): panic("fs is still mounted:", self.path) self.vmdc.cleanup() self.vosc.cleanup() def add_module(self, manager): self.vosc.add_module(manager) self.vmdc.add_module(manager) manager.add_lustre_module('llite', 'llite') def correct_level(self, level, op=None): return level # ============================================================ # misc query functions def get_ost_net(self, osd_uuid): srv_list = [] if not osd_uuid: return srv_list osd = self.lookup(osd_uuid) node_uuid = osd.get_first_ref('node') node = self.lookup(node_uuid) if not node: panic("unable to find node for osd_uuid:", osd_uuid, " node_ref:", node_uuid) for net_uuid in node.get_networks(): db = node.lookup(net_uuid) srv_list.append(Network(db)) return srv_list # the order 
of initialization is based on level. def getServiceLevel(self): type = self.get_class() ret=0; if type in ('network',): ret = 5 elif type in ('routetbl',): ret = 6 elif type in ('ldlm',): ret = 20 elif type in ('osd', 'cobd'): ret = 30 elif type in ('mdsdev',): ret = 40 elif type in ('lmv',): ret = 45 elif type in ('cmobd',): ret = 50 elif type in ('mountpoint', 'echoclient'): ret = 70 else: panic("Unknown type: ", type) if ret < config.minlevel or ret > config.maxlevel: ret = 0 return ret # # return list of services in a profile. list is a list of tuples # [(level, db_object),] def getServices(self): list = [] for ref_class, ref_uuid in self.get_all_refs(): servdb = self.lookup(ref_uuid) if servdb: level = getServiceLevel(servdb) if level > 0: list.append((level, servdb)) else: panic('service not found: ' + ref_uuid) list.sort() return list ############################################################ # MDC UUID hack - # FIXME: clean this mess up! # # OSC is no longer in the xml, so we have to fake it. # this is getting ugly and begging for another refactoring def get_osc(ost_db, uuid, fs_name): osc = OSC(ost_db, uuid, fs_name) return osc def get_mdc(db, fs_name, mds_uuid): mds_db = db.lookup(mds_uuid); if not mds_db: panic("no mds:", mds_uuid) mdc = MDC(mds_db, mds_uuid, fs_name) return mdc ############################################################ # routing ("rooting") # list of (nettype, cluster_id, nid) local_clusters = [] def find_local_clusters(node_db): global local_clusters for netuuid in node_db.get_networks(): net = node_db.lookup(netuuid) srv = Network(net) debug("add_local", netuuid) local_clusters.append((srv.net_type, srv.cluster_id, srv.nid)) if srv.port > 0: if not acceptors.has_key(srv.port): acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type) # This node is a gateway. is_router = 0 def node_is_router(): return is_router # If there are any routers found in the config, then this will be true # and all nodes will load kptlrouter. needs_router = 0 def node_needs_router(): return needs_router or is_router # list of (nettype, gw, tgt_cluster_id, lo, hi) # Currently, these local routes are only added to kptlrouter route # table if they are needed to connect to a specific server. This # should be changed so all available routes are loaded, and the # ptlrouter can make all the decisions. local_routes = [] def find_local_routes(lustre): """ Scan the lustre config looking for routers. Build a list of routes. 
""" global local_routes, needs_router local_routes = [] list = lustre.lookup_class('node') for router in list: if router.get_val_int('router', 0): needs_router = 1 for (local_type, local_cluster_id, local_nid) in local_clusters: gw = None for netuuid in router.get_networks(): db = router.lookup(netuuid) if (local_type == db.get_val('nettype') and local_cluster_id == db.get_val('clusterid')): gw = db.get_val('nid') break if gw: debug("find_local_routes: gw is", gw) for route in router.get_local_routes(local_type, gw): local_routes.append(route) debug("find_local_routes:", local_routes) def choose_local_server(srv_list): for srv in srv_list: if local_cluster(srv.net_type, srv.cluster_id): return srv def local_cluster(net_type, cluster_id): for cluster in local_clusters: if net_type == cluster[0] and cluster_id == cluster[1]: return 1 return 0 def local_interface(net_type, cluster_id, nid): for cluster in local_clusters: if (net_type == cluster[0] and cluster_id == cluster[1] and nid == cluster[2]): return 1 return 0 def find_route(srv_list): result = [] frm_type = local_clusters[0][0] for srv in srv_list: debug("find_route: srv:", srv.nid, "type: ", srv.net_type) to_type = srv.net_type to = srv.nid cluster_id = srv.cluster_id debug ('looking for route to', to_type, to) for r in local_routes: debug("find_route: ", r) if (r[3] <= to and to <= r[4]) and cluster_id == r[2]: result.append((srv, r)) return result def get_active_target(db): target_uuid = db.getUUID() target_name = db.getName() node_name = get_select(target_name) if node_name: tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid) else: tgt_dev_uuid = db.get_first_ref('active') return tgt_dev_uuid def get_server_by_nid_uuid(db, nid_uuid): for n in db.lookup_class("network"): net = Network(n) if net.nid_uuid == nid_uuid: return net ############################################################ # lconf level logic # Start a service. def newService(db): type = db.get_class() debug('Service:', type, db.getName(), db.getUUID()) n = None if type == 'ldlm': n = LDLM(db) elif type == 'lov': n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID") elif type == 'network': n = Network(db) elif type == 'routetbl': n = RouteTable(db) elif type == 'osd': n = OSD(db) elif type == 'cobd': n = COBD(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID") elif type == 'cmobd': n = CMOBD(db) elif type == 'mdsdev': n = MDSDEV(db) elif type == 'mountpoint': n = Mountpoint(db) elif type == 'echoclient': n = ECHO_CLIENT(db) elif type == 'lmv': n = LMV(db) else: panic ("unknown service type:", type) return n # # Prepare the system to run lustre using a particular profile # in a the configuration. # * load & the modules # * setup networking for the current node # * make sure partitions are in place and prepared # * initialize devices with lctl # Levels is important, and needs to be enforced. def for_each_profile(db, prof_list, operation): for prof_uuid in prof_list: prof_db = db.lookup(prof_uuid) if not prof_db: panic("profile:", prof_uuid, "not found.") services = getServices(prof_db) operation(services) def magic_get_osc(db, rec, lov): if lov: lov_uuid = lov.get_uuid() lov_name = lov.osc.fs_name else: lov_uuid = rec.getAttribute('lov_uuidref') # FIXME: better way to find the mountpoint? 
filesystems = db.root_node.getElementsByTagName('filesystem') fsuuid = None for fs in filesystems: ref = fs.getElementsByTagName('obd_ref') if ref[0].getAttribute('uuidref') == lov_uuid: fsuuid = fs.getAttribute('uuid') break if not fsuuid: panic("malformed xml: lov uuid '" + lov_uuid + "' referenced in 'add' record is not used by any filesystems.") mtpts = db.root_node.getElementsByTagName('mountpoint') lov_name = None for fs in mtpts: ref = fs.getElementsByTagName('filesystem_ref') if ref[0].getAttribute('uuidref') == fsuuid: lov_name = fs.getAttribute('name') break if not lov_name: panic("malformed xml: 'add' record references lov uuid '" + lov_uuid + "', which references filesystem uuid '" + fsuuid + "', which does not reference a mountpoint.") print "lov_uuid: " + lov_uuid + "; lov_name: " + lov_name ost_uuid = rec.getAttribute('ost_uuidref') obd = db.lookup(ost_uuid) if not obd: panic("malformed xml: 'add' record references ost uuid '" + ost_uuid + "' which cannot be found.") osc = get_osc(obd, lov_uuid, lov_name) if not osc: panic('osc not found:', ost_uuid) return osc # write logs for update records. sadly, logs of all types -- and updates in # particular -- are something of an afterthought. lconf needs to be rewritten # with these as core concepts. so this is a pretty big hack. def process_update_record(db, update, lov): for rec in update.childNodes: if rec.nodeType != rec.ELEMENT_NODE: continue log("found "+rec.nodeName+" record in update version " + str(update.getAttribute('version'))) lov_uuid = rec.getAttribute('lov_uuidref') ost_uuid = rec.getAttribute('ost_uuidref') index = rec.getAttribute('index') gen = rec.getAttribute('generation') if not lov_uuid or not ost_uuid or not index or not gen: panic("malformed xml: 'update' record requires lov_uuid, ost_uuid, index, and generation.") if not lov: tmplov = db.lookup(lov_uuid) if not tmplov: panic("malformed xml: 'delete' record contains lov UUID '" + lov_uuid + "', which cannot be located.") lov_name = tmplov.getName() else: lov_name = lov.osc.name # ------------------------------------------------------------- add if rec.nodeName == 'add': if config.cleanup: lctl.lov_del_obd(lov_name, lov_uuid, ost_uuid, index, gen) continue osc = magic_get_osc(db, rec, lov) try: # Only ignore connect failures with --force, which # isn't implemented here yet. 
osc.prepare(ignore_connect_failure=0) except CommandError, e: print "Error preparing OSC %s" % osc.uuid raise e lctl.lov_add_obd(lov_name, lov_uuid, ost_uuid, index, gen) # ------------------------------------------------------ deactivate elif rec.nodeName == 'deactivate': if config.cleanup: continue osc = magic_get_osc(db, rec, lov) try: osc.deactivate() except CommandError, e: print "Error deactivating OSC %s" % osc.uuid raise e # ---------------------------------------------------------- delete elif rec.nodeName == 'delete': if config.cleanup: continue osc = magic_get_osc(db, rec, lov) try: config.cleanup = 1 osc.cleanup() config.cleanup = 0 except CommandError, e: print "Error cleaning up OSC %s" % osc.uuid raise e lctl.lov_del_obd(lov_name, lov_uuid, ost_uuid, index, gen) def process_updates(db, log_device, log_name, lov = None): updates = db.root_node.getElementsByTagName('update') for u in updates: if not u.childNodes: log("ignoring empty update record (version " + str(u.getAttribute('version')) + ")") continue version = u.getAttribute('version') real_name = "%s-%s" % (log_name, version) lctl.clear_log(log_device, real_name) lctl.record(log_device, real_name) process_update_record(db, u, lov) lctl.end_record() def doWriteconf(services): #if config.nosetup: # return for s in services: if s[1].get_class() == 'mdsdev' or s[1].get_class() == 'osd': n = newService(s[1]) n.write_conf() n.cleanup() def doSetup(services): if config.nosetup: return slist = [] for s in services: n = newService(s[1]) n.level = s[0] slist.append((n.level, n)) nlist = [] for n in slist: nl = n[1].correct_level(n[0]) nlist.append((nl, n[1])) nlist.sort() for n in nlist: n[1].prepare() def doLoadModules(services): if config.nomod: return # adding all needed modules from all services for s in services: n = newService(s[1]) n.add_module(mod_manager) # loading all registered modules mod_manager.load_modules() def doUnloadModules(services): if config.nomod: return # adding all needed modules from all services for s in services: n = newService(s[1]) if n.safe_to_clean_modules(): n.add_module(mod_manager) # unloading all registered modules mod_manager.cleanup_modules() def doCleanup(services): if config.nosetup: return slist = [] for s in services: n = newService(s[1]) n.level = s[0] slist.append((n.level, n)) nlist = [] for n in slist: nl = n[1].correct_level(n[0]) nlist.append((nl, n[1])) nlist.sort() nlist.reverse() for n in nlist: if n[1].safe_to_clean(): n[1].cleanup() # # Load profile for def doHost(lustreDB, hosts): global is_router, local_node_name node_db = None for h in hosts: node_db = lustreDB.lookup_name(h, 'node') if node_db: break if not node_db: panic('No host entry found.') local_node_name = node_db.get_val('name', 0) is_router = node_db.get_val_int('router', 0) lustre_upcall = node_db.get_val('lustreUpcall', '') portals_upcall = node_db.get_val('portalsUpcall', '') timeout = node_db.get_val_int('timeout', 0) ptldebug = node_db.get_val('ptldebug', '') subsystem = node_db.get_val('subsystem', '') find_local_clusters(node_db) if not is_router: find_local_routes(lustreDB) 
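# doSetup() and doCleanup() above order services purely by tuple sorting:
# bring services up in increasing (level, module) order, tear them down in
# the exact reverse. A standalone illustration (the levels and names here
# are examples only):
def _level_order_demo():
    services = [(40, 'mds'), (5, 'network'), (30, 'osd')]
    services.sort()
    setup_order = [name for (level, name) in services]    # network, osd, mds
    services.reverse()
    cleanup_order = [name for (level, name) in services]  # mds, osd, network
    return setup_order, cleanup_order
# Two step process: (1) load modules, (2) setup lustre
# if not cleaning, load modules first.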
prof_list = node_db.get_refs('profile') if config.write_conf: for_each_profile(node_db, prof_list, doLoadModules) sys_make_devices() for_each_profile(node_db, prof_list, doWriteconf) for_each_profile(node_db, prof_list, doUnloadModules) lustreDB.close() elif config.recover: if not (config.tgt_uuid and config.client_uuid and config.conn_uuid): raise Lustre.LconfError( "--recover requires --tgt_uuid " + "--client_uuid --conn_uuid ") doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid, config.conn_uuid) elif config.cleanup: if config.force: # the command line can override this value timeout = 5 # ugly hack, only need to run lctl commands for --dump if config.lctl_dump or config.record: for_each_profile(node_db, prof_list, doCleanup) return sys_set_timeout(timeout) sys_set_ptldebug(ptldebug) sys_set_subsystem(subsystem) sys_set_lustre_upcall(lustre_upcall) sys_set_portals_upcall(portals_upcall) for_each_profile(node_db, prof_list, doCleanup) for_each_profile(node_db, prof_list, doUnloadModules) lustreDB.close() else: # ugly hack, only need to run lctl commands for --dump if config.lctl_dump or config.record: sys_set_timeout(timeout) sys_set_lustre_upcall(lustre_upcall) for_each_profile(node_db, prof_list, doSetup) return sys_make_devices() sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF) sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF) for_each_profile(node_db, prof_list, doLoadModules) sys_set_debug_path() sys_set_ptldebug(ptldebug) sys_set_subsystem(subsystem) script = config.gdb_script run(lctl.lctl, ' modules >', script) if config.gdb: log ("The GDB module script is in", script) # pause, so user has time to break and # load the script time.sleep(5) sys_set_timeout(timeout) sys_set_lustre_upcall(lustre_upcall) sys_set_portals_upcall(portals_upcall) for_each_profile(node_db, prof_list, doSetup) lustreDB.close() def doRecovery(lustreDB, lctl, tgt_uuid, client_uuid, nid_uuid): tgt = lustreDB.lookup(tgt_uuid) if not tgt: raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.") new_uuid = get_active_target(tgt) if not new_uuid: raise Lustre.LconfError("doRecovery: no active target found for: " + tgt_uuid) net = choose_local_server(get_ost_net(lustreDB, new_uuid)) if not net: raise Lustre.LconfError("Unable to find a connection to:" + new_uuid) log("Reconnecting", tgt_uuid, " to ", net.nid_uuid); try: oldnet = get_server_by_nid_uuid(lustreDB, nid_uuid) lustreDB.close() if oldnet: lctl.disconnect(oldnet) except CommandError, e: log("recover: disconnect", nid_uuid, "failed: ") e.dump() try: lctl.connect(net) except CommandError, e: log("recover: connect failed") e.dump() lctl.recover(client_uuid, net.nid_uuid) def setupModulePath(cmd, portals_dir = PORTALS_DIR): base = os.path.dirname(cmd) if development_mode(): if not config.lustre: debug('using objdir module paths') config.lustre = (os.path.join(base, "..")) # normalize the portals dir, using command line arg if set if config.portals: portals_dir = config.portals dir = os.path.join(config.lustre, portals_dir) config.portals = dir debug('config.portals', config.portals) elif config.lustre and config.portals: # production mode # if --lustre and --portals, normalize portals # can ignore PORTALS_DIR here, since it is probably useless here config.portals = os.path.join(config.lustre, config.portals) debug('config.portals B', config.portals) def sysctl(path, val): debug("+ sysctl", path, val) if config.noexec: return try: fp = open(os.path.join('/proc/sys', path), 'w') fp.write(str(val)) fp.close() except 
IOError, e: panic(str(e)) def sys_set_debug_path(): sysctl('portals/debug_path', config.debug_path) def sys_set_lustre_upcall(upcall): # the command overrides the value in the node config if config.lustre_upcall: upcall = config.lustre_upcall elif config.upcall: upcall = config.upcall if upcall: lctl.set_lustre_upcall(upcall) def sys_set_portals_upcall(upcall): # the command overrides the value in the node config if config.portals_upcall: upcall = config.portals_upcall elif config.upcall: upcall = config.upcall if upcall: sysctl('portals/upcall', upcall) def sys_set_timeout(timeout): # the command overrides the value in the node config if config.timeout and config.timeout > 0: timeout = config.timeout if timeout != None and timeout > 0: lctl.set_timeout(timeout) def sys_tweak_socknal (): # reserve at least 8MB, or we run out of RAM in skb_alloc under read if sys_get_branch() == '2.6': fp = open('/proc/meminfo') lines = fp.readlines() fp.close() memtotal = 131072 for l in lines: a = string.split(l) if a[0] == 'MemTotal:': memtotal = a[1] debug("memtotal", memtotal) if int(memtotal) < 262144: minfree = int(memtotal) / 16 else: minfree = 32768 debug("+ minfree ", minfree) sysctl("vm/min_free_kbytes", minfree) if config.single_socket: sysctl("socknal/typed", 0) def sys_optimize_elan (): procfiles = ["/proc/elan/config/eventint_punt_loops", "/proc/qsnet/elan3/config/eventint_punt_loops", "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"] for p in procfiles: if os.access(p, os.W_OK): run ("echo 1 > " + p) def sys_set_ptldebug(ptldebug): if config.ptldebug: ptldebug = config.ptldebug if ptldebug: try: val = eval(ptldebug, ptldebug_names) val = "0x%x" % (val & 0xffffffffL) sysctl('portals/debug', val) except NameError, e: panic(str(e)) def sys_set_subsystem(subsystem): if config.subsystem: subsystem = config.subsystem if subsystem: try: val = eval(subsystem, subsystem_names) val = "0x%x" % (val & 0xffffffffL) sysctl('portals/subsystem_debug', val) except NameError, e: panic(str(e)) def sys_set_netmem_max(path, max): debug("setting", path, "to at least", max) if config.noexec: return fp = open(path) line = fp.readline() fp.close() cur = int(line) if max > cur: fp = open(path, 'w') fp.write('%d\n' %(max)) fp.close() def sys_make_devices(): if not os.access('/dev/portals', os.R_OK): run('mknod /dev/portals c 10 240') if not os.access('/dev/obd', os.R_OK): run('mknod /dev/obd c 10 241') 
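# sys_set_ptldebug() and sys_set_subsystem() above accept any python
# expression over the corresponding name table (ptldebug_names or
# subsystem_names) and reduce it to a 32-bit hex mask for /proc. A
# standalone illustration using a made-up two-entry table:
def _debug_mask_demo():
    names = {'trace': 1 << 0, 'inode': 1 << 1}
    val = eval('trace | inode', names)
    return "0x%x" % (val & 0xffffffffL)   # -> '0x3'
# Add dir to the global PATH, if not already there.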
def add_to_path(new_dir): syspath = string.split(os.environ['PATH'], ':') if new_dir in syspath: return os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir def default_debug_path(): path = '/tmp/lustre-log' if os.path.isdir('/r'): return '/r' + path else: return path def default_gdb_script(): script = '/tmp/ogdb' if os.path.isdir('/r'): return '/r' + script else: return script DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin') # ensure basic elements are in the system path def sanitise_path(): for dir in DEFAULT_PATH: add_to_path(dir) # global hack for the --select handling tgt_select = {} def init_select(args): # args = [service=nodeA,service2=nodeB service3=nodeC] global tgt_select for arg in args: list = string.split(arg, ',') for entry in list: srv, node = string.split(entry, '=') tgt_select[srv] = node def get_select(srv): if tgt_select.has_key(srv): return tgt_select[srv] return None FLAG = Lustre.Options.FLAG PARAM = Lustre.Options.PARAM INTPARAM = Lustre.Options.INTPARAM PARAMLIST = Lustre.Options.PARAMLIST lconf_options = [ ('verbose,v', "Print system commands as they are run"), ('ldapurl',"LDAP server URL, e.g. ldap://localhost", PARAM), ('config', "Cluster config name used for LDAP query", PARAM), ('select', "service=nodeA,service2=nodeB ", PARAMLIST), ('node', "Load config for ", PARAM), ('sec', "security flavor of client", PARAM), ('mds_mds_sec', "security flavor of inter mds's", PARAM), ('mds_ost_sec', "security flavor of mds's-ost's", PARAM), ('cleanup,d', "Cleans up config. (Shutdown)"), ('force,f', "Forced unmounting and/or obd detach during cleanup", FLAG, 0), ('single_socket', "socknal option: only use one socket instead of bundle", FLAG, 0), ('failover',"""Used to shut down without saving state. This will allow this node to "give up" a service to another node for failover purposes. This will not be a clean shutdown.""", FLAG, 0), ('gdb', """Prints message after creating gdb module script and sleeps for 5 seconds."""), ('noexec,n', """Prints the commands and steps that will be run for a config without executing them. This can be used to check if a config file is doing what it should be doing"""), ('nomod', "Skip load/unload module step."), ('nosetup', "Skip device setup/cleanup step."), ('reformat', "Reformat all devices (without question)"), ('mkfsoptions', "Additional options for the mk*fs command line", PARAM), ('mountfsoptions', "Additional options for mount fs command line", PARAM), ('clientoptions', "Additional options for Lustre", PARAM), ('dump', "Dump the kernel debug log to file before portals is unloaded", PARAM), ('write_conf', "Save all the client config information on mds."), ('record', "Write config information on mds."), ('record_log', "Name of config record log.", PARAM), ('record_device', "MDS device name that will record the config commands", PARAM), ('root_squash', "MDS squash root to appointed uid", PARAM), ('no_root_squash', "Don't squash root for appointed nid", PARAM), ('minlevel', "Minimum level of services to configure/cleanup", INTPARAM, 0), ('maxlevel', """Maximum level of services to configure/cleanup Levels are approximately like: 10 - network 20 - device, ldlm 30 - osd, mdd 40 - mds, ost 70 - mountpoint, echo_client, osc, mdc, lov""", INTPARAM, 100), ('lustre', """Base directory of lustre sources. This parameter will cause lconf to load modules from a source tree.""", PARAM), ('portals', """Portals source directory. If this is a relative path, then it is assumed to be relative to lustre. 
""", PARAM), ('timeout', "Set recovery timeout", INTPARAM), ('upcall', "Set both portals and lustre upcall script", PARAM), ('lustre_upcall', "Set lustre upcall script", PARAM), ('portals_upcall', "Set portals upcall script", PARAM), ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM), ('ptldebug', "Set the portals debug level", PARAM), ('subsystem', "Set the portals debug subsystem", PARAM), ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()), ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()), # Client recovery options ('recover', "Recover a device"), ('group', "The group of devices to configure or cleanup", PARAM), ('tgt_uuid', "The failed target (required for recovery)", PARAM), ('client_uuid', "The failed client (required for recovery)", PARAM), ('conn_uuid', "The failed connection (required for recovery)", PARAM), ('inactive', """The name of an inactive service, to be ignored during mounting (currently OST-only). Can be repeated.""", PARAMLIST), ] def main(): global lctl, config, toplustreDB, CONFIG_FILE, mod_manager # in the upcall this is set to SIG_IGN signal.signal(signal.SIGCHLD, signal.SIG_DFL) cl = Lustre.Options("lconf", "config.xml", lconf_options) try: config, args = cl.parse(sys.argv[1:]) except Lustre.OptionError, e: print e sys.exit(1) setupModulePath(sys.argv[0]) host = socket.gethostname() # the PRNG is normally seeded with time(), which is not so good for starting # time-synchronized clusters input = open('/dev/urandom', 'r') if not input: print 'Unable to open /dev/urandom!' sys.exit(1) seed = input.read(32) input.close() random.seed(seed) sanitise_path() init_select(config.select) if len(args) > 0: # allow config to be fetched via HTTP, but only with python2 if sys.version[0] != '1' and args[0].startswith('http://'): import urllib2 try: config_file = urllib2.urlopen(args[0]) except (urllib2.URLError, socket.error), err: if hasattr(err, 'args'): err = err.args[1] print "Could not access '%s': %s" %(args[0], err) sys.exit(1) elif not os.access(args[0], os.R_OK): print 'File not found or readable:', args[0] sys.exit(1) else: # regular file config_file = open(args[0], 'r') try: dom = xml.dom.minidom.parse(config_file) except Exception: panic("%s does not appear to be a config file." % (args[0])) sys.exit(1) # make sure to die here, even in debug mode. config_file.close() CONFIG_FILE = args[0] lustreDB = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement) if not config.config: config.config = os.path.basename(args[0])# use full path? if config.config[-4:] == '.xml': config.config = config.config[:-4] elif config.ldapurl: if not config.config: panic("--ldapurl requires --config name") dn = "config=%s,fs=lustre" % (config.config) lustreDB = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl) elif config.ptldebug or config.subsystem: sys_set_ptldebug(None) sys_set_subsystem(None) sys.exit(0) else: print 'Missing config file or ldap URL.' 
print 'see lconf --help for command summary' sys.exit(1) toplustreDB = lustreDB ver = lustreDB.get_version() if not ver: panic("No version found in config data, please recreate.") if ver != Lustre.CONFIG_VERSION: panic("Config version", ver, "does not match lconf version", Lustre.CONFIG_VERSION) node_list = [] if config.node: node_list.append(config.node) else: if len(host) > 0: node_list.append(host) node_list.append('localhost') debug("configuring for host: ", node_list) if len(host) > 0: config.debug_path = config.debug_path + '-' + host config.gdb_script = config.gdb_script + '-' + host lctl = LCTLInterface('lctl') if config.lctl_dump: lctl.use_save_file(config.lctl_dump) if config.record: if not (config.record_device and config.record_log): panic("When recording, both --record_log and --record_device must be specified.") lctl.clear_log(config.record_device, config.record_log) lctl.record(config.record_device, config.record_log) # init module manager mod_manager = kmod_manager(config.lustre, config.portals) doHost(lustreDB, node_list) if not config.record: return lctl.end_record() process_updates(lustreDB, config.record_device, config.record_log) if __name__ == "__main__": try: main() except Lustre.LconfError, e: print e # traceback.print_exc(file=sys.stdout) sys.exit(1) except CommandError, e: e.dump() sys.exit(e.rc) if first_cleanup_error: sys.exit(first_cleanup_error)
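# Example invocations (illustrative only; see the lconf_options table above
# for the authoritative flag list):
#
#   lconf --reformat --node nodeA config.xml         # format devices, then start services
#   lconf --cleanup --force --node nodeA config.xml  # forced shutdown on this node
#   lconf --write_conf --node nodeA config.xml       # save client config on the mds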