From 54c561f8cc3dfca3262e5f7db658490b30290dea Mon Sep 17 00:00:00 2001 From: jacob Date: Thu, 1 Apr 2004 04:06:36 +0000 Subject: [PATCH] b=3104 Do not cvs add/rm on one branch, then cvs up -r to another branch, and expect CVS to cope. --- lustre/utils/lconf.in | 2706 ------------------------------------------------- 1 file changed, 2706 deletions(-) delete mode 100755 lustre/utils/lconf.in diff --git a/lustre/utils/lconf.in b/lustre/utils/lconf.in deleted file mode 100755 index 3c7e588..0000000 --- a/lustre/utils/lconf.in +++ /dev/null @@ -1,2706 +0,0 @@ -#!/usr/bin/env python -# -# Copyright (C) 2002-2003 Cluster File Systems, Inc. -# Authors: Robert Read -# Mike Shaver -# This file is part of Lustre, http://www.lustre.org. -# -# Lustre is free software; you can redistribute it and/or -# modify it under the terms of version 2 of the GNU General Public -# License as published by the Free Software Foundation. -# -# Lustre is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with Lustre; if not, write to the Free Software -# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. -# -# lconf - lustre configuration tool -# -# lconf is the main driver script for starting and stopping -# lustre filesystem services. -# -# Based in part on the XML obdctl modifications done by Brian Behlendorf - -import sys, getopt, types -import string, os, stat, popen2, socket, time, random, fcntl, select -import re, exceptions, signal, traceback -import xml.dom.minidom - -if sys.version[0] == '1': - from FCNTL import F_GETFL, F_SETFL -else: - from fcntl import F_GETFL, F_SETFL - -PYMOD_DIR = "@PYMOD_DIR@" - -def development_mode(): - base = os.path.dirname(sys.argv[0]) - if os.access(base+"/Makefile", os.R_OK): - return 1 - return 0 - -if not development_mode(): - sys.path.append(PYMOD_DIR) - -import Lustre - -# Global parameters -MAXTCPBUF = 16777216 -DEFAULT_TCPBUF = 8388608 -DEFAULT_PORT = 988 -# -# Maximum number of devices to search for. 
-# (the /dev/loop* nodes need to be created beforehand) -MAX_LOOP_DEVICES = 256 -PORTALS_DIR = 'portals' - -# Needed to call lconf --record -CONFIG_FILE = "" - -# Please keep these in sync with the values in portals/kp30.h -ptldebug_names = { - "trace" : (1 << 0), - "inode" : (1 << 1), - "super" : (1 << 2), - "ext2" : (1 << 3), - "malloc" : (1 << 4), - "cache" : (1 << 5), - "info" : (1 << 6), - "ioctl" : (1 << 7), - "blocks" : (1 << 8), - "net" : (1 << 9), - "warning" : (1 << 10), - "buffs" : (1 << 11), - "other" : (1 << 12), - "dentry" : (1 << 13), - "portals" : (1 << 14), - "page" : (1 << 15), - "dlmtrace" : (1 << 16), - "error" : (1 << 17), - "emerg" : (1 << 18), - "ha" : (1 << 19), - "rpctrace" : (1 << 20), - "vfstrace" : (1 << 21), - "reada" : (1 << 22), - } - -subsystem_names = { - "undefined" : (1 << 0), - "mdc" : (1 << 1), - "mds" : (1 << 2), - "osc" : (1 << 3), - "ost" : (1 << 4), - "class" : (1 << 5), - "log" : (1 << 6), - "llite" : (1 << 7), - "rpc" : (1 << 8), - "mgmt" : (1 << 9), - "portals" : (1 << 10), - "socknal" : (1 << 11), - "qswnal" : (1 << 12), - "pinger" : (1 << 13), - "filter" : (1 << 14), - "ptlbd" : (1 << 15), - "echo" : (1 << 16), - "ldlm" : (1 << 17), - "lov" : (1 << 18), - "gmnal" : (1 << 19), - "ptlrouter" : (1 << 20), - "cobd" : (1 << 21), - "ibnal" : (1 << 22), - } - - -first_cleanup_error = 0 -def cleanup_error(rc): - global first_cleanup_error - if not first_cleanup_error: - first_cleanup_error = rc - -# ============================================================ -# debugging and error funcs - -def fixme(msg = "this feature"): - raise Lustre.LconfError, msg + ' not implmemented yet.' - -def panic(*args): - msg = string.join(map(str,args)) - if not config.noexec: - raise Lustre.LconfError(msg) - else: - print "! " + msg - -def log(*args): - msg = string.join(map(str,args)) - print msg - -def logall(msgs): - for s in msgs: - print string.strip(s) - -def debug(*args): - if config.verbose: - msg = string.join(map(str,args)) - print msg - -# ack, python's builtin int() does not support '0x123' syntax. -# eval can do it, although what a hack! -def my_int(s): - try: - if s[0:2] == '0x': - return eval(s, {}, {}) - else: - return int(s) - except SyntaxError, e: - raise ValueError("not a number") - except NameError, e: - raise ValueError("not a number") - -# ============================================================ -# locally defined exceptions -class CommandError (exceptions.Exception): - def __init__(self, cmd_name, cmd_err, rc=None): - self.cmd_name = cmd_name - self.cmd_err = cmd_err - self.rc = rc - - def dump(self): - import types - if type(self.cmd_err) == types.StringType: - if self.rc: - print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err) - else: - print "! %s: %s" % (self.cmd_name, self.cmd_err) - elif type(self.cmd_err) == types.ListType: - if self.rc: - print "! %s (error %d):" % (self.cmd_name, self.rc) - else: - print "! %s:" % (self.cmd_name) - for s in self.cmd_err: - print "> %s" %(string.strip(s)) - else: - print self.cmd_err - - -# ============================================================ -# handle daemons, like the acceptor -class DaemonHandler: - """ Manage starting and stopping a daemon. Assumes daemon manages - it's own pid file. 
""" - - def __init__(self, cmd): - self.command = cmd - self.path ="" - - def start(self): - if self.running(): - log(self.command, "already running.") - if not self.path: - self.path = find_prog(self.command) - if not self.path: - panic(self.command, "not found.") - ret, out = runcmd(self.path +' '+ self.command_line()) - if ret: - raise CommandError(self.path, out, ret) - - def stop(self): - if self.running(): - pid = self.read_pidfile() - try: - log ("killing process", pid) - os.kill(pid, 15) - #time.sleep(1) # let daemon die - except OSError, e: - log("unable to kill", self.command, e) - if self.running(): - log("unable to kill", self.command) - - def running(self): - pid = self.read_pidfile() - if pid: - try: - os.kill(pid, 0) - except OSError: - self.clean_pidfile() - else: - return 1 - return 0 - - def read_pidfile(self): - try: - fp = open(self.pidfile(), 'r') - pid = int(fp.read()) - fp.close() - return pid - except IOError: - return 0 - - def clean_pidfile(self): - """ Remove a stale pidfile """ - log("removing stale pidfile:", self.pidfile()) - try: - os.unlink(self.pidfile()) - except OSError, e: - log(self.pidfile(), e) - -class AcceptorHandler(DaemonHandler): - def __init__(self, port, net_type, send_mem, recv_mem, irq_aff): - DaemonHandler.__init__(self, "acceptor") - self.port = port - self.flags = '' - self.send_mem = send_mem - self.recv_mem = recv_mem - - if irq_aff: - self.flags = self.flags + ' -i' - - def pidfile(self): - return "/var/run/%s-%d.pid" % (self.command, self.port) - - def command_line(self): - return string.join(map(str,('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port))) - -acceptors = {} - -# start the acceptors -def run_acceptors(): - if config.lctl_dump or config.record: - return - for port in acceptors.keys(): - daemon = acceptors[port] - if not daemon.running(): - daemon.start() - -def run_one_acceptor(port): - if config.lctl_dump or config.record: - return - if acceptors.has_key(port): - daemon = acceptors[port] - if not daemon.running(): - daemon.start() - else: - panic("run_one_acceptor: No acceptor defined for port:", port) - -def stop_acceptor(port): - if acceptors.has_key(port): - daemon = acceptors[port] - if daemon.running(): - daemon.stop() - - -# ============================================================ -# handle lctl interface -class LCTLInterface: - """ - Manage communication with lctl - """ - - def __init__(self, cmd): - """ - Initialize close by finding the lctl binary. - """ - self.lctl = find_prog(cmd) - self.save_file = '' - self.record_device = '' - if not self.lctl: - if config.noexec: - debug('! 
lctl not found') - self.lctl = 'lctl' - else: - raise CommandError('lctl', "unable to find lctl binary.") - - def use_save_file(self, file): - self.save_file = file - - def record(self, dev_name, logname): - log("Recording log", logname, "on", dev_name) - self.record_device = dev_name - self.record_log = logname - - def end_record(self): - log("End recording log", self.record_log, "on", self.record_device) - self.record_device = None - self.record_log = None - - def set_nonblock(self, fd): - fl = fcntl.fcntl(fd, F_GETFL) - fcntl.fcntl(fd, F_SETFL, fl | os.O_NDELAY) - - def run(self, cmds): - """ - run lctl - the cmds are written to stdin of lctl - lctl doesn't return errors when run in script mode, so - stderr is checked - should modify command line to accept multiple commands, or - create complex command line options - """ - cmd_line = self.lctl - if self.save_file: - cmds = '\n dump ' + self.save_file + '\n' + cmds - elif self.record_device: - cmds = """ - device $%s - record %s - %s""" % (self.record_device, self.record_log, cmds) - - debug("+", cmd_line, cmds) - if config.noexec: return (0, []) - - child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command - child.tochild.write(cmds + "\n") - child.tochild.close() - - # From "Python Cookbook" from O'Reilly - outfile = child.fromchild - outfd = outfile.fileno() - self.set_nonblock(outfd) - errfile = child.childerr - errfd = errfile.fileno() - self.set_nonblock(errfd) - - outdata = errdata = '' - outeof = erreof = 0 - while 1: - ready = select.select([outfd,errfd],[],[]) # Wait for input - if outfd in ready[0]: - outchunk = outfile.read() - if outchunk == '': outeof = 1 - outdata = outdata + outchunk - if errfd in ready[0]: - errchunk = errfile.read() - if errchunk == '': erreof = 1 - errdata = errdata + errchunk - if outeof and erreof: break - # end of "borrowed" code - - ret = child.wait() - if os.WIFEXITED(ret): - rc = os.WEXITSTATUS(ret) - else: - rc = 0 - if rc or len(errdata): - raise CommandError(self.lctl, errdata, rc) - return rc, outdata - - def runcmd(self, *args): - """ - run lctl using the command line - """ - cmd = string.join(map(str,args)) - debug("+", self.lctl, cmd) - rc, out = run(self.lctl, cmd) - if rc: - raise CommandError(self.lctl, out, rc) - return rc, out - - - def clear_log(self, dev, log): - """ clear an existing log """ - cmds = """ - device $%s - probe - clear_log %s - quit """ % (dev, log) - self.run(cmds) - - def network(self, net, nid): - """ set mynid """ - cmds = """ - network %s - mynid %s - quit """ % (net, nid) - self.run(cmds) - - # create a new connection - def add_uuid(self, net_type, uuid, nid): - cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type) - self.run(cmds) - - def add_autoconn(self, net_type, send_mem, recv_mem, nid, hostaddr, - port, flags): - if net_type in ('tcp',) and not config.lctl_dump: - cmds = """ - network %s - send_mem %d - recv_mem %d - add_autoconn %s %s %d %s - quit""" % (net_type, - send_mem, - recv_mem, - nid, hostaddr, port, flags ) - self.run(cmds) - - def connect(self, srv): - self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid) - if srv.net_type in ('tcp',) and not config.lctl_dump: - flags = 's' - if srv.irq_affinity: - flags = flags + 'i' - self.add_autoconn(srv.net_type, srv.send_mem, srv.recv_mem, - srv.nid, srv.hostaddr, srv.port, flags) - - # Recover a device - def recover(self, dev_name, new_conn): - cmds = """ - device $%s - recover %s""" %(dev_name, new_conn) - self.run(cmds) - - # add a route to a range - def add_route(self, net, gw, lo, hi): 
- cmds = """ - network %s - add_route %s %s %s - quit """ % (net, - gw, lo, hi) - try: - self.run(cmds) - except CommandError, e: - log ("ignore: ") - e.dump() - - def del_route(self, net, gw, lo, hi): - cmds = """ - ignore_errors - network %s - del_route %s %s %s - quit """ % (net, gw, lo, hi) - self.run(cmds) - - # add a route to a host - def add_route_host(self, net, uuid, gw, tgt): - self.add_uuid(net, uuid, tgt) - cmds = """ - network %s - add_route %s %s - quit """ % (net, - gw, tgt) - try: - self.run(cmds) - except CommandError, e: - log ("ignore: ") - e.dump() - - # add a route to a range - def del_route_host(self, net, uuid, gw, tgt): - self.del_uuid(uuid) - cmds = """ - ignore_errors - network %s - del_route %s %s - quit """ % (net, gw, tgt) - self.run(cmds) - - - def del_autoconn(self, net_type, nid, hostaddr): - if net_type in ('tcp',) and not config.lctl_dump: - cmds = """ - ignore_errors - network %s - del_autoconn %s %s s - quit""" % (net_type, - nid, hostaddr) - self.run(cmds) - - # disconnect one connection - def disconnect(self, srv): - self.del_uuid(srv.nid_uuid) - if srv.net_type in ('tcp',) and not config.lctl_dump: - self.del_autoconn(srv.net_type, srv.nid, srv.hostaddr) - - def del_uuid(self, uuid): - cmds = """ - ignore_errors - del_uuid %s - quit""" % (uuid,) - self.run(cmds) - - # disconnect all - def disconnectAll(self, net): - cmds = """ - ignore_errors - network %s - disconnect - quit""" % (net) - self.run(cmds) - - def attach(self, type, name, uuid): - cmds = """ - attach %s %s %s - quit""" % (type, name, uuid) - self.run(cmds) - - def setup(self, name, setup = ""): - cmds = """ - cfg_device %s - setup %s - quit""" % (name, setup) - self.run(cmds) - - - # create a new device with lctl - def newdev(self, type, name, uuid, setup = ""): - self.attach(type, name, uuid); - try: - self.setup(name, setup) - except CommandError, e: - self.cleanup(name, uuid, 0) - raise e - - - # cleanup a device - def cleanup(self, name, uuid, force, failover = 0): - if failover: force = 1 - cmds = """ - ignore_errors - cfg_device $%s - cleanup %s %s - detach - quit""" % (name, ('', 'force')[force], - ('', 'failover')[failover]) - self.run(cmds) - - # create an lov - def lov_setup(self, name, uuid, desc_uuid, mdsuuid, stripe_cnt, - stripe_sz, stripe_off, - pattern, devlist): - cmds = """ - attach lov %s %s - lov_setup %s %d %d %d %s %s - quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off, - pattern, devlist) - self.run(cmds) - - # create an lov - def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off, - pattern, devlist): - cmds = """ - cfg_device $%s - lov_setconfig %s %d %d %d %s %s - quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist) - self.run(cmds) - - # dump the log file - def dump(self, dump_file): - cmds = """ - debug_kernel %s 1 - quit""" % (dump_file) - self.run(cmds) - - # get list of devices - def device_list(self): - devices = '/proc/fs/lustre/devices' - ret = [] - if os.access(devices, os.R_OK): - try: - fp = open(devices, 'r') - ret = fp.readlines() - fp.close() - except IOError, e: - log(e) - return ret - - # get lustre version - def lustre_version(self): - rc, out = self.runcmd('version') - return out - - # dump mount options - def mount_option(self, profile, osc, mdc): - cmds = """ - mount_option %s %s %s - quit""" % (profile, osc, mdc) - self.run(cmds) - - # delete mount options - def del_mount_option(self, profile): - cmds = """ - del_mount_option %s - quit""" % (profile,) - self.run(cmds) - - def 
set_timeout(self, timeout): - cmds = """ - set_timeout %s - quit""" % (timeout,) - self.run(cmds) - - # delete mount options - def set_lustre_upcall(self, upcall): - cmds = """ - set_lustre_upcall %s - quit""" % (upcall,) - self.run(cmds) -# ============================================================ -# Various system-level functions -# (ideally moved to their own module) - -# Run a command and return the output and status. -# stderr is sent to /dev/null, could use popen3 to -# save it if necessary -def runcmd(cmd): - debug ("+", cmd) - if config.noexec: return (0, []) - f = os.popen(cmd + ' 2>&1') - out = f.readlines() - ret = f.close() - if ret: - ret = ret >> 8 - else: - ret = 0 - return (ret, out) - -def run(*args): - cmd = string.join(map(str,args)) - return runcmd(cmd) - -# Run a command in the background. -def run_daemon(*args): - cmd = string.join(map(str,args)) - debug ("+", cmd) - if config.noexec: return 0 - f = os.popen(cmd + ' 2>&1') - ret = f.close() - if ret: - ret = ret >> 8 - else: - ret = 0 - return ret - -# Determine full path to use for an external command -# searches dirname(argv[0]) first, then PATH -def find_prog(cmd): - syspath = string.split(os.environ['PATH'], ':') - cmdpath = os.path.dirname(sys.argv[0]) - syspath.insert(0, cmdpath); - if config.portals: - syspath.insert(0, os.path.join(config.portals, 'utils/')) - for d in syspath: - prog = os.path.join(d,cmd) - if os.access(prog, os.X_OK): - return prog - return '' - -# Recursively look for file starting at base dir -def do_find_file(base, mod): - fullname = os.path.join(base, mod) - if os.access(fullname, os.R_OK): - return fullname - for d in os.listdir(base): - dir = os.path.join(base,d) - if os.path.isdir(dir): - module = do_find_file(dir, mod) - if module: - return module - -def find_module(src_dir, dev_dir, modname): - modbase = src_dir +'/'+ dev_dir +'/'+ modname - for modext in '.ko', '.o': - module = modbase + modext - try: - if os.access(module, os.R_OK): - return module - except OSError: - pass - return None - -# is the path a block device? 
-def is_block(path): - s = () - try: - s = os.stat(path) - except OSError: - return 0 - return stat.S_ISBLK(s[stat.ST_MODE]) - -# build fs according to type -# fixme: dangerous -def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1): - block_cnt = '' - jopt = '' - iopt = '' - if devsize: - if devsize < 8000: - panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"% - (dev, devsize)) - # devsize is in 1k, and fs block count is in 4k - block_cnt = devsize/4 - - if fstype in ('ext3', 'extN'): - # ext3 journal size is in megabytes - if jsize == 0: - if devsize == 0: - if not is_block(dev): - ret, out = runcmd("ls -l %s" %dev) - devsize = int(string.split(out[0])[4]) / 1024 - else: - ret, out = runcmd("sfdisk -s %s" %dev) - devsize = int(out[0]) - if devsize > 1024 * 1024: - jsize = ((devsize / 102400) * 4) - if jsize > 400: - jsize = 400 - if jsize: jopt = "-J size=%d" %(jsize,) - if isize: iopt = "-I %d" %(isize,) - mkfs = 'mkfs.ext2 -j -b 4096 ' - if not isblock or config.force: - mkfs = mkfs + ' -F ' - elif fstype == 'reiserfs': - # reiserfs journal size is in blocks - if jsize: jopt = "--journal_size %d" %(jsize,) - mkfs = 'mkreiserfs -ff' - else: - panic('unsupported fs type: ', fstype) - - if config.mkfsoptions != None: - mkfs = mkfs + ' ' + config.mkfsoptions - if mkfsoptions != None: - mkfs = mkfs + ' ' + mkfsoptions - (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt) - if ret: - panic("Unable to build fs:", dev, string.join(out)) - # enable hash tree indexing on fsswe - if fstype in ('ext3', 'extN'): - htree = 'echo "feature FEATURE_C5" | debugfs -w' - (ret, out) = run (htree, dev) - if ret: - panic("Unable to enable htree:", dev) - -# some systems use /dev/loopN, some /dev/loop/N -def loop_base(): - import re - loop = '/dev/loop' - if not os.access(loop + str(0), os.R_OK): - loop = loop + '/' - if not os.access(loop + str(0), os.R_OK): - panic ("can't access loop devices") - return loop - -# find loop device assigned to thefile -def find_loop(file): - loop = loop_base() - for n in xrange(0, MAX_LOOP_DEVICES): - dev = loop + str(n) - if os.access(dev, os.R_OK): - (stat, out) = run('losetup', dev) - if out and stat == 0: - m = re.search(r'\((.*)\)', out[0]) - if m and file == m.group(1): - return dev - else: - break - return '' - -# create file if necessary and assign the first free loop device -def init_loop(file, size, fstype, journal_size, inode_size, mkfsoptions, reformat): - dev = find_loop(file) - if dev: - print 'WARNING file:', file, 'already mapped to', dev - return dev - if reformat or not os.access(file, os.R_OK | os.W_OK): - if size < 8000: - panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (file,size)) - (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, - file)) - if ret: - panic("Unable to create backing store:", file) - mkfs(file, size, fstype, journal_size, inode_size, mkfsoptions, isblock=0) - - loop = loop_base() - # find next free loop - for n in xrange(0, MAX_LOOP_DEVICES): - dev = loop + str(n) - if os.access(dev, os.R_OK): - (stat, out) = run('losetup', dev) - if stat: - run('losetup', dev, file) - return dev - else: - print "out of loop devices" - return '' - print "out of loop devices" - return '' - -# undo loop assignment -def clean_loop(file): - dev = find_loop(file) - if dev: - ret, out = run('losetup -d', dev) - if ret: - log('unable to clean loop device:', dev, 'for file:', file) - logall(out) - -# determine if dev is formatted as a filesystem -def need_format(fstype, 
dev): - # FIXME don't know how to implement this - return 0 - -# initialize a block device if needed -def block_dev(dev, size, fstype, reformat, autoformat, journal_size, - inode_size, mkfsoptions): - if config.noexec: return dev - if not is_block(dev): - dev = init_loop(dev, size, fstype, journal_size, inode_size, - mkfsoptions, reformat) - elif reformat or (need_format(fstype, dev) and autoformat == 'yes'): - mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions, - isblock=0) -# else: -# panic("device:", dev, -# "not prepared, and autoformat is not set.\n", -# "Rerun with --reformat option to format ALL filesystems") - - return dev - -def if2addr(iface): - """lookup IP address for an interface""" - rc, out = run("/sbin/ifconfig", iface) - if rc or not out: - return None - addr = string.split(out[1])[1] - ip = string.split(addr, ':')[1] - return ip - -def sys_get_elan_position_file(): - procfiles = ["/proc/elan/device0/position", - "/proc/qsnet/elan4/device0/position", - "/proc/qsnet/elan3/device0/position"] - for p in procfiles: - if os.access(p, os.R_OK): - return p - return "" - -def sys_get_local_nid(net_type, wildcard, cluster_id): - """Return the local nid.""" - local = "" - if sys_get_elan_position_file(): - local = sys_get_local_address('elan', '*', cluster_id) - else: - local = sys_get_local_address(net_type, wildcard, cluster_id) - return local - -def sys_get_local_address(net_type, wildcard, cluster_id): - """Return the local address for the network type.""" - local = "" - if net_type in ('tcp',): - if ':' in wildcard: - iface, star = string.split(wildcard, ':') - local = if2addr(iface) - if not local: - panic ("unable to determine ip for:", wildcard) - else: - host = socket.gethostname() - local = socket.gethostbyname(host) - elif net_type == 'elan': - # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()' - f = sys_get_elan_position_file() - if not f: - panic ("unable to determine local Elan ID") - try: - fp = open(f, 'r') - lines = fp.readlines() - fp.close() - for l in lines: - a = string.split(l) - if a[0] == 'NodeId': - elan_id = a[1] - break - try: - nid = my_int(cluster_id) + my_int(elan_id) - local = "%d" % (nid) - except ValueError, e: - local = elan_id - except IOError, e: - log(e) - elif net_type == 'gm': - fixme("automatic local address for GM") - elif net_type == 'scimac': - scinode="/opt/scali/sbin/scinode" - if os.path.exists(scinode): - (rc,local) = run(scinode) - else: - panic (scinode, " not found on node with scimac networking") - if rc: - panic (scinode, " failed") - local=string.rstrip(local[0]) - - return local - -def mod_loaded(modname): - """Check if a module is already loaded. 
Look in /proc/modules for it.""" - try: - fp = open('/proc/modules') - lines = fp.readlines() - fp.close() - # please forgive my tired fingers for this one - ret = filter(lambda word, mod=modname: word == mod, - map(lambda line: string.split(line)[0], lines)) - return ret - except Exception, e: - return 0 - -# XXX: instead of device_list, ask for $name and see what we get -def is_prepared(name): - """Return true if a device exists for the name""" - if config.lctl_dump: - return 0 - if (config.noexec or config.record) and config.cleanup: - return 1 - try: - # expect this format: - # 1 UP ldlm ldlm ldlm_UUID 2 - out = lctl.device_list() - for s in out: - if name == string.split(s)[3]: - return 1 - except CommandError, e: - e.dump() - return 0 - -def is_network_prepared(): - """If the any device exists, then assume that all networking - has been configured""" - out = lctl.device_list() - return len(out) > 0 - -def fs_is_mounted(path): - """Return true if path is a mounted lustre filesystem""" - try: - fp = open('/proc/mounts') - lines = fp.readlines() - fp.close() - for l in lines: - a = string.split(l) - if a[1] == path and a[2] == 'lustre_lite': - return 1 - except IOError, e: - log(e) - return 0 - - -class kmod: - """Manage kernel modules""" - def __init__(self, lustre_dir, portals_dir): - self.lustre_dir = lustre_dir - self.portals_dir = portals_dir - self.kmodule_list = [] - - def add_portals_module(self, dev_dir, modname): - """Append a module to list of modules to load.""" - self.kmodule_list.append((self.portals_dir, dev_dir, modname)) - - def add_lustre_module(self, dev_dir, modname): - """Append a module to list of modules to load.""" - self.kmodule_list.append((self.lustre_dir, dev_dir, modname)) - - def load_module(self): - """Load all the modules in the list in the order they appear.""" - for src_dir, dev_dir, mod in self.kmodule_list: - if mod_loaded(mod) and not config.noexec: - continue - log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir) - if src_dir: - module = find_module(src_dir, dev_dir, mod) - if not module: - panic('module not found:', mod) - (rc, out) = run('/sbin/insmod', module) - if rc: - raise CommandError('insmod', out, rc) - else: - (rc, out) = run('/sbin/modprobe', mod) - if rc: - raise CommandError('modprobe', out, rc) - - def cleanup_module(self): - """Unload the modules in the list in reverse order.""" - rev = self.kmodule_list - rev.reverse() - for src_dir, dev_dir, mod in rev: - if not mod_loaded(mod) and not config.noexec: - continue - # debug hack - if mod == 'portals' and config.dump: - lctl.dump(config.dump) - log('unloading module:', mod) - (rc, out) = run('/sbin/rmmod', mod) - if rc: - log('! unable to unload module:', mod) - logall(out) - -# ============================================================ -# Classes to prepare and cleanup the various objects -# -class Module: - """ Base class for the rest of the modules. The default cleanup method is - defined here, as well as some utilitiy funcs. 
- """ - def __init__(self, module_name, db): - self.db = db - self.module_name = module_name - self.name = self.db.getName() - self.uuid = self.db.getUUID() - self._server = None - self._connected = 0 - self.kmod = kmod(config.lustre, config.portals) - - def info(self, *args): - msg = string.join(map(str,args)) - print self.module_name + ":", self.name, self.uuid, msg - - def cleanup(self): - """ default cleanup, used for most modules """ - self.info() - try: - lctl.cleanup(self.name, self.uuid, config.force) - except CommandError, e: - log(self.module_name, "cleanup failed: ", self.name) - e.dump() - cleanup_error(e.rc) - - def add_portals_module(self, dev_dir, modname): - """Append a module to list of modules to load.""" - self.kmod.add_portals_module(dev_dir, modname) - - def add_lustre_module(self, dev_dir, modname): - """Append a module to list of modules to load.""" - self.kmod.add_lustre_module(dev_dir, modname) - - def load_module(self): - """Load all the modules in the list in the order they appear.""" - self.kmod.load_module() - - def cleanup_module(self): - """Unload the modules in the list in reverse order.""" - if self.safe_to_clean(): - self.kmod.cleanup_module() - - def safe_to_clean(self): - return 1 - - def safe_to_clean_modules(self): - return self.safe_to_clean() - -class Network(Module): - def __init__(self,db): - Module.__init__(self, 'NETWORK', db) - self.net_type = self.db.get_val('nettype') - self.nid = self.db.get_val('nid', '*') - self.cluster_id = self.db.get_val('clusterid', "0") - self.port = self.db.get_val_int('port', 0) - self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF) - self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF) - self.irq_affinity = self.db.get_val_int('irqaffinity', 0) - - if '*' in self.nid: - self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id) - if not self.nid: - panic("unable to set nid for", self.net_type, self.nid, cluster_id) - self.generic_nid = 1 - debug("nid:", self.nid) - else: - self.generic_nid = 0 - - self.nid_uuid = self.nid_to_uuid(self.nid) - - self.hostaddr = self.db.get_val('hostaddr', self.nid) - if '*' in self.hostaddr: - self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id) - if not self.hostaddr: - panic("unable to set hostaddr for", self.net_type, self.hostaddr, self.cluster_id) - debug("hostaddr:", self.hostaddr) - - self.add_portals_module("libcfs", 'libcfs') - self.add_portals_module("portals", 'portals') - if node_needs_router(): - self.add_portals_module("router", 'kptlrouter') - if self.net_type == 'tcp': - self.add_portals_module("knals/socknal", 'ksocknal') - if self.net_type == 'elan': - self.add_portals_module("knals/qswnal", 'kqswnal') - if self.net_type == 'gm': - self.add_portals_module("knals/gmnal", 'kgmnal') - if self.net_type == 'scimac': - self.add_portals_module("knals/scimacnal", 'kscimacnal') - - def nid_to_uuid(self, nid): - return "NID_%s_UUID" %(nid,) - - def prepare(self): - if is_network_prepared(): - return - self.info(self.net_type, self.nid, self.port) - if not (config.record and self.generic_nid): - lctl.network(self.net_type, self.nid) - if self.net_type == 'tcp': - sys_tweak_socknal() - if self.net_type == 'elan': - sys_optimize_elan() - if self.port and node_is_router(): - run_one_acceptor(self.port) - self.connect_peer_gateways() - - def connect_peer_gateways(self): - for router in self.db.lookup_class('node'): - if router.get_val_int('router', 0): - for netuuid in router.get_networks(): - net = 
self.db.lookup(netuuid) - gw = Network(net) - if (gw.cluster_id == self.cluster_id and - gw.net_type == self.net_type): - if gw.nid != self.nid: - lctl.connect(gw) - - def disconnect_peer_gateways(self): - for router in self.db.lookup_class('node'): - if router.get_val_int('router', 0): - for netuuid in router.get_networks(): - net = self.db.lookup(netuuid) - gw = Network(net) - if (gw.cluster_id == self.cluster_id and - gw.net_type == self.net_type): - if gw.nid != self.nid: - try: - lctl.disconnect(gw) - except CommandError, e: - print "disconnect failed: ", self.name - e.dump() - cleanup_error(e.rc) - - def safe_to_clean(self): - return not is_network_prepared() - - def cleanup(self): - self.info(self.net_type, self.nid, self.port) - if self.port: - stop_acceptor(self.port) - if node_is_router(): - self.disconnect_peer_gateways() - -class RouteTable(Module): - def __init__(self,db): - Module.__init__(self, 'ROUTES', db) - - def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id, - lo, hi): - # only setup connections for tcp NALs - srvdb = None - if not net_type in ('tcp',): - return None - - # connect to target if route is to single node and this node is the gw - if lo == hi and local_interface(net_type, gw_cluster_id, gw): - if not local_cluster(net_type, tgt_cluster_id): - panic("target", lo, " not on the local cluster") - srvdb = self.db.nid2server(lo, net_type, gw_cluster_id) - # connect to gateway if this node is not the gw - elif (local_cluster(net_type, gw_cluster_id) - and not local_interface(net_type, gw_cluster_id, gw)): - srvdb = self.db.nid2server(gw, net_type, gw_cluster_id) - else: - return None - - if not srvdb: - panic("no server for nid", lo) - return None - - return Network(srvdb) - - def prepare(self): - if is_network_prepared(): - return - self.info() - for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl(): - lctl.add_route(net_type, gw, lo, hi) - srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi) - if srv: - lctl.connect(srv) - - def safe_to_clean(self): - return not is_network_prepared() - - def cleanup(self): - if is_network_prepared(): - # the network is still being used, don't clean it up - return - for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl(): - srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi) - if srv: - try: - lctl.disconnect(srv) - except CommandError, e: - print "disconnect failed: ", self.name - e.dump() - cleanup_error(e.rc) - - try: - lctl.del_route(net_type, gw, lo, hi) - except CommandError, e: - print "del_route failed: ", self.name - e.dump() - cleanup_error(e.rc) - -class Management(Module): - def __init__(self, db): - Module.__init__(self, 'MGMT', db) - self.add_lustre_module('lvfs', 'lvfs') - self.add_lustre_module('obdclass', 'obdclass') - self.add_lustre_module('ptlrpc', 'ptlrpc') - self.add_lustre_module('mgmt', 'mgmt_svc') - - def prepare(self): - if is_prepared(self.name): - return - self.info() - lctl.newdev("mgmt", self.name, self.uuid) - - def safe_to_clean(self): - return 1 - - def cleanup(self): - if is_prepared(self.name): - Module.cleanup(self) - -# This is only needed to load the modules; the LDLM device -# is now created automatically. 
-class LDLM(Module): - def __init__(self,db): - Module.__init__(self, 'LDLM', db) - self.add_lustre_module('lvfs', 'lvfs') - self.add_lustre_module('obdclass', 'obdclass') - self.add_lustre_module('ptlrpc', 'ptlrpc') - - def prepare(self): - return - - def cleanup(self): - return - -class LOV(Module): - def __init__(self, db, uuid, fs_name, name_override = None, config_only = None): - Module.__init__(self, 'LOV', db) - if name_override != None: - self.name = "lov_%s" % name_override - self.add_lustre_module('lov', 'lov') - self.mds_uuid = self.db.get_first_ref('mds') - self.stripe_sz = self.db.get_val_int('stripesize', 65536) - self.stripe_off = self.db.get_val_int('stripeoffset', 0) - self.pattern = self.db.get_val_int('stripepattern', 0) - self.devlist = self.db.get_refs('obd') - self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist)) - self.osclist = [] - self.desc_uuid = self.uuid - self.uuid = generate_client_uuid(self.name) - self.fs_name = fs_name - if config_only: - self.config_only = 1 - return - self.config_only = None - mds= self.db.lookup(self.mds_uuid) - self.mds_name = mds.getName() - for obd_uuid in self.devlist: - obd = self.db.lookup(obd_uuid) - osc = get_osc(obd, self.uuid, fs_name) - if osc: - self.osclist.append(osc) - else: - panic('osc not found:', obd_uuid) - - def prepare(self): - if is_prepared(self.name): - return - if self.config_only: - panic("Can't prepare config_only LOV ", self.name) - - for osc in self.osclist: - try: - # Only ignore connect failures with --force, which - # isn't implemented here yet. - osc.prepare(ignore_connect_failure=0) - except CommandError, e: - print "Error preparing OSC %s\n" % osc.uuid - raise e - self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz, - self.stripe_off, self.pattern, self.devlist, self.mds_name) - lctl.lov_setup(self.name, self.uuid, - self.desc_uuid, self.mds_name, self.stripe_cnt, - self.stripe_sz, self.stripe_off, self.pattern, - string.join(self.devlist)) - - def cleanup(self): - if is_prepared(self.name): - Module.cleanup(self) - if self.config_only: - panic("Can't clean up config_only LOV ", self.name) - for osc in self.osclist: - osc.cleanup() - - def load_module(self): - if self.config_only: - panic("Can't load modules for config_only LOV ", self.name) - for osc in self.osclist: - osc.load_module() - break - Module.load_module(self) - - def cleanup_module(self): - if self.config_only: - panic("Can't cleanup modules for config_only LOV ", self.name) - Module.cleanup_module(self) - for osc in self.osclist: - osc.cleanup_module() - break - -class MDSDEV(Module): - def __init__(self,db): - Module.__init__(self, 'MDSDEV', db) - self.devpath = self.db.get_val('devpath','') - self.size = self.db.get_val_int('devsize', 0) - self.journal_size = self.db.get_val_int('journalsize', 0) - self.fstype = self.db.get_val('fstype', '') - self.nspath = self.db.get_val('nspath', '') - self.mkfsoptions = self.db.get_val('mkfsoptions', '') - # overwrite the orignal MDSDEV name and uuid with the MDS name and uuid - target_uuid = self.db.get_first_ref('target') - mds = self.db.lookup(target_uuid) - self.name = mds.getName() - self.filesystem_uuids = mds.get_refs('filesystem') - # FIXME: if fstype not set, then determine based on kernel version - self.format = self.db.get_val('autoformat', "no") - if mds.get_val('failover', 0): - self.failover_mds = 'f' - else: - self.failover_mds = 'n' - active_uuid = get_active_target(mds) - if not active_uuid: - panic("No target device found:", target_uuid) - if active_uuid == 
self.uuid: - self.active = 1 - else: - self.active = 0 - if self.active and config.group and config.group != mds.get_val('group'): - self.active = 0 - - self.inode_size = self.db.get_val_int('inodesize', 0) - if self.inode_size == 0: - # find the LOV for this MDS - lovconfig_uuid = mds.get_first_ref('lovconfig') - if not lovconfig_uuid: - panic("No LOV config found for MDS ", mds.name) - lovconfig = mds.lookup(lovconfig_uuid) - lov_uuid = lovconfig.get_first_ref('lov') - if not lov_uuid: - panic("No LOV found for lovconfig ", lovconfig.name) - lov = LOV(self.db.lookup(lov_uuid), lov_uuid, 'FS_name', config_only = 1) - - # default stripe count controls default inode_size - stripe_count = lov.stripe_cnt - if stripe_count > 77: - self.inode_size = 4096 - elif stripe_count > 35: - self.inode_size = 2048 - elif stripe_count > 13: - self.inode_size = 1024 - elif stripe_count > 3: - self.inode_size = 512 - else: - self.inode_size = 256 - - self.target_dev_uuid = self.uuid - self.uuid = target_uuid - # modules - self.add_lustre_module('mdc', 'mdc') - self.add_lustre_module('osc', 'osc') - self.add_lustre_module('lov', 'lov') - self.add_lustre_module('mds', 'mds') - if self.fstype: - self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype)) - - def load_module(self): - if self.active: - Module.load_module(self) - - def prepare(self): - if is_prepared(self.name): - return - if not self.active: - debug(self.uuid, "not active") - return - if config.reformat: - # run write_conf automatically, if --reformat used - self.write_conf() - self.info(self.devpath, self.fstype, self.size, self.format) - run_acceptors() - # never reformat here - blkdev = block_dev(self.devpath, self.size, self.fstype, 0, - self.format, self.journal_size, self.inode_size, - self.mkfsoptions) - if not is_prepared('MDT'): - lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="") - try: - lctl.newdev("mds", self.name, self.uuid, - setup ="%s %s %s" %(blkdev, self.fstype, self.name)) - except CommandError, e: - if e.rc == 2: - panic("MDS is missing the config log. 
Need to run " + - "lconf --write_conf.") - else: - raise e - - def write_conf(self): - if is_prepared(self.name): - return - self.info(self.devpath, self.fstype, self.format) - blkdev = block_dev(self.devpath, self.size, self.fstype, - config.reformat, self.format, self.journal_size, - self.inode_size, self.mkfsoptions) - lctl.newdev("mds", self.name, self.uuid, - setup ="%s %s" %(blkdev, self.fstype)) - - # record logs for the MDS lov - for uuid in self.filesystem_uuids: - log("recording clients for filesystem:", uuid) - fs = self.db.lookup(uuid) - obd_uuid = fs.get_first_ref('obd') - client_uuid = generate_client_uuid(self.name) - client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name, - self.name) - config.record = 1 - lctl.clear_log(self.name, self.name) - lctl.record(self.name, self.name) - client.prepare() - lctl.mount_option(self.name, client.get_name(), "") - lctl.end_record() - - config.cleanup = 1 - lctl.clear_log(self.name, self.name + '-clean') - lctl.record(self.name, self.name + '-clean') - client.cleanup() - lctl.del_mount_option(self.name) - lctl.end_record() - config.cleanup = 0 - config.record = 0 - - # record logs for each client - if config.ldapurl: - config_options = "--ldapurl " + config.ldapurl + " --config " + config.config - else: - config_options = CONFIG_FILE - - for node_db in self.db.lookup_class('node'): - client_name = node_db.getName() - for prof_uuid in node_db.get_refs('profile'): - prof_db = node_db.lookup(prof_uuid) - # refactor this into a funtion to test "clientness" - # of a node. - for ref_class, ref_uuid in prof_db.get_all_refs(): - if ref_class in ('mountpoint','echoclient'): - debug("recording", client_name) - old_noexec = config.noexec - config.noexec = 0 - noexec_opt = ('', '-n') - ret, out = run (sys.argv[0], - noexec_opt[old_noexec == 1], - " -v --record --nomod", - "--record_log", client_name, - "--record_device", self.name, - "--node", client_name, - config_options) - if config.verbose: - for s in out: log("record> ", string.strip(s)) - ret, out = run (sys.argv[0], - noexec_opt[old_noexec == 1], - "--cleanup -v --record --nomod", - "--record_log", client_name + "-clean", - "--record_device", self.name, - "--node", client_name, - config_options) - if config.verbose: - for s in out: log("record> ", string.strip(s)) - config.noexec = old_noexec - try: - lctl.cleanup(self.name, self.uuid, 0, 0) - except CommandError, e: - log(self.module_name, "cleanup failed: ", self.name) - e.dump() - cleanup_error(e.rc) - Module.cleanup(self) - clean_loop(self.devpath) - - def msd_remaining(self): - out = lctl.device_list() - for s in out: - if string.split(s)[2] in ('mds',): - return 1 - - def safe_to_clean(self): - return self.active - - def safe_to_clean_modules(self): - return not self.msd_remaining() - - def cleanup(self): - if not self.active: - debug(self.uuid, "not active") - return - self.info() - if is_prepared(self.name): - try: - lctl.cleanup(self.name, self.uuid, config.force, - config.failover) - except CommandError, e: - log(self.module_name, "cleanup failed: ", self.name) - e.dump() - cleanup_error(e.rc) - Module.cleanup(self) - if not self.msd_remaining() and is_prepared('MDT'): - try: - lctl.cleanup("MDT", "MDT_UUID", config.force, - config.failover) - except CommandError, e: - print "cleanup failed: ", self.name - e.dump() - cleanup_error(e.rc) - clean_loop(self.devpath) - -class OSD(Module): - def __init__(self, db): - Module.__init__(self, 'OSD', db) - self.osdtype = self.db.get_val('osdtype') - self.devpath = 
self.db.get_val('devpath', '') - self.size = self.db.get_val_int('devsize', 0) - self.journal_size = self.db.get_val_int('journalsize', 0) - self.inode_size = self.db.get_val_int('inodesize', 0) - self.mkfsoptions = self.db.get_val('mkfsoptions', '') - self.fstype = self.db.get_val('fstype', '') - self.nspath = self.db.get_val('nspath', '') - target_uuid = self.db.get_first_ref('target') - ost = self.db.lookup(target_uuid) - self.name = ost.getName() - self.format = self.db.get_val('autoformat', 'yes') - if ost.get_val('failover', 0): - self.failover_ost = 'f' - else: - self.failover_ost = 'n' - - active_uuid = get_active_target(ost) - if not active_uuid: - panic("No target device found:", target_uuid) - if active_uuid == self.uuid: - self.active = 1 - else: - self.active = 0 - if self.active and config.group and config.group != ost.get_val('group'): - self.active = 0 - - self.target_dev_uuid = self.uuid - self.uuid = target_uuid - # modules - self.add_lustre_module('ost', 'ost') - # FIXME: should we default to ext3 here? - if self.fstype: - self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype)) - self.add_lustre_module(self.osdtype, self.osdtype) - - def load_module(self): - if self.active: - Module.load_module(self) - - # need to check /proc/mounts and /etc/mtab before - # formatting anything. - # FIXME: check if device is already formatted. - def prepare(self): - if is_prepared(self.name): - return - if not self.active: - debug(self.uuid, "not active") - return - self.info(self.osdtype, self.devpath, self.size, self.fstype, - self.format, self.journal_size, self.inode_size) - run_acceptors() - if self.osdtype == 'obdecho': - blkdev = '' - else: - blkdev = block_dev(self.devpath, self.size, self.fstype, - config.reformat, self.format, self.journal_size, - self.inode_size, self.mkfsoptions) - lctl.newdev(self.osdtype, self.name, self.uuid, - setup ="%s %s %s" %(blkdev, self.fstype, - self.failover_ost)) - if not is_prepared('OSS'): - lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="") - - def osd_remaining(self): - out = lctl.device_list() - for s in out: - if string.split(s)[2] in ('obdfilter', 'obdecho'): - return 1 - - def safe_to_clean(self): - return self.active - - def safe_to_clean_modules(self): - return not self.osd_remaining() - - def cleanup(self): - if not self.active: - debug(self.uuid, "not active") - return - if is_prepared(self.name): - self.info() - try: - lctl.cleanup(self.name, self.uuid, config.force, - config.failover) - except CommandError, e: - log(self.module_name, "cleanup failed: ", self.name) - e.dump() - cleanup_error(e.rc) - if not self.osd_remaining() and is_prepared('OSS'): - try: - lctl.cleanup("OSS", "OSS_UUID", config.force, - config.failover) - except CommandError, e: - print "cleanup failed: ", self.name - e.dump() - cleanup_error(e.rc) - if not self.osdtype == 'obdecho': - clean_loop(self.devpath) - -def mgmt_uuid_for_fs(mtpt_name): - if not mtpt_name: - return '' - mtpt_db = toplevel.lookup_name(mtpt_name) - fs_uuid = mtpt_db.get_first_ref('filesystem') - fs = toplevel.lookup(fs_uuid) - if not fs: - return '' - return fs.get_first_ref('mgmt') - -# Generic client module, used by OSC and MDC -class Client(Module): - def __init__(self, tgtdb, uuid, module, fs_name, self_name=None, - module_dir=None): - self.target_name = tgtdb.getName() - self.target_uuid = tgtdb.getUUID() - self.db = tgtdb - - self.tgt_dev_uuid = get_active_target(tgtdb) - if not self.tgt_dev_uuid: - panic("No target device found for target:", self.target_name) - - self.kmod = 
kmod(config.lustre, config.portals) - self._server = None - self._connected = 0 - - self.module = module - self.module_name = string.upper(module) - if not self_name: - self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(), - self.target_name, fs_name) - else: - self.name = self_name - self.uuid = uuid - self.lookup_server(self.tgt_dev_uuid) - mgmt_uuid = mgmt_uuid_for_fs(fs_name) - if mgmt_uuid: - self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid) - else: - self.mgmt_name = '' - self.fs_name = fs_name - if not module_dir: - module_dir = module - self.add_lustre_module(module_dir, module) - - def lookup_server(self, srv_uuid): - """ Lookup a server's network information """ - self._server_nets = get_ost_net(self.db, srv_uuid) - if len(self._server_nets) == 0: - panic ("Unable to find a server for:", srv_uuid) - - def get_servers(self): - return self._server_nets - - def prepare(self, ignore_connect_failure = 0): - self.info(self.target_uuid) - if is_prepared(self.name): - self.cleanup() - try: - srv = choose_local_server(self.get_servers()) - if srv: - lctl.connect(srv) - else: - routes = find_route(self.get_servers()) - if len(routes) == 0: - panic ("no route to", self.target_uuid) - for (srv, r) in routes: - lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3]) - except CommandError, e: - if not ignore_connect_failure: - raise e - if srv: - if self.target_uuid in config.inactive and self.permits_inactive(): - debug("%s inactive" % self.target_uuid) - inactive_p = "inactive" - else: - debug("%s active" % self.target_uuid) - inactive_p = "" - lctl.newdev(self.module, self.name, self.uuid, - setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid, - inactive_p, self.mgmt_name)) - - def cleanup(self): - if is_prepared(self.name): - Module.cleanup(self) - try: - srv = choose_local_server(self.get_servers()) - if srv: - lctl.disconnect(srv) - else: - for (srv, r) in find_route(self.get_servers()): - lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3]) - except CommandError, e: - log(self.module_name, "cleanup failed: ", self.name) - e.dump() - cleanup_error(e.rc) - - -class MDC(Client): - def __init__(self, db, uuid, fs_name): - Client.__init__(self, db, uuid, 'mdc', fs_name) - - def permits_inactive(self): - return 0 - -class OSC(Client): - def __init__(self, db, uuid, fs_name): - Client.__init__(self, db, uuid, 'osc', fs_name) - - def permits_inactive(self): - return 1 - -def mgmtcli_name_for_uuid(uuid): - return 'MGMTCLI_%s' % uuid - -class ManagementClient(Client): - def __init__(self, db, uuid): - Client.__init__(self, db, uuid, 'mgmt_cli', '', - self_name = mgmtcli_name_for_uuid(db.getUUID()), - module_dir = 'mgmt') - -class COBD(Module): - def __init__(self, db): - Module.__init__(self, 'COBD', db) - self.real_uuid = self.db.get_first_ref('realobd') - self.cache_uuid = self.db.get_first_ref('cacheobd') - self.add_lustre_module('cobd' , 'cobd') - - # need to check /proc/mounts and /etc/mtab before - # formatting anything. - # FIXME: check if device is already formatted. 
- def prepare(self): - if is_prepared(self.name): - return - self.info(self.real_uuid, self.cache_uuid) - lctl.newdev("cobd", self.name, self.uuid, - setup ="%s %s" %(self.real_uuid, self.cache_uuid)) - - -# virtual interface for OSC and LOV -class VOSC(Module): - def __init__(self, db, uuid, fs_name, name_override = None): - Module.__init__(self, 'VOSC', db) - if db.get_class() == 'lov': - self.osc = LOV(db, uuid, fs_name, name_override) - else: - self.osc = get_osc(db, uuid, fs_name) - def get_uuid(self): - return self.osc.uuid - def get_name(self): - return self.osc.name - def prepare(self): - self.osc.prepare() - def cleanup(self): - self.osc.cleanup() - def load_module(self): - self.osc.load_module() - def cleanup_module(self): - self.osc.cleanup_module() - - -class ECHO_CLIENT(Module): - def __init__(self,db): - Module.__init__(self, 'ECHO_CLIENT', db) - self.add_lustre_module('obdecho', 'obdecho') - self.obd_uuid = self.db.get_first_ref('obd') - obd = self.db.lookup(self.obd_uuid) - self.uuid = generate_client_uuid(self.name) - self.osc = VOSC(obd, self.uuid, self.name) - - def prepare(self): - if is_prepared(self.name): - return - run_acceptors() - self.osc.prepare() # XXX This is so cheating. -p - self.info(self.obd_uuid) - - lctl.newdev("echo_client", self.name, self.uuid, - setup = self.osc.get_name()) - - def cleanup(self): - if is_prepared(self.name): - Module.cleanup(self) - self.osc.cleanup() - - def load_module(self): - self.osc.load_module() - Module.load_module(self) - - def cleanup_module(self): - Module.cleanup_module(self) - self.osc.cleanup_module() - - -def generate_client_uuid(name): - client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576), - name, - int(random.random() * 1048576), - int(random.random() * 1048576)) - return client_uuid[:36] - - -class Mountpoint(Module): - def __init__(self,db): - Module.__init__(self, 'MTPT', db) - self.path = self.db.get_val('path') - self.fs_uuid = self.db.get_first_ref('filesystem') - fs = self.db.lookup(self.fs_uuid) - self.mds_uuid = fs.get_first_ref('mds') - self.obd_uuid = fs.get_first_ref('obd') - self.mgmt_uuid = fs.get_first_ref('mgmt') - obd = self.db.lookup(self.obd_uuid) - client_uuid = generate_client_uuid(self.name) - self.vosc = VOSC(obd, client_uuid, self.name) - self.mdc = get_mdc(db, client_uuid, self.name, self.mds_uuid) - - self.add_lustre_module('mdc', 'mdc') - self.add_lustre_module('llite', 'llite') - if self.mgmt_uuid: - self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid), - client_uuid) - else: - self.mgmtcli = None - - def prepare(self): - if fs_is_mounted(self.path): - log(self.path, "already mounted.") - return - run_acceptors() - if self.mgmtcli: - self.mgmtcli.prepare() - self.vosc.prepare() - self.mdc.prepare() - mdc_name = self.mdc.name - - self.info(self.path, self.mds_uuid, self.obd_uuid) - if config.record or config.lctl_dump: - lctl.mount_option(local_node_name, self.vosc.get_name(), mdc_name) - return - cmd = "mount -t lustre_lite -o osc=%s,mdc=%s %s %s" % \ - (self.vosc.get_name(), mdc_name, config.config, self.path) - run("mkdir", self.path) - ret, val = run(cmd) - if ret: - self.mdc.cleanup() - self.vosc.cleanup() - panic("mount failed:", self.path, ":", string.join(val)) - - def cleanup(self): - self.info(self.path, self.mds_uuid,self.obd_uuid) - - if config.record or config.lctl_dump: - lctl.del_mount_option(local_node_name) - else: - if fs_is_mounted(self.path): - if config.force: - (rc, out) = run("umount", "-f", self.path) - else: - (rc, out) = run("umount", 
self.path) - if rc: - raise CommandError('umount', out, rc) - - if fs_is_mounted(self.path): - panic("fs is still mounted:", self.path) - - self.mdc.cleanup() - self.vosc.cleanup() - if self.mgmtcli: - self.mgmtcli.cleanup() - - def load_module(self): - if self.mgmtcli: - self.mgmtcli.load_module() - self.vosc.load_module() - Module.load_module(self) - - def cleanup_module(self): - Module.cleanup_module(self) - self.vosc.cleanup_module() - if self.mgmtcli: - self.mgmtcli.cleanup_module() - - -# ============================================================ -# misc query functions - -def get_ost_net(self, osd_uuid): - srv_list = [] - if not osd_uuid: - return srv_list - osd = self.lookup(osd_uuid) - node_uuid = osd.get_first_ref('node') - node = self.lookup(node_uuid) - if not node: - panic("unable to find node for osd_uuid:", osd_uuid, - " node_ref:", node_uuid) - for net_uuid in node.get_networks(): - db = node.lookup(net_uuid) - srv_list.append(Network(db)) - return srv_list - - -# the order of iniitailization is based on level. -def getServiceLevel(self): - type = self.get_class() - ret=0; - if type in ('network',): - ret = 5 - elif type in ('routetbl',): - ret = 6 - elif type in ('ldlm',): - ret = 20 - elif type in ('mgmt',): - ret = 25 - elif type in ('osd', 'cobd'): - ret = 30 - elif type in ('mdsdev',): - ret = 40 - elif type in ('mountpoint', 'echoclient'): - ret = 70 - else: - panic("Unknown type: ", type) - - if ret < config.minlevel or ret > config.maxlevel: - ret = 0 - return ret - -# -# return list of services in a profile. list is a list of tuples -# [(level, db_object),] -def getServices(self): - list = [] - for ref_class, ref_uuid in self.get_all_refs(): - servdb = self.lookup(ref_uuid) - if servdb: - level = getServiceLevel(servdb) - if level > 0: - list.append((level, servdb)) - else: - panic('service not found: ' + ref_uuid) - - list.sort() - return list - - -############################################################ -# MDC UUID hack - -# FIXME: clean this mess up! -# -# OSC is no longer in the xml, so we have to fake it. -# this is getting ugly and begging for another refactoring -def get_osc(ost_db, uuid, fs_name): - osc = OSC(ost_db, uuid, fs_name) - return osc - -def get_mdc(db, uuid, fs_name, mds_uuid): - mds_db = db.lookup(mds_uuid); - if not mds_db: - panic("no mds:", mds_uuid) - mdc = MDC(mds_db, uuid, fs_name) - return mdc - -############################################################ -# routing ("rooting") - -# list of (nettype, cluster_id, nid) -local_clusters = [] - -def find_local_clusters(node_db): - global local_clusters - for netuuid in node_db.get_networks(): - net = node_db.lookup(netuuid) - srv = Network(net) - debug("add_local", netuuid) - local_clusters.append((srv.net_type, srv.cluster_id, srv.nid)) - if srv.port > 0: - if acceptors.has_key(srv.port): - panic("duplicate port:", srv.port) - acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type, - srv.send_mem, srv.recv_mem, - srv.irq_affinity) - -# This node is a gateway. -is_router = 0 -def node_is_router(): - return is_router - -# If there are any routers found in the config, then this will be true -# and all nodes will load kptlrouter. -needs_router = 0 -def node_needs_router(): - return needs_router or is_router - -# list of (nettype, gw, tgt_cluster_id, lo, hi) -# Currently, these local routes are only added to kptlrouter route -# table if they are needed to connect to a specific server. 
This -# should be changed so all available routes are loaded, and the -# ptlrouter can make all the decisions. -local_routes = [] - -def find_local_routes(lustre): - """ Scan the lustre config looking for routers . Build list of - routes. """ - global local_routes, needs_router - local_routes = [] - list = lustre.lookup_class('node') - for router in list: - if router.get_val_int('router', 0): - needs_router = 1 - for (local_type, local_cluster_id, local_nid) in local_clusters: - gw = None - for netuuid in router.get_networks(): - db = router.lookup(netuuid) - if (local_type == db.get_val('nettype') and - local_cluster_id == db.get_val('clusterid')): - gw = db.get_val('nid') - break - if gw: - debug("find_local_routes: gw is", gw) - for route in router.get_local_routes(local_type, gw): - local_routes.append(route) - debug("find_local_routes:", local_routes) - - -def choose_local_server(srv_list): - for srv in srv_list: - if local_cluster(srv.net_type, srv.cluster_id): - return srv - -def local_cluster(net_type, cluster_id): - for cluster in local_clusters: - if net_type == cluster[0] and cluster_id == cluster[1]: - return 1 - return 0 - -def local_interface(net_type, cluster_id, nid): - for cluster in local_clusters: - if (net_type == cluster[0] and cluster_id == cluster[1] - and nid == cluster[2]): - return 1 - return 0 - -def find_route(srv_list): - result = [] - frm_type = local_clusters[0][0] - for srv in srv_list: - debug("find_route: srv:", srv.nid, "type: ", srv.net_type) - to_type = srv.net_type - to = srv.nid - cluster_id = srv.cluster_id - debug ('looking for route to', to_type, to) - for r in local_routes: - debug("find_route: ", r) - if (r[3] <= to and to <= r[4]) and cluster_id == r[2]: - result.append((srv, r)) - return result - -def get_active_target(db): - target_uuid = db.getUUID() - target_name = db.getName() - node_name = get_select(target_name) - if node_name: - tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid) - else: - tgt_dev_uuid = db.get_first_ref('active') - return tgt_dev_uuid - -def get_server_by_nid_uuid(db, nid_uuid): - for n in db.lookup_class("network"): - net = Network(n) - if net.nid_uuid == nid_uuid: - return net - - -############################################################ -# lconf level logic -# Start a service. -def newService(db): - type = db.get_class() - debug('Service:', type, db.getName(), db.getUUID()) - n = None - if type == 'ldlm': - n = LDLM(db) - elif type == 'lov': - n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID") - elif type == 'network': - n = Network(db) - elif type == 'routetbl': - n = RouteTable(db) - elif type == 'osd': - n = OSD(db) - elif type == 'cobd': - n = COBD(db) - elif type == 'mdsdev': - n = MDSDEV(db) - elif type == 'mountpoint': - n = Mountpoint(db) - elif type == 'echoclient': - n = ECHO_CLIENT(db) - elif type == 'mgmt': - n = Management(db) - else: - panic ("unknown service type:", type) - return n - -# -# Prepare the system to run lustre using a particular profile -# in a the configuration. -# * load & the modules -# * setup networking for the current node -# * make sure partitions are in place and prepared -# * initialize devices with lctl -# Levels is important, and needs to be enforced. 
-def for_each_profile(db, prof_list, operation):
-    for prof_uuid in prof_list:
-        prof_db = db.lookup(prof_uuid)
-        if not prof_db:
-            panic("profile:", prof_uuid, "not found.")
-        services = getServices(prof_db)
-        operation(services)
-
-def doWriteconf(services):
-    if config.nosetup:
-        return
-    for s in services:
-        if s[1].get_class() == 'mdsdev':
-            n = newService(s[1])
-            n.write_conf()
-
-def doSetup(services):
-    if config.nosetup:
-        return
-    for s in services:
-        n = newService(s[1])
-        n.prepare()
-
-def doModules(services):
-    if config.nomod:
-        return
-    for s in services:
-        n = newService(s[1])
-        n.load_module()
-
-def doCleanup(services):
-    if config.nosetup:
-        return
-    services.reverse()
-    for s in services:
-        n = newService(s[1])
-        if n.safe_to_clean():
-            n.cleanup()
-
-def doUnloadModules(services):
-    if config.nomod:
-        return
-    services.reverse()
-    for s in services:
-        n = newService(s[1])
-        if n.safe_to_clean_modules():
-            n.cleanup_module()
-
-#
-# Load profile for 
-def doHost(lustreDB, hosts):
-    global is_router, local_node_name
-    node_db = None
-    for h in hosts:
-        node_db = lustreDB.lookup_name(h, 'node')
-        if node_db:
-            break
-    if not node_db:
-        panic('No host entry found.')
-
-    local_node_name = node_db.get_val('name', 0)
-    is_router = node_db.get_val_int('router', 0)
-    lustre_upcall = node_db.get_val('lustreUpcall', '')
-    portals_upcall = node_db.get_val('portalsUpcall', '')
-    timeout = node_db.get_val_int('timeout', 0)
-    ptldebug = node_db.get_val('ptldebug', '')
-    subsystem = node_db.get_val('subsystem', '')
-
-    find_local_clusters(node_db)
-    if not is_router:
-        find_local_routes(lustreDB)
-
-    # Two-step process: (1) load modules, (2) set up lustre.
-    # If not cleaning, load modules first.
-    prof_list = node_db.get_refs('profile')
-
-    if config.write_conf:
-        for_each_profile(node_db, prof_list, doModules)
-        sys_make_devices()
-        for_each_profile(node_db, prof_list, doWriteconf)
-        for_each_profile(node_db, prof_list, doUnloadModules)
-
-    elif config.recover:
-        if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
-            raise Lustre.LconfError("--recover requires --tgt_uuid " +
-                                    "--client_uuid --conn_uuid")
-        doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
-                   config.conn_uuid)
-    elif config.cleanup:
-        if config.force:
-            # the command line can override this value
-            timeout = 5
-        # ugly hack: only need to run lctl commands for --dump
-        if config.lctl_dump or config.record:
-            for_each_profile(node_db, prof_list, doCleanup)
-            return
-
-        sys_set_timeout(timeout)
-        sys_set_ptldebug(ptldebug)
-        sys_set_subsystem(subsystem)
-        sys_set_lustre_upcall(lustre_upcall)
-        sys_set_portals_upcall(portals_upcall)
-
-        for_each_profile(node_db, prof_list, doCleanup)
-        for_each_profile(node_db, prof_list, doUnloadModules)
-
-    else:
-        # ugly hack: only need to run lctl commands for --dump
-        if config.lctl_dump or config.record:
-            sys_set_timeout(timeout)
-            sys_set_lustre_upcall(lustre_upcall)
-            for_each_profile(node_db, prof_list, doSetup)
-            return
-
-        sys_make_devices()
-        sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
-        sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
-
-        for_each_profile(node_db, prof_list, doModules)
-
-        sys_set_debug_path()
-        sys_set_ptldebug(ptldebug)
-        sys_set_subsystem(subsystem)
-        script = config.gdb_script
-        run(lctl.lctl, ' modules >', script)
-        if config.gdb:
-            log("The GDB module script is in", script)
-            # pause, so the user has time to break and
-            # load the script
-            time.sleep(5)
-        sys_set_timeout(timeout)
-
sys_set_lustre_upcall(lustre_upcall) - sys_set_portals_upcall(portals_upcall) - - for_each_profile(node_db, prof_list, doSetup) - -def doRecovery(db, lctl, tgt_uuid, client_uuid, nid_uuid): - tgt = db.lookup(tgt_uuid) - if not tgt: - raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.") - new_uuid = get_active_target(tgt) - if not new_uuid: - raise Lustre.LconfError("doRecovery: no active target found for: " + - tgt_uuid) - net = choose_local_server(get_ost_net(db, new_uuid)) - if not net: - raise Lustre.LconfError("Unable to find a connection to:" + new_uuid) - - log("Reconnecting", tgt_uuid, " to ", net.nid_uuid); - try: - oldnet = get_server_by_nid_uuid(db, nid_uuid) - if oldnet: - lctl.disconnect(oldnet) - except CommandError, e: - log("recover: disconnect", nid_uuid, "failed: ") - e.dump() - - try: - lctl.connect(net) - except CommandError, e: - log("recover: connect failed") - e.dump() - - lctl.recover(client_uuid, net.nid_uuid) - - -def setupModulePath(cmd, portals_dir = PORTALS_DIR): - base = os.path.dirname(cmd) - if development_mode(): - if not config.lustre: - debug('using objdir module paths') - config.lustre = (os.path.join(base, "..")) - # normalize the portals dir, using command line arg if set - if config.portals: - portals_dir = config.portals - dir = os.path.join(config.lustre, portals_dir) - config.portals = dir - debug('config.portals', config.portals) - elif config.lustre and config.portals: - # production mode - # if --lustre and --portals, normalize portals - # can ignore POTRALS_DIR here, since it is probly useless here - config.portals = os.path.join(config.lustre, config.portals) - debug('config.portals B', config.portals) - -def sysctl(path, val): - debug("+ sysctl", path, val) - if config.noexec: - return - try: - fp = open(os.path.join('/proc/sys', path), 'w') - fp.write(str(val)) - fp.close() - except IOError, e: - panic(str(e)) - - -def sys_set_debug_path(): - sysctl('portals/debug_path', config.debug_path) - -def sys_set_lustre_upcall(upcall): - # the command overrides the value in the node config - if config.lustre_upcall: - upcall = config.lustre_upcall - elif config.upcall: - upcall = config.upcall - if upcall: - lctl.set_lustre_upcall(upcall) - -def sys_set_portals_upcall(upcall): - # the command overrides the value in the node config - if config.portals_upcall: - upcall = config.portals_upcall - elif config.upcall: - upcall = config.upcall - if upcall: - sysctl('portals/upcall', upcall) - -def sys_set_timeout(timeout): - # the command overrides the value in the node config - if config.timeout and config.timeout > 0: - timeout = config.timeout - if timeout != None and timeout > 0: - lctl.set_timeout(timeout) - -def sys_tweak_socknal (): - if config.single_socket: - sysctl("socknal/typed", 0) - -def sys_optimize_elan (): - procfiles = ["/proc/elan/config/eventint_punt_loops", - "/proc/qsnet/elan3/config/eventint_punt_loops", - "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"] - for p in procfiles: - if os.access(p, os.R_OK): - run ("echo 0 > " + p) - -def sys_set_ptldebug(ptldebug): - if config.ptldebug: - ptldebug = config.ptldebug - if ptldebug: - try: - val = eval(ptldebug, ptldebug_names) - val = "0x%x" % (val) - sysctl('portals/debug', val) - except NameError, e: - panic(str(e)) - -def sys_set_subsystem(subsystem): - if config.subsystem: - subsystem = config.subsystem - if subsystem: - try: - val = eval(subsystem, subsystem_names) - val = "0x%x" % (val) - sysctl('portals/subsystem_debug', val) - except NameError, e: - panic(str(e)) - 
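
sys_set_ptldebug() and sys_set_subsystem() above rely on eval() to turn an expression such as "net + warning" into a numeric mask, with the name tables defined at the top of the script acting as the namespace. Here is a small self-contained sketch of that trick, using only a few illustrative flag bits rather than the full ptldebug_names table.

    # Mask-building sketch: evaluate a flag expression against a name -> bit
    # table, the way the ptldebug/subsystem sysctl values are computed.
    names = {
        "trace": (1 << 0),    # a few illustrative bits only; the real table
        "net":   (1 << 9),    # is ptldebug_names at the top of the script
        "error": (1 << 17),
    }

    def debug_mask(expr, table):
        return "0x%x" % eval(expr, {}, table)

    if __name__ == '__main__':
        print(debug_mask("net + error", names))    # 0x20200
        print(debug_mask("trace | net", names))    # 0x201

A misspelled flag name surfaces as a NameError, which is exactly what the except NameError branches above turn into a panic().
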
-def sys_set_netmem_max(path, max): - debug("setting", path, "to at least", max) - if config.noexec: - return - fp = open(path) - str = fp.readline() - fp.close() - cur = int(str) - if max > cur: - fp = open(path, 'w') - fp.write('%d\n' %(max)) - fp.close() - - -def sys_make_devices(): - if not os.access('/dev/portals', os.R_OK): - run('mknod /dev/portals c 10 240') - if not os.access('/dev/obd', os.R_OK): - run('mknod /dev/obd c 10 241') - - -# Add dir to the global PATH, if not already there. -def add_to_path(new_dir): - syspath = string.split(os.environ['PATH'], ':') - if new_dir in syspath: - return - os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir - -def default_debug_path(): - path = '/tmp/lustre-log' - if os.path.isdir('/r'): - return '/r' + path - else: - return path - -def default_gdb_script(): - script = '/tmp/ogdb' - if os.path.isdir('/r'): - return '/r' + script - else: - return script - - -DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin') -# ensure basic elements are in the system path -def sanitise_path(): - for dir in DEFAULT_PATH: - add_to_path(dir) - -# global hack for the --select handling -tgt_select = {} -def init_select(args): - # args = [service=nodeA,service2=nodeB service3=nodeC] - global tgt_select - for arg in args: - list = string.split(arg, ',') - for entry in list: - srv, node = string.split(entry, '=') - tgt_select[srv] = node - -def get_select(srv): - if tgt_select.has_key(srv): - return tgt_select[srv] - return None - - -FLAG = Lustre.Options.FLAG -PARAM = Lustre.Options.PARAM -INTPARAM = Lustre.Options.INTPARAM -PARAMLIST = Lustre.Options.PARAMLIST -lconf_options = [ - ('verbose,v', "Print system commands as they are run"), - ('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM), - ('config', "Cluster config name used for LDAP query", PARAM), - ('select', "service=nodeA,service2=nodeB ", PARAMLIST), - ('node', "Load config for ", PARAM), - ('cleanup,d', "Cleans up config. (Shutdown)"), - ('force,f', "Forced unmounting and/or obd detach during cleanup", - FLAG, 0), - ('single_socket', "socknal option: only use one socket instead of bundle", - FLAG, 0), - ('failover',"""Used to shut down without saving state. - This will allow this node to "give up" a service to a - another node for failover purposes. This will not - be a clean shutdown.""", - FLAG, 0), - ('gdb', """Prints message after creating gdb module script - and sleeps for 5 seconds."""), - ('noexec,n', """Prints the commands and steps that will be run for a - config without executing them. 
This can be used to check if a
-          config file is doing what it should be doing"""),
-    ('nomod', "Skip load/unload module step."),
-    ('nosetup', "Skip device setup/cleanup step."),
-    ('reformat', "Reformat all devices (without question)"),
-    ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
-    ('dump', "Dump the kernel debug log to file before portals is unloaded",
-     PARAM),
-    ('write_conf', "Save all the client config information on mds."),
-    ('record', "Write config information on mds."),
-    ('record_log', "Name of config record log.", PARAM),
-    ('record_device', "MDS device name that will record the config commands",
-     PARAM),
-    ('minlevel', "Minimum level of services to configure/cleanup",
-     INTPARAM, 0),
-    ('maxlevel', """Maximum level of services to configure/cleanup
-                    Levels are approximately:
-                       10 - network
-                       20 - device, ldlm
-                       30 - osd, mdd
-                       40 - mds, ost
-                       70 - mountpoint, echo_client, osc, mdc, lov""",
-     INTPARAM, 100),
-    ('lustre', """Base directory of lustre sources. This parameter will
-                  cause lconf to load modules from a source tree.""", PARAM),
-    ('portals', """Portals source directory.  If this is a relative path,
-                   then it is assumed to be relative to lustre.""", PARAM),
-    ('timeout', "Set recovery timeout", INTPARAM),
-    ('upcall', "Set both portals and lustre upcall script", PARAM),
-    ('lustre_upcall', "Set lustre upcall script", PARAM),
-    ('portals_upcall', "Set portals upcall script", PARAM),
-    ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
-    ('ptldebug', "Set the portals debug level", PARAM),
-    ('subsystem', "Set the portals debug subsystem", PARAM),
-    ('gdb_script', "Full name of gdb debug script", PARAM, default_gdb_script()),
-    ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
-# Client recovery options
-    ('recover', "Recover a device"),
-    ('group', "The group of devices to configure or cleanup", PARAM),
-    ('tgt_uuid', "The failed target (required for recovery)", PARAM),
-    ('client_uuid', "The failed client (required for recovery)", PARAM),
-    ('conn_uuid', "The failed connection (required for recovery)", PARAM),
-
-    ('inactive', """The name of an inactive service, to be ignored during
-                    mounting (currently OST-only). Can be repeated.""",
-     PARAMLIST),
-    ]
-
-def main():
-    global lctl, config, toplevel, CONFIG_FILE
-
-    # in the upcall this is set to SIG_IGN
-    signal.signal(signal.SIGCHLD, signal.SIG_DFL)
-
-    cl = Lustre.Options("lconf", "config.xml", lconf_options)
-    try:
-        config, args = cl.parse(sys.argv[1:])
-    except Lustre.OptionError, e:
-        print e
-        sys.exit(1)
-
-    setupModulePath(sys.argv[0])
-
-    host = socket.gethostname()
-
-    # The PRNG is normally seeded with time(), which is not so good for
-    # starting time-synchronized clusters.
-    input = open('/dev/urandom', 'r')
-    if not input:
-        print 'Unable to open /dev/urandom!'
- sys.exit(1) - seed = input.read(32) - input.close() - random.seed(seed) - - sanitise_path() - - init_select(config.select) - - if len(args) > 0: - # allow config to be fetched via HTTP, but only with python2 - if sys.version[0] != '1' and args[0].startswith('http://'): - import urllib2 - try: - config_file = urllib2.urlopen(args[0]) - except (urllib2.URLError, socket.error), err: - if hasattr(err, 'args'): - err = err.args[1] - print "Could not access '%s': %s" %(args[0], err) - sys.exit(1) - elif not os.access(args[0], os.R_OK): - print 'File not found or readable:', args[0] - sys.exit(1) - else: - # regular file - config_file = open(args[0], 'r') - try: - dom = xml.dom.minidom.parse(config_file) - except Exception: - panic("%s does not appear to be a config file." % (args[0])) - sys.exit(1) # make sure to die here, even in debug mode. - CONFIG_FILE = args[0] - db = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement) - if not config.config: - config.config = os.path.basename(args[0])# use full path? - if config.config[-4:] == '.xml': - config.config = config.config[:-4] - elif config.ldapurl: - if not config.config: - panic("--ldapurl requires --config name") - dn = "config=%s,fs=lustre" % (config.config) - db = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl) - elif config.ptldebug or config.subsystem: - sys_set_ptldebug(None) - sys_set_subsystem(None) - sys.exit(0) - else: - print 'Missing config file or ldap URL.' - print 'see lconf --help for command summary' - sys.exit(1) - - toplevel = db - - ver = db.get_version() - if not ver: - panic("No version found in config data, please recreate.") - if ver != Lustre.CONFIG_VERSION: - panic("Config version", ver, "does not match lconf version", - Lustre.CONFIG_VERSION) - - node_list = [] - if config.node: - node_list.append(config.node) - else: - if len(host) > 0: - node_list.append(host) - node_list.append('localhost') - - debug("configuring for host: ", node_list) - - if len(host) > 0: - config.debug_path = config.debug_path + '-' + host - config.gdb_script = config.gdb_script + '-' + host - - lctl = LCTLInterface('lctl') - - if config.lctl_dump: - lctl.use_save_file(config.lctl_dump) - - if config.record: - if not (config.record_device and config.record_log): - panic("When recording, both --record_log and --record_device must be specified.") - lctl.clear_log(config.record_device, config.record_log) - lctl.record(config.record_device, config.record_log) - - doHost(db, node_list) - - if config.record: - lctl.end_record() - -if __name__ == "__main__": - try: - main() - except Lustre.LconfError, e: - print e -# traceback.print_exc(file=sys.stdout) - sys.exit(1) - except CommandError, e: - e.dump() - sys.exit(e.rc) - - if first_cleanup_error: - sys.exit(first_cleanup_error) -- 1.8.3.1
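
For reference, the config-loading branch of main() accepts either a local XML file or, under Python 2, an http:// URL, and hands the parsed DOM to Lustre.LustreDB_XML. The sketch below reproduces just that input handling in isolation; the 'version' attribute read at the end is only a stand-in for db.get_version(), whose real lookup lives in the Lustre module rather than in this file.

    # Config-source sketch: open a local file or (under Python 2) an http://
    # URL and parse it with minidom, mirroring the branch in main().
    import os, sys
    import xml.dom.minidom

    def open_config(source):
        if source.startswith('http://'):
            import urllib2              # Python 2 era, like the rest of lconf
            return urllib2.urlopen(source)
        if not os.access(source, os.R_OK):
            raise IOError('config not found or not readable: %s' % source)
        return open(source, 'r')

    def load_config(source):
        fp = open_config(source)
        try:
            dom = xml.dom.minidom.parse(fp)
        finally:
            fp.close()
        root = dom.documentElement
        return root, root.getAttribute('version')   # illustrative attribute

    if __name__ == '__main__':
        root, ver = load_config(sys.argv[1])
        print('root element %s, version %r' % (root.tagName, ver))
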