import sys, getopt, types
import string, os, stat, popen2, socket, time, random, fcntl, select
-import re, exceptions
+import re, exceptions, signal
import xml.dom.minidom
if sys.version[0] == '1':
else:
from fcntl import F_GETFL, F_SETFL
+PYMOD_DIR = "/usr/lib/lustre/python"
+
+def development_mode():
+ """Return 1 when running from a source checkout (Makefile.am is readable next to the script), else 0."""
+ base = os.path.dirname(sys.argv[0])
+ if os.access(base+"/Makefile.am", os.R_OK):
+ return 1
+ return 0
+
+if not development_mode():
+ sys.path.append(PYMOD_DIR)
+
+import Lustre
+
# Global parameters
MAXTCPBUF = 1048576
DEFAULT_TCPBUF = 1048576
# Maximum number of devices to search for.
# (the /dev/loop* nodes need to be created beforehand)
MAX_LOOP_DEVICES = 256
-PORTALS_DIR = '@PORTALSLOC@'
+PORTALS_DIR = 'portals'
+
+
+# Please keep these up to date with the values in portals/kp30.h
+# Each name selects one bit in the low 24 bits of the kernel debug mask.
+ptldebug_names = { 
+ "trace" : (1 << 0),
+ "inode" : (1 << 1),
+ "super" : (1 << 2),
+ "ext2" : (1 << 3),
+ "malloc" : (1 << 4),
+ "cache" : (1 << 5),
+ "info" : (1 << 6),
+ "ioctl" : (1 << 7),
+ "blocks" : (1 << 8),
+ "net" : (1 << 9),
+ "warning" : (1 << 10),
+ "buffs" : (1 << 11),
+ "other" : (1 << 12),
+ "dentry" : (1 << 13),
+ "portals" : (1 << 14),
+ "page" : (1 << 15),
+ "dlmtrace" : (1 << 16),
+ "error" : (1 << 17),
+ "emerg" : (1 << 18),
+ "ha" : (1 << 19),
+ "rpctrace" : (1 << 20),
+ "vfstrace" : (1 << 21),
+ }
+
+# Subsystem ids live in the top byte of the debug mask (value << 24).
+subsystem_names = {
+ "undefined" : (0 << 24),
+ "mdc" : (1 << 24),
+ "mds" : (2 << 24),
+ "osc" : (3 << 24),
+ "ost" : (4 << 24),
+ "class" : (5 << 24),
+ "obdfs" : (6 << 24),
+ "llite" : (7 << 24),
+ "rpc" : (8 << 24),
+ "ext2obd" : (9 << 24),
+ "portals" : (10 << 24),
+ "socknal" : (11 << 24),
+ "qswnal" : (12 << 24),
+ "pinger" : (13 << 24),
+ "filter" : (14 << 24),
+ "trace" : (15 << 24),
+ "echo" : (16 << 24),
+ "ldlm" : (17 << 24),
+ "lov" : (18 << 24),
+ "gmnal" : (19 << 24),
+ "ptlrouter" : (20 << 24),
+ "cobd" : (21 << 24),
+ "ptlbd" : (22 << 24),
+ }
+
first_cleanup_error = 0
def cleanup_error(rc):
if not first_cleanup_error:
first_cleanup_error = rc
-
-def usage():
- print """usage: lconf config.xml
-
-config.xml Lustre configuration in xml format.
---ldapurl LDAP server URL, eg. ldap://localhost
---config Cluster config name used for LDAP query
---node <nodename> Load config for <nodename>
---select service=nodeA,service2=nodeB U
--d | --cleanup Cleans up config. (Shutdown)
--f | --force Forced unmounting and/or obd detach during cleanup
--v | --verbose Print system commands as they are run
--h | --help Print this help
---gdb Prints message after creating gdb module script
- and sleeps for 5 seconds.
--n | --noexec Prints the commands and steps that will be run for a
- config without executing them. This can used to check if a
- config file is doing what it should be doing. (Implies -v)
---nomod Skip load/unload module step.
---nosetup Skip device setup/cleanup step.
---reformat Reformat all devices (without question)
---dump <file> Dump the kernel debug log before portals is unloaded
---minlevel <num> Specify the minimum level of services to configure/cleanup (default 0)
---maxlevel <num> Specify the maximum level of services to configure/cleanup (default 100)
- Levels are aproximatly like:
- 10 - network
- 20 - device, ldlm
- 30 - osd, mdd
- 40 - mds, ost
- 50 - mdc, osc
- 60 - lov
- 70 - mountpoint, echo_client
---lustre=src_dir Base directory of lustre sources. This parameter will cause lconf
- to load modules from a source tree.
---portals=src_dir Portals source directory. If this is a relative path, then it is
- assumed to be relative to lustre.
-
-"""
- TODO = """
---ldap server LDAP server with lustre config database
---makeldiff Translate xml source to LDIFF
-This are perhaps not needed:
-"""
- sys.exit()
-
-# ============================================================
-# Config parameters, encapsulated in a class
-class Config:
- def __init__(self):
- # flags
- self._noexec = 0
- self._verbose = 0
- self._reformat = 0
- self._cleanup = 0
- self._gdb = 0
- self._nomod = 0
- self._nosetup = 0
- self._force = 0
- # parameters
- self._modules = None
- self._node = None
- self._url = None
- self._gdb_script = '/tmp/ogdb'
- self._debug_path = '/tmp/lustre-log'
- self._dump_file = None
- self._lustre_dir = ''
- self._portals_dir = ''
- self._minlevel = 0
- self._maxlevel = 100
- self._timeout = 0
- self._recovery_upcall = ''
- self._ldapurl = ''
- self._config_name = ''
- self._select = {}
- self._lctl_dump = ''
-
- def verbose(self, flag = None):
- if flag: self._verbose = flag
- return self._verbose
-
- def noexec(self, flag = None):
- if flag: self._noexec = flag
- return self._noexec
-
- def reformat(self, flag = None):
- if flag: self._reformat = flag
- return self._reformat
-
- def cleanup(self, flag = None):
- if flag: self._cleanup = flag
- return self._cleanup
-
- def gdb(self, flag = None):
- if flag: self._gdb = flag
- return self._gdb
-
- def nomod(self, flag = None):
- if flag: self._nomod = flag
- return self._nomod
-
- def nosetup(self, flag = None):
- if flag: self._nosetup = flag
- return self._nosetup
-
- def force(self, flag = None):
- if flag: self._force = flag
- return self._force
-
- def node(self, val = None):
- if val: self._node = val
- return self._node
-
- def gdb_script(self):
- if os.path.isdir('/r'):
- return '/r' + self._gdb_script
- else:
- return self._gdb_script
-
- def debug_path(self):
- if os.path.isdir('/r'):
- return '/r' + self._debug_path
- else:
- return self._debug_path
-
- def dump_file(self, val = None):
- if val: self._dump_file = val
- return self._dump_file
- def minlevel(self, val = None):
- if val: self._minlevel = int(val)
- return self._minlevel
-
- def maxlevel(self, val = None):
- if val: self._maxlevel = int(val)
- return self._maxlevel
-
- def portals_dir(self, val = None):
- if val: self._portals_dir = val
- return self._portals_dir
-
- def lustre_dir(self, val = None):
- if val: self._lustre_dir = val
- return self._lustre_dir
-
- def timeout(self, val = None):
- if val: self._timeout = val
- return self._timeout
-
- def recovery_upcall(self, val = None):
- if val: self._recovery_upcall = val
- return self._recovery_upcall
-
- def ldapurl(self, val = None):
- if val: self._ldapurl = val
- return self._ldapurl
-
- def config_name(self, val = None):
- if val: self._config_name = val
- return self._config_name
-
- def init_select(self, arg):
- # arg = "service=nodeA,service2=nodeB"
- list = string.split(arg, ',')
- for entry in list:
- srv, node = string.split(entry, '=')
- self._select[srv] = node
-
- def select(self, srv):
- if self._select.has_key(srv):
- return self._select[srv]
- return None
-
- def lctl_dump(self, val = None):
- if val: self._lctl_dump = val
- return self._lctl_dump
-
-
-config = Config()
-
# ============================================================
# debugging and error funcs
def fixme(msg = "this feature"):
- raise LconfError, msg + ' not implmemented yet.'
+ # always raises: marks features that are not implemented yet
+ raise Lustre.LconfError, msg + ' not implemented yet.'
def panic(*args):
msg = string.join(map(str,args))
- if not config.noexec():
- raise LconfError(msg)
+ if not config.noexec:
+ raise Lustre.LconfError(msg)
else:
print "! " + msg
print string.strip(s)
def debug(*args):
- if config.verbose():
+ if config.verbose:
msg = string.join(map(str,args))
print msg
+
+# int() does not accept '0x123' with its default base, but an explicit
+# base 16 does -- no need for the eval() hack, which could run code.
+def my_int(s):
+    """Convert a decimal or '0x'-prefixed hex string to an int.
+
+    Raises ValueError if s is not a valid number.
+    """
+    if s[0:2] == '0x':
+        return int(s, 16)
+    return int(s)
+
# ============================================================
# locally defined exceptions
class CommandError (exceptions.Exception):
else:
print self.cmd_err
-class LconfError (exceptions.Exception):
- def __init__(self, args):
- self.args = args
-
# ============================================================
# handle daemons, like the acceptor
if not daemon.running():
daemon.start()
+def run_one_acceptor(port):
+ # start the acceptor daemon registered for this port; panic if none is registered
+ if acceptors.has_key(port):
+ daemon = acceptors[port]
+ if not daemon.running():
+ daemon.start()
+ else:
+ panic("run_one_acceptor: No acceptor defined for port:", port)
+
def stop_acceptor(port):
if acceptors.has_key(port):
daemon = acceptors[port]
self.lctl = find_prog(cmd)
self.save_file = ''
if not self.lctl:
- if config.noexec():
+ if config.noexec:
debug('! lctl not found')
self.lctl = 'lctl'
else:
cmds = '\n dump ' + self.save_file + cmds
debug("+", cmd_line, cmds)
- if config.noexec(): return (0, [])
+ if config.noexec: return (0, [])
child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
child.tochild.write(cmds + "\n")
def network(self, net, nid):
""" initialized network and add "self" """
- # Idea: "mynid" could be used for all network types to add "self," and then
- # this special case would be gone and the "self" hack would be hidden.
- if net in ('tcp', 'toe'):
- cmds = """
+ cmds = """
network %s
mynid %s
quit """ % (net, nid)
- self.run(cmds)
+ self.run(cmds)
# create a new connection
def connect(self, srv):
cmds = "\n add_uuid %s %s %s" % (srv.uuid, srv.nid, srv.net_type)
- if srv.net_type in ('tcp', 'toe') and not config.lctl_dump():
+ if srv.net_type in ('tcp', 'toe') and not config.lctl_dump:
flags = ''
if srv.irq_affinity:
flags = flags + 'i'
cmds = cmds + "\n quit"
self.run(cmds)
+
+ # Recover a device
+ def recover(self, dev_uuid, new_conn):
+ # runs "device %<dev_uuid>; probe; recover <new_conn>" through lctl
+ cmds = """
+ device %%%s
+ probe
+ recover %s""" %(dev_uuid, new_conn)
+ self.run(cmds)
# add a route to a range
def add_route(self, net, gw, lo, hi):
quit""" % (net, nid, servuuid)
self.run(cmds)
+ def del_uuid(self, servuuid):
+ # remove the uuid mapping (counterpart of add_uuid); ignore_errors
+ # lets cleanup continue if the uuid is already gone
+ cmds = """
+ ignore_errors
+ del_uuid %s
+ quit""" % (servuuid,)
+ self.run(cmds)
+
# disconnect all
def disconnectAll(self, net):
cmds = """
self.run(cmds)
# cleanup a device
- def cleanup(self, name, uuid):
+ def cleanup(self, name, uuid, force, failover = 0):
+ if failover: force = 1
cmds = """
ignore_errors
device $%s
- cleanup %s
+ cleanup %s %s
detach
- quit""" % (name, ('', 'force')[config.force()])
+ quit""" % (name, ('', 'force')[force],
+ ('', 'failover')[failover])
self.run(cmds)
# create an lov
- def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist):
+ def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off,
+ pattern, devlist):
cmds = """
device $%s
probe
# get list of devices
def device_list(self):
- rc, out = self.runcmd('device_list')
+ try:
+ rc, out = self.runcmd('device_list')
+ except CommandError, e:
+ if config.cleanup:
+ out = []
+ else:
+ raise e
return out
# get lustre version
rc, out = self.runcmd('version')
return out
+ # dump mount options
+ def mount_option(self, option):
+ # forward "mount_option <option>" to lctl
+ cmds = """
+ mount_option %s
+ quit""" % (option)
+ self.run(cmds)
# ============================================================
# Various system-level functions
# (ideally moved to their own module)
# save it if necessary
def runcmd(cmd):
debug ("+", cmd)
- if config.noexec(): return (0, [])
+ if config.noexec: return (0, [])
f = os.popen(cmd + ' 2>&1')
out = f.readlines()
ret = f.close()
def run_daemon(*args):
cmd = string.join(map(str,args))
debug ("+", cmd)
- if config.noexec(): return 0
+ if config.noexec: return 0
f = os.popen(cmd + ' 2>&1')
ret = f.close()
if ret:
syspath = string.split(os.environ['PATH'], ':')
cmdpath = os.path.dirname(sys.argv[0])
syspath.insert(0, cmdpath);
- if config.portals_dir():
- syspath.insert(0, os.path.join(config.portals_dir()+'/linux/utils/'))
+ if config.portals:
+ syspath.insert(0, os.path.join(config.portals, 'utils/'))
for d in syspath:
prog = os.path.join(d,cmd)
if os.access(prog, os.X_OK):
# build fs according to type
# fixme: dangerous
-def mkfs(dev, devsize, fstype):
+def mkfs(dev, devsize, fstype,jsize):
block_cnt = ''
+ jopt = ''
if devsize:
+ if devsize < 8000:
+ panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
+ (dev, devsize))
# devsize is in 1k, and fs block count is in 4k
block_cnt = devsize/4
- if(fstype in ('ext3', 'extN')):
+ if fstype in ('ext3', 'extN'):
+ # ext3 journal size is in megabytes
+ if jsize: jopt = "-J size=%d" %(jsize,)
mkfs = 'mkfs.ext2 -j -b 4096 -F '
- elif (fstype == 'reiserfs'):
+ elif fstype == 'reiserfs':
+ # reiserfs journal size is in blocks
+ if jsize: jopt = "--journal_size %d" %(jsize,)
mkfs = 'mkreiserfs -ff'
else:
print 'unsupported fs type: ', fstype
- (ret, out) = run (mkfs, dev, block_cnt)
+ (ret, out) = run (mkfs, jopt, dev, block_cnt)
if ret:
- panic("Unable to build fs:", dev)
+ panic("Unable to build fs:", dev, string.join(out))
# enable hash tree indexing on fsswe
- # FIXME: this check can probably go away on 2.5
- if fstype == 'extN':
+ if fstype in ('ext3', 'extN'):
htree = 'echo "feature FEATURE_C5" | debugfs -w'
(ret, out) = run (htree, dev)
if ret:
dev = loop + str(n)
if os.access(dev, os.R_OK):
(stat, out) = run('losetup', dev)
- if (out and stat == 0):
+ if out and stat == 0:
m = re.search(r'\((.*)\)', out[0])
if m and file == m.group(1):
return dev
return ''
# create file if necessary and assign the first free loop device
-def init_loop(file, size, fstype):
+def init_loop(file, size, fstype, journal_size):
dev = find_loop(file)
if dev:
print 'WARNING file:', file, 'already mapped to', dev
return dev
- if config.reformat() or not os.access(file, os.R_OK | os.W_OK):
+ if config.reformat or not os.access(file, os.R_OK | os.W_OK):
if size < 8000:
- panic(file, "size must be larger than 8MB, currently set to:", size)
+ panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (file,size))
(ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size,
file))
if ret:
panic("Unable to create backing store:", file)
+ mkfs(file, size, fstype, journal_size)
loop = loop_base()
# find next free loop
dev = loop + str(n)
if os.access(dev, os.R_OK):
(stat, out) = run('losetup', dev)
- if (stat):
+ if stat:
run('losetup', dev, file)
return dev
else:
return 0
# initialize a block device if needed
-def block_dev(dev, size, fstype, format):
- if config.noexec(): return dev
+def block_dev(dev, size, fstype, format, journal_size):
+ if config.noexec: return dev
if not is_block(dev):
- dev = init_loop(dev, size, fstype)
- if config.reformat() or (need_format(fstype, dev) and format == 'yes'):
- mkfs(dev, size, fstype)
+ dev = init_loop(dev, size, fstype, journal_size)
+ elif config.reformat or (need_format(fstype, dev) and format == 'yes'):
+ mkfs(dev, size, fstype, journal_size)
# else:
# panic("device:", dev,
return ip
def get_local_nid(net_type, wildcard):
- """Return the local nid. First look for an elan interface,
- then use the local address. """
+ """Return the local nid."""
local = ""
if os.access('/proc/elan/device0/position', os.R_OK):
local = get_local_address('elan', '*')
log(e)
elif net_type == 'gm':
fixme("automatic local address for GM")
+ elif net_type == 'scimac':
+ scinode="/opt/scali/sbin/scinode"
+ if os.path.exists(scinode):
+ (rc,local) = run(scinode)
+ else:
+ panic (scinode, " not found on node with scimac networking")
+ if rc:
+ panic (scinode, " failed")
+ local=string.rstrip(local[0])
+
return local
def is_prepared(uuid):
"""Return true if a device exists for the uuid"""
- # expect this format:
- # 1 UP ldlm ldlm ldlm_UUID 2
- if config.lctl_dump():
+ if config.lctl_dump:
return 0
+ if config.noexec and config.cleanup:
+ return 1
try:
+ # expect this format:
+ # 1 UP ldlm ldlm ldlm_UUID 2
out = lctl.device_list()
for s in out:
if uuid == string.split(s)[4]:
e.dump()
return 0
-def is_network_prepared():
- """If the PTLRPC device exists, then assumet that all networking
- has been configured"""
- if config.lctl_dump():
+def is_prepared_name(name):
+ """Return true if a device exists for the name"""
+ if config.lctl_dump:
return 0
+ if config.noexec and config.cleanup:
+ return 1
try:
+ # expect this format:
+ # 1 UP ldlm ldlm ldlm_UUID 2
out = lctl.device_list()
for s in out:
- if 'RPCDEV_UUID' == string.split(s)[4]:
+ if name == string.split(s)[3]:
return 1
except CommandError, e:
e.dump()
return 0
-
+
+def is_network_prepared():
+ """If the LDLM device exists, then assume that all networking
+ has been configured"""
+ return is_prepared('ldlm_UUID')
def fs_is_mounted(path):
"""Return true if path is a mounted lustre filesystem"""
""" default cleanup, used for most modules """
self.info()
try:
- lctl.cleanup(self.name, self.uuid)
+ lctl.cleanup(self.name, self.uuid, config.force)
except CommandError, e:
log(self.module_name, "cleanup failed: ", self.name)
e.dump()
def add_portals_module(self, dev_dir, modname):
"""Append a module to list of modules to load."""
- self.kmodule_list.append((config.portals_dir(), dev_dir, modname))
+ self.kmodule_list.append((config.portals, dev_dir, modname))
def add_lustre_module(self, dev_dir, modname):
"""Append a module to list of modules to load."""
- self.kmodule_list.append((config.lustre_dir(), dev_dir, modname))
+ self.kmodule_list.append((config.lustre, dev_dir, modname))
def mod_loaded(self, modname):
"""Check if a module is already loaded. Look in /proc/modules for it."""
"""Load all the modules in the list in the order they appear."""
for src_dir, dev_dir, mod in self.kmodule_list:
# (rc, out) = run ('/sbin/lsmod | grep -s', mod)
- if self.mod_loaded(mod) and not config.noexec():
+ if self.mod_loaded(mod) and not config.noexec:
continue
- log ('loading module:', mod)
+ log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
if src_dir:
module = find_module(src_dir, dev_dir, mod)
if not module:
def cleanup_module(self):
"""Unload the modules in the list in reverse order."""
+ if not self.safe_to_clean():
+ return
rev = self.kmodule_list
rev.reverse()
for src_dir, dev_dir, mod in rev:
- if not self.mod_loaded(mod):
+ if not self.mod_loaded(mod) and not config.noexec:
continue
# debug hack
- if mod == 'portals' and config.dump_file():
- lctl.dump(config.dump_file())
+ if mod == 'portals' and config.dump:
+ lctl.dump(config.dump)
log('unloading module:', mod)
- if config.noexec():
- continue
(rc, out) = run('/sbin/rmmod', mod)
if rc:
log('! unable to unload module:', mod)
logall(out)
+
+ # Default: cleanup is always allowed; subclasses override to veto
+ # cleanup while their device is still in use.
+ def safe_to_clean(self):
+ return 1
+
+ def safe_to_clean_modules(self):
+ # module unload is gated on the same check by default
+ return self.safe_to_clean()
class Network(Module):
def __init__(self,db):
Module.__init__(self, 'NETWORK', db)
self.net_type = self.db.get_val('nettype')
self.nid = self.db.get_val('nid', '*')
+ self.cluster_id = self.db.get_val('clusterid', "0")
self.port = self.db.get_val_int('port', 0)
self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
self.nid_exchange = self.db.get_val_int('nidexchange', 0)
if '*' in self.nid:
- self.nid = get_local_nid(self.net_type, self.nid)
+ if self.nid_exchange:
+ self.nid = get_local_nid(self.net_type, self.nid)
+ else:
+ self.nid = get_local_address(self.net_type, self.nid)
if not self.nid:
panic("unable to set nid for", self.net_type, self.nid)
debug("nid:", self.nid)
if not self.nid:
panic("unable to set nid for", self.net_type, self.hostaddr)
debug("hostaddr:", self.hostaddr)
- # debug ( "hostaddr ", self.hostaddr, "net_type", self.net_type)
- self.add_portals_module("linux/oslib", 'portals')
+ self.add_portals_module("libcfs", 'portals')
if node_needs_router():
- self.add_portals_module("linux/router", 'kptlrouter')
+ self.add_portals_module("router", 'kptlrouter')
if self.net_type == 'tcp':
- self.add_portals_module("linux/socknal", 'ksocknal')
+ self.add_portals_module("knals/socknal", 'ksocknal')
if self.net_type == 'toe':
- self.add_portals_module("/linux/toenal", 'ktoenal')
+ self.add_portals_module("knals/toenal", 'ktoenal')
if self.net_type == 'elan':
- self.add_portals_module("/linux/rqswnal", 'kqswnal')
+ self.add_portals_module("knals/qswnal", 'kqswnal')
if self.net_type == 'gm':
- self.add_portals_module("/linux/gmnal", 'kgmnal')
- self.add_lustre_module('obdclass', 'obdclass')
+ self.add_portals_module("knals/gmnal", 'kgmnal')
+ if self.net_type == 'scimac':
+ self.add_portals_module("knals/scimacnal", 'kscimacnal')
def prepare(self):
if is_network_prepared():
return
self.info(self.net_type, self.nid, self.port)
lctl.network(self.net_type, self.nid)
+ if self.port and node_is_router():
+ run_one_acceptor(self.port)
+ self.connect_peer_gateways()
+
+ def connect_peer_gateways(self):
+ # Connect to each router node on our cluster/net type; only the peer
+ # whose nid sorts lower initiates, so each pair connects exactly once.
+ for router in self.db.lookup_class('node'):
+ if router.get_val_int('router', 0):
+ # if this is a peer with a nid less than mine,
+ # then connect.
+ for netuuid in router.get_networks():
+ net = self.db.lookup(netuuid)
+ gw = Network(net)
+ if (gw.cluster_id == self.cluster_id and
+ gw.net_type == self.net_type):
+ # hack: compare as numbers if possible, this should all
+ # go away once autoconnect is done.
+ # This also conveniently prevents us from connecting to ourself.
+ try:
+ gw_nid = my_int(gw.nid)
+ self_nid = my_int(self.nid)
+ except ValueError, e:
+ print "Error!", str(e)
+ gw_nid = gw.nid
+ self_nid = self.nid
+ if gw_nid < self_nid:
+ lctl.connect(gw)
+
+    def disconnect_peer_gateways(self):
+        """Tear down the gateway connections made by connect_peer_gateways.
+
+        Mirrors connect_peer_gateways(): only the peer with the lower nid
+        owns the connection, so only it issues the disconnect.
+        """
+        for router in self.db.lookup_class('node'):
+            if router.get_val_int('router', 0):
+                # walk the router's networks to build the gateway Network
+                # object -- without this loop 'gw' was never defined and
+                # this method raised NameError at runtime.
+                for netuuid in router.get_networks():
+                    net = self.db.lookup(netuuid)
+                    gw = Network(net)
+                    if (gw.cluster_id == self.cluster_id and
+                        gw.net_type == self.net_type):
+                        # hack: compare as numbers if possible, this should
+                        # all go away once autoconnect is done.
+                        try:
+                            gw_nid = my_int(gw.nid)
+                            self_nid = my_int(self.nid)
+                        except ValueError, e:
+                            print "Error!", str(e)
+                            gw_nid = gw.nid
+                            self_nid = self.nid
+                        if gw_nid < self_nid:
+                            try:
+                                # disconnect the gateway peer, not the raw
+                                # node db entry (which has no net attrs)
+                                lctl.disconnect(gw.net_type, gw.nid,
+                                                gw.port, gw.uuid)
+                            except CommandError, e:
+                                print "disconnect failed: ", self.name
+                                e.dump()
+                                cleanup_error(e.rc)
+
+ def safe_to_clean(self):
+ # refuse cleanup while the network is still configured
+ return not is_network_prepared()
def cleanup(self):
self.info(self.net_type, self.nid, self.port)
- if self.net_type in ('tcp', 'toe'):
+ if self.port:
stop_acceptor(self.port)
+ if node_is_router():
+ self.disconnect_peer_gateways()
try:
lctl.disconnectAll(self.net_type)
except CommandError, e:
e.dump()
cleanup_error(e.rc)
-class Router(Module):
+class RouteTable(Module):
def __init__(self,db):
- Module.__init__(self, 'ROUTER', db)
+ Module.__init__(self, 'ROUTES', db)
def prepare(self):
if is_network_prepared():
return
self.info()
- for net_type, gw, lo, hi in self.db.get_route_tbl():
+ for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
lctl.add_route(net_type, gw, lo, hi)
- if net_type in ('tcp', 'toe') and local_net_type(net_type) and hi == '':
+ if net_type in ('tcp', 'toe') and local_net_type(net_type) and lo == hi:
srvdb = self.db.nid2server(lo, net_type)
-
if not srvdb:
panic("no server for nid", lo)
else:
srv = Network(srvdb)
lctl.connect(srv)
+
+ def safe_to_clean(self):
+ # routes stay in place while the network is still configured
+ return not is_network_prepared()
+
def cleanup(self):
- for net_type, gw, lo, hi in self.db.get_route_tbl():
+ if is_network_prepared():
+ # the network is still being used, don't clean it up
+ return
+ for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
if net_type in ('tcp', 'toe') and local_net_type(net_type) and hi == '':
srvdb = self.db.nid2server(lo, net_type)
if not srvdb:
class LDLM(Module):
def __init__(self,db):
Module.__init__(self, 'LDLM', db)
+ self.add_lustre_module('obdclass', 'obdclass')
+ self.add_lustre_module('ptlrpc', 'ptlrpc')
self.add_lustre_module('ldlm', 'ldlm')
+
def prepare(self):
if is_prepared(self.uuid):
return
self.info()
lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid))
- def cleanup(self):
- if is_prepared(self.uuid):
- Module.cleanup(self)
-class PTLRPC(Module):
- def __init__(self,db):
- Module.__init__(self, 'PTLRPC', db)
- self.add_lustre_module('ptlrpc', 'ptlrpc')
- def prepare(self):
- if is_prepared(self.uuid):
- return
- self.info()
- lctl.newdev(attach="ptlrpc %s %s" % (self.name, self.uuid))
+ def safe_to_clean(self):
+ # safe only when ldlm would be the last configured device
+ out = lctl.device_list()
+ return len(out) <= 1
+
def cleanup(self):
if is_prepared(self.uuid):
Module.cleanup(self)
self.devlist = self.db.get_refs('obd')
self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
self.osclist = []
- self.mdc_uudi = ''
+ self.mdc_uuid = ''
for obd_uuid in self.devlist:
obd = self.db.lookup(obd_uuid)
osc = get_osc(obd, self.name)
return
for osc in self.osclist:
try:
- # Ignore connection failures, because the LOV will DTRT with
- # an unconnected OSC.
- osc.prepare(ignore_connect_failure=1)
- except CommandError:
+ # Only ignore connect failures with --force, which
+ # isn't implemented here yet.
+ osc.prepare(ignore_connect_failure=0)
+ except CommandError, e:
print "Error preparing OSC %s (inactive)\n" % osc.uuid
+ raise e
self.mdc_uuid = prepare_mdc(self.db, self.name, self.mds_uuid)
self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
self.stripe_off, self.pattern, self.devlist, self.mds_name)
Module.__init__(self, 'MDSDEV', db)
self.devpath = self.db.get_val('devpath','')
self.size = self.db.get_val_int('devsize', 0)
+ self.journal_size = self.db.get_val_int('journalsize', 0)
self.fstype = self.db.get_val('fstype', '')
# overwrite the orignal MDSDEV name and uuid with the MDS name and uuid
target_uuid = self.db.get_first_ref('target')
mds = self.db.lookup(target_uuid)
self.name = mds.getName()
self.lovconfig_uuids = mds.get_refs('lovconfig')
+ self.filesystem_uuids = mds.get_refs('filesystem')
# FIXME: if fstype not set, then determine based on kernel version
self.format = self.db.get_val('autoformat', "no")
-
- active_uuid = mds.get_active_target()
+ if mds.get_val('failover', 0):
+ self.failover_mds = 'f'
+ else:
+ self.failover_mds = ''
+ active_uuid = get_active_target(mds)
if not active_uuid:
panic("No target device found:", target_uuid)
if active_uuid == self.uuid:
self.active = 1
else:
self.active = 0
+ if self.active and config.group and config.group != ost.get_val('group'):
+ self.active = 0
+
self.target_dev_uuid = self.uuid
self.uuid = target_uuid
# modules
- if self.fstype == 'extN':
- self.add_lustre_module('extN', 'extN')
self.add_lustre_module('mds', 'mds')
if self.fstype:
self.add_lustre_module('obdclass', 'fsfilt_%s' % (self.fstype))
return
self.info(self.devpath, self.fstype, self.format)
run_acceptors()
- blkdev = block_dev(self.devpath, self.size, self.fstype, self.format)
+ blkdev = block_dev(self.devpath, self.size, self.fstype, self.format,
+ self.journal_size)
if not is_prepared('MDT_UUID'):
lctl.newdev(attach="mdt %s %s" % ('MDT', 'MDT_UUID'),
setup ="")
db = self.db.lookup(uuid)
lovconfig = LOVConfig(db)
lovconfig.prepare()
+ if config.mds_ost_conn:
+ for uuid in self.filesystem_uuids:
+ log("open clients for filesystem:", uuid)
+ fs = self.db.lookup(uuid)
+ obd_uuid = fs.get_first_ref('obd')
+ client = VOSC(self.db.lookup(obd_uuid), self.name)
+ client.prepare()
+
+ # NOTE(review): 'msd' looks like a typo for 'mds', but keep the name --
+ # safe_to_clean_modules() below calls it by this spelling.
+ def msd_remaining(self):
+ # returns 1 if any mds device is still configured (type is field 3
+ # of lctl device_list output); falls through to None otherwise
+ out = lctl.device_list()
+ for s in out:
+ if string.split(s)[2] in ('mds',):
+ return 1
+
+ def safe_to_clean(self):
+ # only the active target instance may clean its device
+ return self.active
+
+ def safe_to_clean_modules(self):
+ # keep modules loaded while any mds device remains
+ return not self.msd_remaining()
+
def cleanup(self):
- if is_prepared('MDT_UUID'):
+ if not self.active:
+ debug(self.uuid, "not active")
+ return
+ if is_prepared(self.uuid):
+ self.info()
try:
- lctl.cleanup("MDT", "MDT_UUID")
+ lctl.cleanup(self.name, self.uuid, config.force,
+ config.failover)
+ except CommandError, e:
+ log(self.module_name, "cleanup failed: ", self.name)
+ e.dump()
+ cleanup_error(e.rc)
+ Module.cleanup(self)
+ if config.mds_ost_conn:
+ for uuid in self.filesystem_uuids:
+ log("clean clients for filesystem:", uuid)
+ log("open clients for filesystem:", uuid)
+ fs = self.db.lookup(uuid)
+ obd_uuid = fs.get_first_ref('obd')
+ client = VOSC(self.db.lookup(obd_uuid), self.name)
+ client.cleanup()
+ if not self.msd_remaining() and is_prepared('MDT_UUID'):
+ try:
+ lctl.cleanup("MDT", "MDT_UUID", config.force,
+ config.failover)
except CommandError, e:
print "cleanup failed: ", self.name
e.dump()
cleanup_error(e.rc)
- if is_prepared(self.uuid):
- Module.cleanup(self)
clean_loop(self.devpath)
class OSD(Module):
self.osdtype = self.db.get_val('osdtype')
self.devpath = self.db.get_val('devpath', '')
self.size = self.db.get_val_int('devsize', 0)
+ self.journal_size = self.db.get_val_int('journalsize', 0)
self.fstype = self.db.get_val('fstype', '')
target_uuid = self.db.get_first_ref('target')
ost = self.db.lookup(target_uuid)
self.name = ost.getName()
- # FIXME: if fstype not set, then determine based on kernel version
self.format = self.db.get_val('autoformat', 'yes')
- if self.fstype == 'extN':
- self.add_lustre_module('extN', 'extN')
+ if ost.get_val('failover', 0):
+ self.failover_ost = 'f'
+ else:
+ self.failover_ost = ''
- active_uuid = ost.get_active_target()
+ active_uuid = get_active_target(ost)
if not active_uuid:
panic("No target device found:", target_uuid)
if active_uuid == self.uuid:
self.active = 1
else:
self.active = 0
+ if self.active and config.group and config.group != ost.get_val('group'):
+ self.active = 0
+
self.target_dev_uuid = self.uuid
self.uuid = target_uuid
# modules
self.add_lustre_module('ost', 'ost')
- self.add_lustre_module(self.osdtype, self.osdtype)
+ # FIXME: should we default to ext3 here?
if self.fstype:
self.add_lustre_module('obdclass' , 'fsfilt_%s' % (self.fstype))
+ self.add_lustre_module(self.osdtype, self.osdtype)
def load_module(self):
if self.active:
if not self.active:
debug(self.uuid, "not active")
return
- self.info(self.osdtype, self.devpath, self.size, self.fstype, self.format)
+ self.info(self.osdtype, self.devpath, self.size, self.fstype,
+ self.format, self.journal_size)
run_acceptors()
if self.osdtype == 'obdecho':
blkdev = ''
else:
- blkdev = block_dev(self.devpath, self.size, self.fstype, self.format)
+ blkdev = block_dev(self.devpath, self.size, self.fstype,
+ self.format, self.journal_size)
lctl.newdev(attach="%s %s %s" % (self.osdtype, self.name, self.uuid),
- setup ="%s %s" %(blkdev, self.fstype))
+ setup ="%s %s %s" %(blkdev, self.fstype,
+ self.failover_ost))
if not is_prepared('OSS_UUID'):
lctl.newdev(attach="ost %s %s" % ('OSS', 'OSS_UUID'),
setup ="")
+ def osd_remaining(self):
+ # returns 1 if any obdfilter/obdecho device is still configured (type
+ # is field 3 of lctl device_list output); falls through to None otherwise
+ out = lctl.device_list()
+ for s in out:
+ if string.split(s)[2] in ('obdfilter', 'obdecho'):
+ return 1
+
+ def safe_to_clean(self):
+ # only the active target instance may clean its device
+ return self.active
+
+ def safe_to_clean_modules(self):
+ # keep modules loaded while any osd device remains
+ return not self.osd_remaining()
+
def cleanup(self):
- if is_prepared('OSS_UUID'):
+ if not self.active:
+ debug(self.uuid, "not active")
+ return
+ if is_prepared(self.uuid):
+ self.info()
try:
- lctl.cleanup("OSS", "OSS_UUID")
+ lctl.cleanup(self.name, self.uuid, config.force,
+ config.failover)
+ except CommandError, e:
+ log(self.module_name, "cleanup failed: ", self.name)
+ e.dump()
+ cleanup_error(e.rc)
+ if not self.osd_remaining() and is_prepared('OSS_UUID'):
+ try:
+ lctl.cleanup("OSS", "OSS_UUID", config.force,
+ config.failover)
except CommandError, e:
print "cleanup failed: ", self.name
e.dump()
cleanup_error(e.rc)
- if is_prepared(self.uuid):
- Module.cleanup(self)
if not self.osdtype == 'obdecho':
clean_loop(self.devpath)
self.target_uuid = tgtdb.getUUID()
self.db = tgtdb
- self.tgt_dev_uuid = tgtdb.get_active_target()
+ self.tgt_dev_uuid = get_active_target(tgtdb)
if not self.tgt_dev_uuid:
panic("No target device found for target:", self.target_name)
self.module = module
self.module_name = string.upper(module)
- self.name = '%s_%s_%s' % (self.module_name, owner, self.target_name)
- self.uuid = '%05x%05x_%.14s_%05x%05x' % (int(random.random() * 1048576),
- int(random.random() * 1048576),self.name,
+ self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
+ self.target_name, owner)
+ self.uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
+ self.name,
int(random.random() * 1048576),
int(random.random() * 1048576))
self.uuid = self.uuid[0:36]
def lookup_server(self, srv_uuid):
""" Lookup a server's network information """
- self._server_nets = self.db.get_ost_net(srv_uuid)
+ self._server_nets = get_ost_net(self.db, srv_uuid)
if len(self._server_nets) == 0:
panic ("Unable to find a server for:", srv_uuid)
return self._server_nets
def prepare(self, ignore_connect_failure = 0):
- if is_prepared(self.uuid):
- return
self.info(self.target_uuid)
+ if is_prepared_name(self.name):
+ self.cleanup()
try:
- srv = local_net(self.get_servers())
+ srv = choose_local_server(self.get_servers())
if srv:
lctl.connect(srv)
else:
lctl.add_route_host(r[0], srv.uuid, r[1], r[2])
else:
panic ("no route to", self.target_uuid)
- except CommandError:
- if (ignore_connect_failure == 0):
- pass
+ except CommandError, e:
+ if not ignore_connect_failure:
+ raise e
if srv:
lctl.newdev(attach="%s %s %s" % (self.module, self.name, self.uuid),
setup ="%s %s" %(self.target_uuid, srv.uuid))
def cleanup(self):
- Module.cleanup(self)
- srv = local_net(self.get_servers())
- if srv:
+ if is_prepared_name(self.name):
+ Module.cleanup(self)
try:
- lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
+ srv = choose_local_server(self.get_servers())
+ if srv:
+ lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
+ else:
+ srv, r = find_route(self.get_servers())
+ if srv:
+ lctl.del_route_host(r[0], srv.uuid, r[1], r[2])
except CommandError, e:
- log(self.module_name, "disconnect failed: ", self.name)
+ log(self.module_name, "cleanup failed: ", self.name)
e.dump()
cleanup_error(e.rc)
- else:
- self.info(self.target_uuid)
- srv, r = find_route(self.get_servers())
- if srv:
- try:
- lctl.del_route_host(r[0], srv.uuid, r[1], r[2])
- except CommandError, e:
- print "del_route failed: ", self.name
- e.dump()
- cleanup_error(e.rc)
-
class MDC(Client):
def __init__(self,db):
Module.__init__(self, 'MTPT', db)
self.path = self.db.get_val('path')
- self.mds_uuid = self.db.get_first_ref('mds')
- self.obd_uuid = self.db.get_first_ref('obd')
+ self.fs_uuid = self.db.get_first_ref('filesystem')
+ fs = self.db.lookup(self.fs_uuid)
+ self.mds_uuid = fs.get_first_ref('mds')
+ self.obd_uuid = fs.get_first_ref('obd')
obd = self.db.lookup(self.obd_uuid)
self.vosc = VOSC(obd, self.name)
if self.vosc.need_mdc():
def prepare(self):
+ if fs_is_mounted(self.path):
+ log(self.path, "already mounted.")
+ return
self.vosc.prepare()
if self.vosc.need_mdc():
mdc_uuid = prepare_mdc(self.db, self.name, self.mds_uuid)
else:
mdc_uuid = self.vosc.get_mdc_uuid()
if not mdc_uuid:
+ self.vosc.cleanup()
panic("Unable to determine MDC UUID. Probably need to cleanup before re-mounting.")
self.info(self.path, self.mds_uuid, self.obd_uuid)
+ if config.lctl_dump:
+ cmd = "osc=%s,mdc=%s" % (self.vosc.get_uuid(), mdc_uuid)
+ lctl.mount_option(cmd)
+ return
cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \
(self.vosc.get_uuid(), mdc_uuid, self.path)
run("mkdir", self.path)
ret, val = run(cmd)
if ret:
- panic("mount failed:", self.path)
+ self.vosc.cleanup()
+ if self.vosc.need_mdc():
+ cleanup_mdc(self.db, self.name, self.mds_uuid)
+ panic("mount failed:", self.path, ":", string.join(val))
def cleanup(self):
self.info(self.path, self.mds_uuid,self.obd_uuid)
- if fs_is_mounted(self.path):
- if config.force():
+ if fs_is_mounted(self.path):
+ if config.force:
(rc, out) = run("umount", "-f", self.path)
else:
(rc, out) = run("umount", self.path)
# ============================================================
-# XML processing and query
-
-class LustreDB:
- def lookup(self, uuid):
- """ lookup returns a new LustreDB instance"""
- return self._lookup_by_uuid(uuid)
-
- def lookup_name(self, name, class_name = ""):
- """ lookup returns a new LustreDB instance"""
- return self._lookup_by_name(name, class_name)
-
- def lookup_class(self, class_name):
- """ lookup returns a new LustreDB instance"""
- return self._lookup_by_class(class_name)
-
- def get_val(self, tag, default=None):
- v = self._get_val(tag)
- if v:
- return v
- if default != None:
- return default
- debug("LustreDB", self.getName(), " no value for:", tag)
- return None
-
- def get_class(self):
- return self._get_class()
-
- def get_val_int(self, tag, default=0):
- str = self._get_val(tag)
- try:
- if str:
- return int(str)
- return default
- except ValueError:
- panic("text value is not integer:", str)
-
- def get_first_ref(self, tag):
- """ Get the first uuidref of the type TAG. Only
- one is expected. Returns the uuid."""
- uuids = self._get_refs(tag)
- if len(uuids) > 0:
- return uuids[0]
- return None
-
- def get_refs(self, tag):
- """ Get all the refs of type TAG. Returns list of uuids. """
- uuids = self._get_refs(tag)
- return uuids
-
- def get_all_refs(self):
- """ Get all the refs. Returns list of uuids. """
- uuids = self._get_all_refs()
- return uuids
-
- def get_ost_net(self, osd_uuid):
- srv_list = []
- if not osd_uuid:
- return srv_list
- osd = self.lookup(osd_uuid)
- node_uuid = osd.get_first_ref('node')
- node = self.lookup(node_uuid)
- if not node:
- panic("unable to find node for osd_uuid:", osd_uuid,
- " node_ref:", node_uuid)
- for net_uuid in node.get_networks():
- db = node.lookup(net_uuid)
- srv_list.append(Network(db))
- return srv_list
+# misc query functions
- def nid2server(self, nid, net_type):
- netlist = self.lookup_class('network')
- for net_db in netlist:
- if net_db.get_val('nid') == nid and net_db.get_val('nettype') == net_type:
- return net_db
- return None
-
- # the tag name is the service type
- # fixme: this should do some checks to make sure the dom_node is a service
- #
- # determine what "level" a particular node is at.
-
- # the order of iniitailization is based on level.
- def getServiceLevel(self):
- type = self.get_class()
- ret=0;
- if type in ('network',):
- ret = 5
- elif type in ('routetbl',):
- ret = 6
- elif type in ('ptlrpc',):
- ret = 7
- elif type in ('device', 'ldlm'):
- ret = 20
- elif type in ('osd', 'mdd', 'cobd'):
- ret = 30
- elif type in ('mdsdev','ost'):
- ret = 40
- elif type in ('mdc','osc'):
- ret = 50
- elif type in ('lov',):
- ret = 60
- elif type in ('mountpoint', 'echoclient'):
- ret = 70
-
- if ret < config.minlevel() or ret > config.maxlevel():
- ret = 0
- return ret
-
- #
- # return list of services in a profile. list is a list of tuples
- # [(level, db_object),]
- def getServices(self):
- list = []
- for ref_class, ref_uuid in self.get_all_refs():
- servdb = self.lookup(ref_uuid)
- if servdb:
- level = servdb.getServiceLevel()
- if level > 0:
- list.append((level, servdb))
- else:
- panic('service not found: ' + ref_uuid)
-
- list.sort()
- return list
-
- # Find the target_device for target on a node
- # node->profiles->device_refs->target
- def get_target_device(self, target_uuid, node_name):
- node_db = self.lookup_name(node_name)
- if not node_db:
- return None
- prof_list = node_db.get_refs('profile')
- for prof_uuid in prof_list:
- prof_db = node_db.lookup(prof_uuid)
- ref_list = prof_db.get_all_refs()
- for ref in ref_list:
- dev = self.lookup(ref[1])
- if dev and dev.get_first_ref('target') == target_uuid:
- return ref[1]
- return None
-
- def get_active_target(self):
- target_uuid = self.getUUID()
- target_name = self.getName()
- node_name = config.select(target_name)
- if node_name:
- tgt_dev_uuid = self.get_target_device(target_uuid, node_name)
- else:
- tgt_dev_uuid = self.get_first_ref('active')
- return tgt_dev_uuid
-
-
- # get all network uuids for this node
- def get_networks(self):
- ret = []
- prof_list = self.get_refs('profile')
- for prof_uuid in prof_list:
- prof_db = self.lookup(prof_uuid)
- net_list = prof_db.get_refs('network')
- #debug("get_networks():", prof_uuid, net_list)
- for net_uuid in net_list:
- ret.append(net_uuid)
- return ret
-
-class LustreDB_XML(LustreDB):
- def __init__(self, dom, root_node):
- # init xmlfile
- self.dom_node = dom
- self.root_node = root_node
-
- def xmltext(self, dom_node, tag):
- list = dom_node.getElementsByTagName(tag)
- if len(list) > 0:
- dom_node = list[0]
- dom_node.normalize()
- if dom_node.firstChild:
- txt = string.strip(dom_node.firstChild.data)
- if txt:
- return txt
-
- def xmlattr(self, dom_node, attr):
- return dom_node.getAttribute(attr)
-
- def _get_val(self, tag):
- """a value could be an attribute of the current node
- or the text value in a child node"""
- ret = self.xmlattr(self.dom_node, tag)
- if not ret:
- ret = self.xmltext(self.dom_node, tag)
- return ret
-
- def _get_class(self):
- return self.dom_node.nodeName
-
- #
- # [(ref_class, ref_uuid),]
- def _get_all_refs(self):
- list = []
- for n in self.dom_node.childNodes:
- if n.nodeType == n.ELEMENT_NODE:
- ref_uuid = self.xml_get_ref(n)
- ref_class = n.nodeName
- list.append((ref_class, ref_uuid))
-
- list.sort()
- return list
-
- def _get_refs(self, tag):
- """ Get all the refs of type TAG. Returns list of uuids. """
- uuids = []
- refname = '%s_ref' % tag
- reflist = self.dom_node.getElementsByTagName(refname)
- for r in reflist:
- uuids.append(self.xml_get_ref(r))
- return uuids
-
- def xmllookup_by_uuid(self, dom_node, uuid):
- for n in dom_node.childNodes:
- if n.nodeType == n.ELEMENT_NODE:
- if self.xml_get_uuid(n) == uuid:
- return n
- else:
- n = self.xmllookup_by_uuid(n, uuid)
- if n: return n
- return None
-
- def _lookup_by_uuid(self, uuid):
- dom = self. xmllookup_by_uuid(self.root_node, uuid)
- if dom:
- return LustreDB_XML(dom, self.root_node)
-
- def xmllookup_by_name(self, dom_node, name):
- for n in dom_node.childNodes:
- if n.nodeType == n.ELEMENT_NODE:
- if self.xml_get_name(n) == name:
- return n
- else:
- n = self.xmllookup_by_name(n, name)
- if n: return n
- return None
-
- def _lookup_by_name(self, name, class_name):
- dom = self.xmllookup_by_name(self.root_node, name)
- if dom:
- return LustreDB_XML(dom, self.root_node)
-
- def xmllookup_by_class(self, dom_node, class_name):
- return dom_node.getElementsByTagName(class_name)
-
- def _lookup_by_class(self, class_name):
- ret = []
- domlist = self.xmllookup_by_class(self.root_node, class_name)
- for node in domlist:
- ret.append(LustreDB_XML(node, self.root_node))
- return ret
-
- def xml_get_name(self, n):
- return n.getAttribute('name')
-
- def getName(self):
- return self.xml_get_name(self.dom_node)
-
- def xml_get_ref(self, n):
- return n.getAttribute('uuidref')
-
- def xml_get_uuid(self, dom_node):
- return dom_node.getAttribute('uuid')
-
- def getUUID(self):
- return self.xml_get_uuid(self.dom_node)
-
- def get_routes(self, type, gw):
- """ Return the routes as a list of tuples of the form:
- [(type, gw, lo, hi),]"""
- res = []
- tbl = self.dom_node.getElementsByTagName('routetbl')
- for t in tbl:
- routes = t.getElementsByTagName('route')
- for r in routes:
- net_type = self.xmlattr(r, 'type')
- if type != net_type:
- lo = self.xmlattr(r, 'lo')
- hi = self.xmlattr(r, 'hi')
- res.append((type, gw, lo, hi))
- return res
-
- def get_route_tbl(self):
- ret = []
- for r in self.dom_node.getElementsByTagName('route'):
- net_type = self.xmlattr(r, 'type')
- gw = self.xmlattr(r, 'gw')
- lo = self.xmlattr(r, 'lo')
- hi = self.xmlattr(r, 'hi')
- ret.append((net_type, gw, lo, hi))
- return ret
-
-
-# ================================================================
-# LDAP Support
-class LustreDB_LDAP(LustreDB):
- def __init__(self, name, attrs,
- base = "fs=lustre",
- parent = None,
- url = "ldap://localhost",
- user = "cn=Manager, fs=lustre",
- pw = "secret"
- ):
- self._name = name
- self._attrs = attrs
- self._base = base
- self._parent = parent
- self._url = url
- self._user = user
- self._pw = pw
- if parent:
- self.l = parent.l
- self._base = parent._base
- else:
- self.open()
-
- def open(self):
- import ldap
- try:
- self.l = ldap.initialize(self._url)
- # Set LDAP protocol version used
- self.l.protocol_version=ldap.VERSION3
- # user and pw only needed if modifying db
- self.l.bind_s("", "", ldap.AUTH_SIMPLE);
- except ldap.LDAPError, e:
- panic(e)
- # FIXME, do something useful here
-
- def close(self):
- self.l.unbind_s()
-
- def ldap_search(self, filter):
- """Return list of uuids matching the filter."""
- import ldap
- dn = self._base
- ret = []
- uuids = []
- try:
- for name, attrs in self.l.search_s(dn, ldap.SCOPE_ONELEVEL,
- filter, ["uuid"]):
- for v in attrs['uuid']:
- uuids.append(v)
- except ldap.NO_SUCH_OBJECT, e:
- pass
- except ldap.LDAPError, e:
- print e # FIXME: die here?
- if len(uuids) > 0:
- for uuid in uuids:
- ret.append(self._lookup_by_uuid(uuid))
- return ret
-
- def _lookup_by_name(self, name, class_name):
- list = self.ldap_search("lustreName=%s" %(name))
- if len(list) == 1:
- return list[0]
- return []
-
- def _lookup_by_class(self, class_name):
- return self.ldap_search("objectclass=%s" %(string.upper(class_name)))
-
- def _lookup_by_uuid(self, uuid):
- import ldap
- dn = "uuid=%s,%s" % (uuid, self._base)
- ret = None
- try:
- for name, attrs in self.l.search_s(dn, ldap.SCOPE_BASE,
- "objectclass=*"):
- ret = LustreDB_LDAP(name, attrs, parent = self)
-
- except ldap.NO_SUCH_OBJECT, e:
- debug("NO_SUCH_OBJECT:", uuid)
- pass # just return empty list
- except ldap.LDAPError, e:
- print e # FIXME: die here?
- return ret
+def get_ost_net(self, osd_uuid):
+ srv_list = []
+ if not osd_uuid:
+ return srv_list
+ osd = self.lookup(osd_uuid)
+ node_uuid = osd.get_first_ref('node')
+ node = self.lookup(node_uuid)
+ if not node:
+ panic("unable to find node for osd_uuid:", osd_uuid,
+ " node_ref:", node_uuid)
+ for net_uuid in node.get_networks():
+ db = node.lookup(net_uuid)
+ srv_list.append(Network(db))
+ return srv_list
+
+
+# the order of initialization is based on level.
+def getServiceLevel(self):
+ type = self.get_class()
+ ret=0;
+ if type in ('network',):
+ ret = 5
+ elif type in ('routetbl',):
+ ret = 6
+ elif type in ('ldlm',):
+ ret = 20
+ elif type in ('osd', 'cobd'):
+ ret = 30
+ elif type in ('mdsdev',):
+ ret = 40
+ elif type in ('mountpoint', 'echoclient'):
+ ret = 70
+ else:
+ panic("Unknown type: ", type)
+ if ret < config.minlevel or ret > config.maxlevel:
+ ret = 0
+ return ret
- def _get_val(self, k):
- ret = None
- if self._attrs.has_key(k):
- v = self._attrs[k]
- if type(v) == types.ListType:
- ret = str(v[0])
+#
+# return list of services in a profile. list is a list of tuples
+# [(level, db_object),]
+def getServices(self):
+ list = []
+ for ref_class, ref_uuid in self.get_all_refs():
+ servdb = self.lookup(ref_uuid)
+ if servdb:
+ level = getServiceLevel(servdb)
+ if level > 0:
+ list.append((level, servdb))
else:
- ret = str(v)
- return ret
-
- def _get_class(self):
- return string.lower(self._attrs['objectClass'][0])
-
- #
- # [(ref_class, ref_uuid),]
- def _get_all_refs(self):
- list = []
- for k in self._attrs.keys():
- if re.search('.*Ref', k):
- for uuid in self._attrs[k]:
- list.append((k, uuid))
- return list
+ panic('service not found: ' + ref_uuid)
- def _get_refs(self, tag):
- """ Get all the refs of type TAG. Returns list of uuids. """
- uuids = []
- refname = '%sRef' % tag
- if self._attrs.has_key(refname):
- return self._attrs[refname]
- return []
+ list.sort()
+ return list
- def getName(self):
- return self._get_val('lustreName')
-
- def getUUID(self):
- return self._get_val('uuid')
-
- def get_route_tbl(self):
- return []
############################################################
# MDC UUID hack -
############################################################
# routing ("rooting")
-#
-routes = []
-local_node = []
-router_flag = 0
-def add_local_interfaces(node_db):
- global local_node
+# list of (nettype, cluster_id)
+local_clusters = []
+
+def find_local_clusters(node_db):
+ global local_clusters
for netuuid in node_db.get_networks():
net = node_db.lookup(netuuid)
srv = Network(net)
debug("add_local", netuuid)
- local_node.append((srv.net_type, srv.nid))
- if acceptors.has_key(srv.port):
- panic("duplicate port:", srv.port)
- if srv.net_type in ('tcp', 'toe'):
+ local_clusters.append((srv.net_type, srv.cluster_id))
+ if srv.port > 0:
+ if acceptors.has_key(srv.port):
+ panic("duplicate port:", srv.port)
acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
srv.send_mem, srv.recv_mem,
srv.irq_affinity,
srv.nid_exchange)
+# This node is a gateway.
+is_router = 0
+def node_is_router():
+ return is_router
+
+# If there are any routers found in the config, then this will be true
+# and all nodes will load kptlrouter.
+needs_router = 0
def node_needs_router():
- return router_flag
+ return needs_router or is_router
+
+# list of (nettype, gw, tgt_cluster_id, lo, hi)
+# Currently, these local routes are only added to kptlrouter route
+# table if they are needed to connect to a specific server. This
+# should be changed so all available routes are loaded, and the
+# ptlrouter can make all the decisions.
+local_routes = []
-def init_route_config(lustre):
- """ Scan the lustre config looking for routers. Build list of
+def find_local_routes(lustre):
+    """ Scan the lustre config looking for routers. Build list of
routes. """
- global routes, router_flag
- routes = []
+ global local_routes, needs_router
+ local_routes = []
list = lustre.lookup_class('node')
- for node_db in list:
- if node_db.get_val_int('router', 0):
- router_flag = 1
- #debug("init_route_config: found router", node_db.getName())
- for (local_type, local_nid) in local_node:
- #debug("init_route_config:", local_type, local_nid)
+ for router in list:
+ if router.get_val_int('router', 0):
+ needs_router = 1
+ for (local_type, local_cluster_id) in local_clusters:
gw = None
- for netuuid in node_db.get_networks():
- db = node_db.lookup(netuuid)
- if local_type == db.get_val('nettype'):
+ for netuuid in router.get_networks():
+ db = router.lookup(netuuid)
+ if (local_type == db.get_val('nettype') and
+ local_cluster_id == db.get_val('clusterid')):
gw = db.get_val('nid')
break
- #debug("init_route_config: gw is", gw)
- if not gw:
- continue
- for route in node_db.get_routes(local_type, gw):
- routes.append(route)
- debug("init_route_config routes:", routes)
-
-
-def local_net(srv_list):
- global local_node
- for iface in local_node:
- for srv in srv_list:
- #debug("local_net a:", srv.net_type, "b:", iface[0])
- if srv.net_type == iface[0]:
- return srv
- return None
+ if gw:
+ debug("find_local_routes: gw is", gw)
+ for route in router.get_local_routes(local_type, gw):
+ local_routes.append(route)
+ debug("find_local_routes:", local_routes)
+
+
+def choose_local_server(srv_list):
+ for srv in srv_list:
+ if local_net_type(srv.net_type):
+ return srv
def local_net_type(net_type):
- global local_node
- for iface in local_node:
- if net_type == iface[0]:
+ for cluster in local_clusters:
+ if net_type == cluster[0]:
return 1
return 0
def find_route(srv_list):
- global local_node, routes
- frm_type = local_node[0][0]
+ frm_type = local_clusters[0][0]
for srv in srv_list:
- #debug("find_route: srv:", srv.hostaddr, "type: ", srv.net_type)
+ debug("find_route: srv:", srv.hostaddr, "type: ", srv.net_type)
to_type = srv.net_type
- to = srv.hostaddr
- #debug ('looking for route to', to_type, to)
- for r in routes:
- #debug("find_route: ", r)
- if r[2] == to:
+ to = srv.hostaddr # XXX should this be hostaddr, or nid?
+ cluster_id = srv.cluster_id
+ debug ('looking for route to', to_type, to)
+ for r in local_routes:
+ debug("find_route: ", r)
+ if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
return srv, r
return None,None
+def get_active_target(db):
+ target_uuid = db.getUUID()
+ target_name = db.getName()
+ node_name = get_select(target_name)
+ if node_name:
+ tgt_dev_uuid = db.get_target_device(target_uuid, node_name)
+ else:
+ tgt_dev_uuid = db.get_first_ref('active')
+ return tgt_dev_uuid
+
############################################################
# lconf level logic
n = None
if type == 'ldlm':
n = LDLM(db)
- elif type == 'ptlrpc':
- n = PTLRPC(db)
elif type == 'lov':
n = LOV(db)
elif type == 'network':
n = Network(db)
elif type == 'routetbl':
- n = Router(db)
+ n = RouteTable(db)
elif type == 'osd':
n = OSD(db)
elif type == 'cobd':
prof_db = db.lookup(prof_uuid)
if not prof_db:
panic("profile:", profile, "not found.")
- services = prof_db.getServices()
+ services = getServices(prof_db)
operation(services)
def doSetup(services):
- if config.nosetup():
+ if config.nosetup:
return
for s in services:
n = newService(s[1])
n.prepare()
def doModules(services):
- if config.nomod():
+ if config.nomod:
return
for s in services:
n = newService(s[1])
n.load_module()
def doCleanup(services):
- if config.nosetup():
+ if config.nosetup:
return
services.reverse()
for s in services:
n = newService(s[1])
- n.cleanup()
+ if n.safe_to_clean():
+ n.cleanup()
def doUnloadModules(services):
- if config.nomod():
+ if config.nomod:
return
services.reverse()
for s in services:
n = newService(s[1])
- n.cleanup_module()
+ if n.safe_to_clean_modules():
+ n.cleanup_module()
#
# Load profile for
def doHost(lustreDB, hosts):
- global routes
- global router_flag
+ global is_router
node_db = None
for h in hosts:
node_db = lustreDB.lookup_name(h, 'node')
print 'No host entry found.'
return
- router_flag = node_db.get_val_int('router', 0)
- recovery_upcall = node_db.get_val('recovery_upcall', '')
+ is_router = node_db.get_val_int('router', 0)
+ lustre_upcall = node_db.get_val('lustreUpcall', '')
+ portals_upcall = node_db.get_val('portalsUpcall', '')
timeout = node_db.get_val_int('timeout', 0)
- add_local_interfaces(node_db)
- if not router_flag:
- init_route_config(lustreDB)
+ find_local_clusters(node_db)
+ if not is_router:
+ find_local_routes(lustreDB)
# Two step process: (1) load modules, (2) setup lustre
# if not cleaning, load modules first.
prof_list = node_db.get_refs('profile')
- if config.cleanup():
- if config.force():
+ if config.recover:
+ if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
+ raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
+ "--client_uuid <UUID> --conn_uuid <UUID>")
+ doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
+ config.conn_uuid)
+ elif config.cleanup:
+ if config.force:
# the command line can override this value
timeout = 5
# ugly hack, only need to run lctl commands for --dump
- if config.lctl_dump():
+ if config.lctl_dump:
for_each_profile(node_db, prof_list, doCleanup)
return
sys_set_timeout(timeout)
- sys_set_recovery_upcall(recovery_upcall)
+ sys_set_ptldebug()
+ sys_set_subsystem()
+ sys_set_lustre_upcall(lustre_upcall)
+ sys_set_portals_upcall(portals_upcall)
for_each_profile(node_db, prof_list, doCleanup)
for_each_profile(node_db, prof_list, doUnloadModules)
else:
# ugly hack, only need to run lctl commands for --dump
- if config.lctl_dump():
+ if config.lctl_dump:
for_each_profile(node_db, prof_list, doSetup)
return
+ sys_make_devices()
+ sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
+ sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
+
for_each_profile(node_db, prof_list, doModules)
sys_set_debug_path()
- script = config.gdb_script()
+ sys_set_ptldebug()
+ sys_set_subsystem()
+ script = config.gdb_script
run(lctl.lctl, ' modules >', script)
- if config.gdb():
+ if config.gdb:
log ("The GDB module script is in", script)
# pause, so user has time to break and
# load the script
time.sleep(5)
sys_set_timeout(timeout)
- sys_set_recovery_upcall(recovery_upcall)
+ sys_set_lustre_upcall(lustre_upcall)
+ sys_set_portals_upcall(portals_upcall)
for_each_profile(node_db, prof_list, doSetup)
-############################################################
-# Command line processing
-#
-def parse_cmdline(argv):
- short_opts = "hdnvf"
- long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb",
- "portals=", "makeldiff", "cleanup", "noexec",
- "help", "node=", "nomod", "nosetup",
- "dump=", "force", "minlevel=", "maxlevel=",
- "timeout=", "recovery_upcall=",
- "ldapurl=", "config=", "select=", "lctl_dump="]
- opts = []
- args = []
+def doRecovery(db, lctl, tgt_uuid, client_uuid, conn_uuid):
+ tgt = db.lookup(tgt_uuid)
+ if not tgt:
+ raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
+ new_uuid = get_active_target(tgt)
+ if not new_uuid:
+ raise Lustre.LconfError("doRecovery: no active target found for: " +
+ tgt_uuid)
+ net = choose_local_server(get_ost_net(db, new_uuid))
+ if not net:
+ raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
+ # XXX, better to do a full disconnect here
+ log("Reconnecting", tgt_uuid, " to ", net.uuid);
+ lctl.del_uuid(conn_uuid)
+ lctl.connect(net)
+ lctl.recover(client_uuid, net.uuid)
- try:
- opts, args = getopt.getopt(argv, short_opts, long_opts)
- except getopt.error:
- print "invalid opt"
- usage()
-
- for o, a in opts:
- if o in ("-h", "--help"):
- usage()
- if o in ("-d","--cleanup"):
- config.cleanup(1)
- if o in ("-v", "--verbose"):
- config.verbose(1)
- if o in ("-n", "--noexec"):
- config.noexec(1)
- if o == "--portals":
- config.portals_dir(a)
- if o == "--lustre":
- config.lustre_dir(a)
- if o == "--reformat":
- config.reformat(1)
- if o == "--node":
- config.node(a)
- if o == "--gdb":
- config.gdb(1)
- if o == "--nomod":
- config.nomod(1)
- if o == "--nosetup":
- config.nosetup(1)
- if o == "--dump":
- config.dump_file(a)
- if o in ("-f", "--force"):
- config.force(1)
- if o == "--minlevel":
- config.minlevel(a)
- if o == "--maxlevel":
- config.maxlevel(a)
- if o == "--timeout":
- config.timeout(a)
- if o == "--recovery_upcall":
- config.recovery_upcall(a)
- if o == "--ldapurl":
- config.ldapurl(a)
- if o == "--config":
- config.config_name(a)
- if o == "--select":
- config.init_select(a)
- if o == "--lctl_dump":
- config.lctl_dump(a)
-
- return args
-
-def fetch(url):
- import urllib
- data = ""
- try:
- s = urllib.urlopen(url)
- data = s.read()
- except:
- usage()
- return data
def setupModulePath(cmd, portals_dir = PORTALS_DIR):
base = os.path.dirname(cmd)
- if os.access(base+"/Makefile", os.R_OK):
- if not config.lustre_dir():
- config.lustre_dir(os.path.join(base, ".."))
+ if development_mode():
+ if not config.lustre:
+ config.lustre = (os.path.join(base, ".."))
# normalize the portals dir, using command line arg if set
- if config.portals_dir():
- portals_dir = config.portals_dir()
- dir = os.path.join(config.lustre_dir(), portals_dir)
- config.portals_dir(dir)
- elif config.lustre_dir() and config.portals_dir():
+ if config.portals:
+ portals_dir = config.portals
+ dir = os.path.join(config.lustre, portals_dir)
+ config.portals = dir
+ debug('config.portals', config.portals)
+ elif config.lustre and config.portals:
# production mode
# if --lustre and --portals, normalize portals
# can ignore POTRALS_DIR here, since it is probly useless here
- dir = config.portals_dir()
- dir = os.path.join(config.lustre_dir(), dir)
- config.portals_dir(dir)
+ config.portals = os.path.join(config.lustre, config.portals)
+ debug('config.portals B', config.portals)
def sysctl(path, val):
- if config.noexec():
+ debug("+ sysctl", path, val)
+ if config.noexec:
return
try:
fp = open(os.path.join('/proc/sys', path), 'w')
fp.write(str(val))
fp.close()
except IOError, e:
- print e
+ panic(str(e))
def sys_set_debug_path():
- debug("debug path: ", config.debug_path())
- sysctl('portals/debug_path', config.debug_path())
+ sysctl('portals/debug_path', config.debug_path)
-def sys_set_recovery_upcall(upcall):
+def sys_set_lustre_upcall(upcall):
# the command overrides the value in the node config
- if config.recovery_upcall():
- upcall = config.recovery_upcall()
+ if config.lustre_upcall:
+ upcall = config.lustre_upcall
+ elif config.upcall:
+ upcall = config.upcall
if upcall:
- debug("setting recovery_upcall:", upcall)
- sysctl('lustre/recovery_upcall', upcall)
+ sysctl('lustre/upcall', upcall)
+
+def sys_set_portals_upcall(upcall):
+ # the command overrides the value in the node config
+ if config.portals_upcall:
+ upcall = config.portals_upcall
+ elif config.upcall:
+ upcall = config.upcall
+ if upcall:
+ sysctl('portals/upcall', upcall)
def sys_set_timeout(timeout):
# the command overrides the value in the node config
- if config.timeout() > 0:
- timeout = config.timeout()
- if timeout > 0:
- debug("setting timeout:", timeout)
+ if config.timeout > 0:
+ timeout = config.timeout
+ if timeout != None and timeout > 0:
sysctl('lustre/timeout', timeout)
-def sys_set_ptldebug(ptldebug):
- # the command overrides the value in the node config
- if config.ptldebug():
- ptldebug = config.ptldebug()
- sysctl('portals/debug', ptldebug)
+def sys_set_ptldebug():
+ if config.ptldebug != None:
+ try:
+ val = eval(config.ptldebug, ptldebug_names)
+ val = "0x%x" % (val,)
+ sysctl('portals/debug', val)
+ except NameError, e:
+ panic(str(e))
+
+def sys_set_subsystem():
+    # Apply the --subsystem debug mask to portals.
+    # Bug fix: this previously evaluated config.ptldebug against
+    # ptldebug_names (copy-paste from sys_set_ptldebug), so --subsystem
+    # silently wrote the ptldebug value into subsystem_debug. The
+    # subsystem option must be resolved against subsystem_names.
+    if config.subsystem != None:
+        try:
+            # eval() lets users combine names, e.g. "mds + ost"; the
+            # names dict restricts the namespace to known subsystems.
+            val = eval(config.subsystem, subsystem_names)
+            val = "0x%x" % (val,)
+            sysctl('portals/subsystem_debug', val)
+        except NameError, e:
+            panic(str(e))
def sys_set_netmem_max(path, max):
debug("setting", path, "to at least", max)
- if config.noexec():
+ if config.noexec:
return
fp = open(path)
str = fp.readline()
return
os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
+def default_debug_path():
+ path = '/tmp/lustre-log'
+ if os.path.isdir('/r'):
+ return '/r' + path
+ else:
+ return path
+
+def default_gdb_script():
+ script = '/tmp/ogdb'
+ if os.path.isdir('/r'):
+ return '/r' + script
+ else:
+ return script
+
DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
# ensure basic elements are in the system path
for dir in DEFAULT_PATH:
add_to_path(dir)
-# Initialize or shutdown lustre according to a configuration file
-# * prepare the system for lustre
-# * configure devices with lctl
-# Shutdown does steps in reverse
-#
+# global hack for the --select handling
+tgt_select = {}
+def init_select(arg):
+ # arg = "service=nodeA,service2=nodeB"
+ global tgt_select
+ list = string.split(arg, ',')
+ for entry in list:
+ srv, node = string.split(entry, '=')
+ tgt_select[srv] = node
+
+def get_select(srv):
+ if tgt_select.has_key(srv):
+ return tgt_select[srv]
+ return None
+
+
+PARAM = Lustre.Options.PARAM
+INTPARAM = Lustre.Options.INTPARAM
+lconf_options = [
+ ('verbose,v', "Print system commands as they are run"),
+ ('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM),
+ ('config', "Cluster config name used for LDAP query", PARAM),
+ ('select', "service=nodeA,service2=nodeB ", PARAM),
+ ('node', "Load config for <nodename>", PARAM),
+ ('cleanup,d', "Cleans up config. (Shutdown)"),
+ ('force,f', "Forced unmounting and/or obd detach during cleanup",
+ Lustre.Options.FLAG, 0),
+ ('mds_ost_conn', "Open connections to OSTs on the MDS"),
+ ('failover',"""Used to shut down without saving state.
+ This will allow this node to "give up" a service to a
+ another node for failover purposes. This will not
+ be a clean shutdown.""",
+ Lustre.Options.FLAG, 0),
+ ('gdb', """Prints message after creating gdb module script
+ and sleeps for 5 seconds."""),
+ ('noexec,n', """Prints the commands and steps that will be run for a
+ config without executing them. This can used to check if a
+ config file is doing what it should be doing"""),
+ ('nomod', "Skip load/unload module step."),
+ ('nosetup', "Skip device setup/cleanup step."),
+ ('reformat', "Reformat all devices (without question)"),
+ ('dump', "Dump the kernel debug log to file before portals is unloaded",
+ PARAM),
+ ('minlevel', "Minimum level of services to configure/cleanup",
+ INTPARAM, 0),
+ ('maxlevel', """Maximum level of services to configure/cleanup
+ Levels are aproximatly like:
+ 10 - network
+ 20 - device, ldlm
+ 30 - osd, mdd
+ 40 - mds, ost
+ 70 - mountpoint, echo_client, osc, mdc, lov""",
+ INTPARAM, 100),
+ ('lustre', """Base directory of lustre sources. This parameter will
+ cause lconf to load modules from a source tree.""", PARAM),
+ ('portals', """Portals source directory. If this is a relative path,
+ then it is assumed to be relative to lustre. """, PARAM),
+ ('timeout', "Set recovery timeout", PARAM),
+ ('upcall', "Set both portals and lustre upcall script", PARAM),
+ ('lustre_upcall', "Set lustre upcall script", PARAM),
+ ('portals_upcall', "Set portals upcall script", PARAM),
+ ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
+ ('ptldebug', "Set the portals debug level", PARAM),
+ ('subsystem', "Set the portals debug subsystem", PARAM),
+ ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
+ ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
+# Client recovery options
+ ('recover', "Recover a device"),
+ ('group', "The group of devices to configure or cleanup", PARAM),
+ ('tgt_uuid', "The failed target (required for recovery)", PARAM),
+ ('client_uuid', "The failed client (required for recovery)", PARAM),
+ ('conn_uuid', "The failed connection (required for recovery)", PARAM),
+ ]
+
def main():
- global lctl, MAXTCPBUF
+ global lctl, config
+
+ # in the upcall this is set to SIG_IGN
+ signal.signal(signal.SIGCHLD, signal.SIG_DFL)
+
+ cl = Lustre.Options("lconf", "config.xml", lconf_options)
+ try:
+ config, args = cl.parse(sys.argv[1:])
+ except Lustre.OptionError, e:
+ print e
+ sys.exit(1)
+
+ setupModulePath(sys.argv[0])
host = socket.gethostname()
sanitise_path()
- args = parse_cmdline(sys.argv[1:])
if len(args) > 0:
if not os.access(args[0], os.R_OK):
print 'File not found or readable:', args[0]
except Exception:
panic("%s does not appear to be a config file." % (args[0]))
sys.exit(1) # make sure to die here, even in debug mode.
- db = LustreDB_XML(dom.documentElement, dom.documentElement)
- elif config.ldapurl():
- if not config.config_name():
+ db = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
+ elif config.ldapurl:
+ if not config.config:
panic("--ldapurl requires --config name")
- dn = "config=%s,fs=lustre" % (config.config_name())
- db = LustreDB_LDAP('', {}, base=dn, url = config.ldapurl())
+ dn = "config=%s,fs=lustre" % (config.config)
+ db = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
else:
- usage()
+ cl.usage()
+ sys.exit(1)
+
+ ver = db.get_version()
+ if not ver:
+ panic("No version found in config data, please recreate.")
+ if ver != Lustre.CONFIG_VERSION:
+ panic("Config version", ver, "does not match lconf version",
+ Lustre.CONFIG_VERSION)
node_list = []
- if config.node():
- node_list.append(config.node())
+ if config.node:
+ node_list.append(config.node)
else:
if len(host) > 0:
node_list.append(host)
node_list.append('localhost')
+
debug("configuring for host: ", node_list)
if len(host) > 0:
- config._debug_path = config._debug_path + '-' + host
- config._gdb_script = config._gdb_script + '-' + host
-
- setupModulePath(sys.argv[0])
+ config.debug_path = config.debug_path + '-' + host
+ config.gdb_script = config.gdb_script + '-' + host
lctl = LCTLInterface('lctl')
- if config.lctl_dump():
- lctl.use_save_file(config.lctl_dump())
- else:
- sys_make_devices()
- sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
- sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
+
+ if config.lctl_dump:
+ lctl.use_save_file(config.lctl_dump)
doHost(db, node_list)
if __name__ == "__main__":
try:
main()
- except LconfError, e:
+ except Lustre.LconfError, e:
print e
except CommandError, e:
e.dump()
if first_cleanup_error:
sys.exit(first_cleanup_error)
-