#!/usr/bin/env python
#
-# Copyright (C) 2002 Cluster File Systems, Inc.
-# Author: Robert Read <rread@clusterfs.com>
+# Copyright (C) 2002-2003 Cluster File Systems, Inc.
+# Authors: Robert Read <rread@clusterfs.com>
+# Mike Shaver <shaver@clusterfs.com>
# This file is part of Lustre, http://www.lustre.org.
#
# Lustre is free software; you can redistribute it and/or
#
# Based in part on the XML obdctl modifications done by Brian Behlendorf
-import sys, getopt
-import string, os, stat, popen2, socket, time
-import re, exceptions
+import sys, getopt, types
+import string, os, stat, popen2, socket, time, random, fcntl, select
+import re, exceptions, signal, traceback
import xml.dom.minidom
+if sys.version[0] == '1':
+ from FCNTL import F_GETFL, F_SETFL
+else:
+ from fcntl import F_GETFL, F_SETFL
+
+PYMOD_DIR = "/usr/lib/lustre/python"
+
+def development_mode():
+ base = os.path.dirname(sys.argv[0])
+ if os.access(base+"/Makefile", os.R_OK):
+ return 1
+ return 0
+
+if development_mode():
+ sys.path.append('../utils')
+else:
+ sys.path.append(PYMOD_DIR)
+
+import Lustre
+
# Global parameters
-TCP_ACCEPTOR = ''
-MAXTCPBUF = 1048576
-DEFAULT_TCPBUF = 1048576
+MAXTCPBUF = 16777216
+DEFAULT_TCPBUF = 8388608
+DEFAULT_PORT = 988
#
# Maximum number of devices to search for.
# (the /dev/loop* nodes need to be created beforehand)
MAX_LOOP_DEVICES = 256
+PORTALS_DIR = '../portals'
+
+# Needed to call lconf --record
+CONFIG_FILE = ""
+
+# Please keep these in sync with the values in portals/kp30.h
+ptldebug_names = {
+ "trace" : (1 << 0),
+ "inode" : (1 << 1),
+ "super" : (1 << 2),
+ "ext2" : (1 << 3),
+ "malloc" : (1 << 4),
+ "cache" : (1 << 5),
+ "info" : (1 << 6),
+ "ioctl" : (1 << 7),
+ "blocks" : (1 << 8),
+ "net" : (1 << 9),
+ "warning" : (1 << 10),
+ "buffs" : (1 << 11),
+ "other" : (1 << 12),
+ "dentry" : (1 << 13),
+ "portals" : (1 << 14),
+ "page" : (1 << 15),
+ "dlmtrace" : (1 << 16),
+ "error" : (1 << 17),
+ "emerg" : (1 << 18),
+ "ha" : (1 << 19),
+ "rpctrace" : (1 << 20),
+ "vfstrace" : (1 << 21),
+ "reada" : (1 << 22),
+ "mmap" : (1 << 23),
+ "config" : (1 << 24),
+ "console" : (1 << 25),
+ "quota" : (1 << 26),
+ "sec" : (1 << 27),
+}
+
+subsystem_names = {
+ "undefined" : (1 << 0),
+ "mdc" : (1 << 1),
+ "mds" : (1 << 2),
+ "osc" : (1 << 3),
+ "ost" : (1 << 4),
+ "class" : (1 << 5),
+ "log" : (1 << 6),
+ "llite" : (1 << 7),
+ "rpc" : (1 << 8),
+ "mgmt" : (1 << 9),
+ "portals" : (1 << 10),
+ "nal" : (1 << 11),
+ "pinger" : (1 << 12),
+ "filter" : (1 << 13),
+ "ptlbd" : (1 << 14),
+ "echo" : (1 << 15),
+ "ldlm" : (1 << 16),
+ "lov" : (1 << 17),
+ "ptlrouter" : (1 << 18),
+ "cobd" : (1 << 19),
+ "sm" : (1 << 20),
+ "asobd" : (1 << 21),
+ "confobd" : (1 << 22),
+ "lmv" : (1 << 23),
+ "cmobd" : (1 << 24),
+ "sec" : (1 << 25),
+ }
+
first_cleanup_error = 0
def cleanup_error(rc):
if not first_cleanup_error:
first_cleanup_error = rc
-
-def usage():
- print """usage: lconf config.xml
-
-config.xml Lustre configuration in xml format.
---get <url> URL to fetch a config file
---node <nodename> Load config for <nodename>
--d | --cleanup Cleans up config. (Shutdown)
--f | --force Unmount with \"umount -f\" during shutdown
--v | --verbose Print system commands as they are run
--h | --help Print this help
---gdb Prints message after creating gdb module script
- and sleeps for 5 seconds.
--n | --noexec Prints the commands and steps that will be run for a
- config without executing them. This can used to check if a
- config file is doing what it should be doing. (Implies -v)
---nomod Skip load/unload module step.
---nosetup Skip device setup/cleanup step.
---reformat Reformat all devices (without question)
---dump <file> Dump the kernel debug log before portals is unloaded
-"""
- TODO = """
---ldap server LDAP server with lustre config database
---makeldiff Translate xml source to LDIFF
-This are perhaps not needed:
---lustre="src dir" Base directory of lustre sources. Used to search
- for modules.
---portals=src Portals source
-"""
- sys.exit()
-
-# ============================================================
-# Config parameters, encapsulated in a class
-class Config:
- def __init__(self):
- # flags
- self._noexec = 0
- self._verbose = 0
- self._reformat = 0
- self._cleanup = 0
- self._gdb = 0
- self._nomod = 0
- self._nosetup = 0
- self._force = 0
- # parameters
- self._modules = None
- self._node = None
- self._url = None
- self._gdb_script = '/tmp/ogdb'
- self._debug_path = '/tmp/lustre-log'
- self._dump_file = None
- self._src_dir = None
-
- def verbose(self, flag = None):
- if flag: self._verbose = flag
- return self._verbose
-
- def noexec(self, flag = None):
- if flag: self._noexec = flag
- return self._noexec
-
- def reformat(self, flag = None):
- if flag: self._reformat = flag
- return self._reformat
-
- def cleanup(self, flag = None):
- if flag: self._cleanup = flag
- return self._cleanup
-
- def gdb(self, flag = None):
- if flag: self._gdb = flag
- return self._gdb
-
- def nomod(self, flag = None):
- if flag: self._nomod = flag
- return self._nomod
-
- def nosetup(self, flag = None):
- if flag: self._nosetup = flag
- return self._nosetup
-
- def force(self, flag = None):
- if flag: self._force = flag
- return self._force
-
- def node(self, val = None):
- if val: self._node = val
- return self._node
-
- def url(self, val = None):
- if val: self._url = val
- return self._url
-
- def gdb_script(self):
- if os.path.isdir('/r'):
- return '/r' + self._gdb_script
- else:
- return self._gdb_script
-
- def debug_path(self):
- if os.path.isdir('/r'):
- return '/r' + self._debug_path
- else:
- return self._debug_path
-
- def src_dir(self, val = None):
- if val: self._src_dir = val
- return self._src_dir
-
- def dump_file(self, val = None):
- if val: self._dump_file = val
- return self._dump_file
-
-config = Config()
-
# ============================================================
# debugging and error funcs
def fixme(msg = "this feature"):
- raise LconfError, msg + ' not implmemented yet.'
+ raise Lustre.LconfError, msg + ' not implemented yet.'
def panic(*args):
msg = string.join(map(str,args))
- if not config.noexec():
- raise LconfError(msg)
+ if not config.noexec:
+ raise Lustre.LconfError(msg)
else:
print "! " + msg
print string.strip(s)
def debug(*args):
- if config.verbose():
+ if config.verbose:
msg = string.join(map(str,args))
print msg
+# ack, python's builtin int() does not support '0x123' syntax.
+# eval can do it, although what a hack!
+def my_int(s):
+ try:
+ if s[0:2] == '0x':
+ return eval(s, {}, {})
+ else:
+ return int(s)
+ except SyntaxError, e:
+ raise ValueError("not a number")
+ except NameError, e:
+ raise ValueError("not a number")
+
# ============================================================
# locally defined exceptions
class CommandError (exceptions.Exception):
else:
print self.cmd_err
-class LconfError (exceptions.Exception):
- def __init__(self, args):
- self.args = args
+# ============================================================
+# handle daemons, like the acceptor
+class DaemonHandler:
+ """ Manage starting and stopping a daemon. Assumes daemon manages
+    its own pid file. """
+
+ def __init__(self, cmd):
+ self.command = cmd
+ self.path =""
+
+ def start(self):
+ if self.running():
+ log(self.command, "already running.")
+ if not self.path:
+ self.path = find_prog(self.command)
+ if not self.path:
+ panic(self.command, "not found.")
+ ret, out = runcmd(self.path +' '+ self.command_line())
+ if ret:
+ raise CommandError(self.path, out, ret)
+
+ def stop(self):
+ if self.running():
+ pid = self.read_pidfile()
+ try:
+ if pid != 1:
+ log ("killing process", pid)
+ os.kill(pid, 15)
+ else:
+ log("was unable to find pid of " + self.command)
+ #time.sleep(1) # let daemon die
+ except OSError, e:
+ log("unable to kill", self.command, e)
+ if self.running():
+ log("unable to kill", self.command)
+
+ def running(self):
+ pid = self.read_pidfile()
+ if pid:
+ try:
+ if pid != 1:
+ os.kill(pid, 0)
+ else:
+ log("was unable to find pid of " + self.command)
+ except OSError:
+ self.clean_pidfile()
+ else:
+ return 1
+ return 0
+
+ def read_pidfile(self):
+ try:
+ fp = open(self.pidfile(), 'r')
+ val = fp.read()
+ if val == '':
+ val = '1'
+ pid = int(val)
+ fp.close()
+ return pid
+ except IOError:
+ return 0
+
+ def clean_pidfile(self):
+ """ Remove a stale pidfile """
+ log("removing stale pidfile:", self.pidfile())
+ try:
+ os.unlink(self.pidfile())
+ except OSError, e:
+ log(self.pidfile(), e)
+
+class AcceptorHandler(DaemonHandler):
+ def __init__(self, port, net_type):
+ DaemonHandler.__init__(self, "acceptor")
+ self.port = port
+ self.flags = ''
+
+ def pidfile(self):
+ return "/var/run/%s-%d.pid" % (self.command, self.port)
+
+ def command_line(self):
+ return string.join(map(str,(self.flags, self.port)))
+
+acceptors = {}
+
+# start the acceptors
+def run_acceptors():
+ if config.lctl_dump or config.record:
+ return
+ for port in acceptors.keys():
+ daemon = acceptors[port]
+ if not daemon.running():
+ daemon.start()
+
+def run_one_acceptor(port):
+ if config.lctl_dump or config.record:
+ return
+ if acceptors.has_key(port):
+ daemon = acceptors[port]
+ if not daemon.running():
+ daemon.start()
+ else:
+ panic("run_one_acceptor: No acceptor defined for port:", port)
+
+def stop_acceptor(port):
+ if acceptors.has_key(port):
+ daemon = acceptors[port]
+ if daemon.running():
+ daemon.stop()
+
# ============================================================
# handle lctl interface
Initialize close by finding the lctl binary.
"""
self.lctl = find_prog(cmd)
+ self.save_file = ''
+ self.record_device = ''
if not self.lctl:
- if config.noexec():
+ if config.noexec:
debug('! lctl not found')
self.lctl = 'lctl'
else:
raise CommandError('lctl', "unable to find lctl binary.")
+ def use_save_file(self, file):
+ self.save_file = file
+
+ def record(self, dev_name, logname):
+ log("Recording log", logname, "on", dev_name)
+ self.record_device = dev_name
+ self.record_log = logname
+
+ def end_record(self):
+ log("End recording log", self.record_log, "on", self.record_device)
+ self.record_device = None
+ self.record_log = None
+
+ def set_nonblock(self, fd):
+ fl = fcntl.fcntl(fd, F_GETFL)
+ fcntl.fcntl(fd, F_SETFL, fl | os.O_NDELAY)
+
def run(self, cmds):
"""
run lctl
should modify command line to accept multiple commands, or
create complex command line options
"""
- debug("+", self.lctl, cmds)
- if config.noexec(): return (0, [])
- p = popen2.Popen3(self.lctl, 1)
- p.tochild.write(cmds + "\n")
- p.tochild.close()
- out = p.fromchild.readlines()
- err = p.childerr.readlines()
- ret = p.wait()
+ cmd_line = self.lctl
+ if self.save_file:
+ cmds = '\n dump ' + self.save_file + '\n' + cmds
+ elif self.record_device:
+ cmds = """
+ device $%s
+ record %s
+ %s""" % (self.record_device, self.record_log, cmds)
+
+ debug("+", cmd_line, cmds)
+ if config.noexec: return (0, [])
+
+ child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
+ child.tochild.write(cmds + "\n")
+ child.tochild.close()
+# print "LCTL:", cmds
+
+ # From "Python Cookbook" from O'Reilly
+ outfile = child.fromchild
+ outfd = outfile.fileno()
+ self.set_nonblock(outfd)
+ errfile = child.childerr
+ errfd = errfile.fileno()
+ self.set_nonblock(errfd)
+
+ outdata = errdata = ''
+ outeof = erreof = 0
+ while 1:
+ ready = select.select([outfd,errfd],[],[]) # Wait for input
+ if outfd in ready[0]:
+ outchunk = outfile.read()
+ if outchunk == '': outeof = 1
+ outdata = outdata + outchunk
+ if errfd in ready[0]:
+ errchunk = errfile.read()
+ if errchunk == '': erreof = 1
+ errdata = errdata + errchunk
+ if outeof and erreof: break
+ # end of "borrowed" code
+
+ ret = child.wait()
if os.WIFEXITED(ret):
rc = os.WEXITSTATUS(ret)
else:
rc = 0
- if rc or len(err):
- raise CommandError(self.lctl, err, rc)
+ if rc or len(errdata):
+ raise CommandError(self.lctl, errdata, rc)
+ return rc, outdata
+
+ def runcmd(self, *args):
+ """
+ run lctl using the command line
+ """
+ cmd = string.join(map(str,args))
+ debug("+", self.lctl, cmd)
+ rc, out = run(self.lctl, cmd)
+ if rc:
+ raise CommandError(self.lctl, out, rc)
return rc, out
-
+ def clear_log(self, dev, log):
+ """ clear an existing log """
+ cmds = """
+ device $%s
+ probe
+ clear_log %s
+ quit """ % (dev, log)
+ self.run(cmds)
+
+ def root_squash(self, name, uid, nid):
+ cmds = """
+ device $%s
+ root_squash %s %s
+ quit""" % (name, uid, nid)
+ self.run(cmds)
+
def network(self, net, nid):
- """ initialized network and add "self" """
- # Idea: "mynid" could be used for all network types to add "self," and then
- # this special case would be gone and the "self" hack would be hidden.
- if net == 'tcp':
- cmds = """
+ """ set mynid """
+ cmds = """
network %s
mynid %s
- add_uuid self %s
- quit""" % (net, nid, nid)
- else:
- cmds = """
+ quit """ % (net, nid)
+ self.run(cmds)
+
+ # add an interface
+ def add_interface(self, net, ip, netmask = ""):
+ """ add an interface """
+ cmds = """
network %s
- add_uuid self %s
- quit""" % (net, nid)
-
+ add_interface %s %s
+ quit """ % (net, ip, netmask)
+ self.run(cmds)
+
+ # delete an interface
+ def del_interface(self, net, ip):
+ """ delete an interface """
+ cmds = """
+ network %s
+ del_interface %s
+ quit """ % (net, ip)
self.run(cmds)
# create a new connection
- def connect(self, net, nid, port, servuuid, send_mem, recv_mem):
- if net == 'tcp':
+ def add_uuid(self, net_type, uuid, nid):
+ cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type)
+ self.run(cmds)
+
+ def add_peer(self, net_type, nid, hostaddr, port):
+ if net_type in ('tcp','openib','ra') and not config.lctl_dump:
cmds = """
network %s
- add_uuid %s %s
- send_mem %d
- recv_mem %d
- connect %s %d
- quit""" % (net, servuuid, nid, send_mem, recv_mem, nid, port, )
- else:
+ add_peer %s %s %d
+ quit""" % (net_type,
+ nid, hostaddr, port )
+ self.run(cmds)
+ elif net_type in ('iib',) and not config.lctl_dump:
cmds = """
network %s
- add_uuid %s %s
- connect %s %d
- quit""" % (net, servuuid, nid, nid, port, )
-
+ add_peer %s
+ quit""" % (net_type,
+ nid )
+ self.run(cmds)
+ elif net_type in ('vib',) and not config.lctl_dump:
+ cmds = """
+ network %s
+ add_peer %s %s
+ quit""" % (net_type,
+ nid, hostaddr )
+ self.run(cmds)
+
+ def connect(self, srv):
+ self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
+ if srv.net_type in ('tcp','openib','iib','vib','ra') and not config.lctl_dump:
+ if srv.hostaddr[0]:
+ hostaddr = string.split(srv.hostaddr[0], '/')[0]
+ self.add_peer(srv.net_type, srv.nid, hostaddr, srv.port)
+
+ # Recover a device
+ def recover(self, dev_name, new_conn):
+ cmds = """
+ device $%s
+ recover %s""" %(dev_name, new_conn)
self.run(cmds)
# add a route to a range
cmds = """
network %s
add_route %s %s %s
- quit """ % (net, gw, lo, hi)
- self.run(cmds)
-
+ quit """ % (net,
+ gw, lo, hi)
+ try:
+ self.run(cmds)
+ except CommandError, e:
+ log ("ignore: ")
+ e.dump()
- # add a route to a range
def del_route(self, net, gw, lo, hi):
cmds = """
ignore_errors
network %s
- del_route %s
- quit """ % (net, lo)
+ del_route %s %s %s
+ quit """ % (net, gw, lo, hi)
self.run(cmds)
# add a route to a host
def add_route_host(self, net, uuid, gw, tgt):
+ self.add_uuid(net, uuid, tgt)
cmds = """
network %s
- add_uuid %s %s
add_route %s %s
- quit """ % (net, uuid, tgt, gw, tgt)
- self.run(cmds)
+ quit """ % (net,
+ gw, tgt)
+ try:
+ self.run(cmds)
+ except CommandError, e:
+ log ("ignore: ")
+ e.dump()
# add a route to a range
def del_route_host(self, net, uuid, gw, tgt):
+ self.del_uuid(uuid)
cmds = """
ignore_errors
network %s
- del_uuid %s
- del_route %s
- quit """ % (net, uuid, tgt)
+ del_route %s %s
+ quit """ % (net, gw, tgt)
self.run(cmds)
+
+ def del_peer(self, net_type, nid, hostaddr):
+ if net_type in ('tcp',) and not config.lctl_dump:
+ cmds = """
+ ignore_errors
+ network %s
+ del_peer %s %s single_share
+ quit""" % (net_type,
+ nid, hostaddr)
+ self.run(cmds)
+ elif net_type in ('openib','iib','vib','ra') and not config.lctl_dump:
+ cmds = """
+ ignore_errors
+ network %s
+ del_peer %s single_share
+ quit""" % (net_type,
+ nid)
+ self.run(cmds)
+
# disconnect one connection
- def disconnect(self, net, nid, port, servuuid):
+ def disconnect(self, srv):
+ self.del_uuid(srv.nid_uuid)
+ if srv.net_type in ('tcp','openib','iib','vib','ra') and not config.lctl_dump:
+ if srv.hostaddr[0]:
+ hostaddr = string.split(srv.hostaddr[0], '/')[0]
+ self.del_peer(srv.net_type, srv.nid, hostaddr)
+
+ def del_uuid(self, uuid):
cmds = """
ignore_errors
- network %s
- disconnect %s
del_uuid %s
- quit""" % (net, nid, servuuid)
+ quit""" % (uuid,)
self.run(cmds)
# disconnect all
cmds = """
ignore_errors
network %s
- del_uuid self
disconnect
quit""" % (net)
self.run(cmds)
- # create a new device with lctl
- def newdev(self, attach, setup = ""):
+ def attach(self, type, name, uuid):
+ cmds = """
+ attach %s %s %s
+ quit""" % (type, name, uuid)
+ self.run(cmds)
+
+ def detach(self, name):
+ cmds = """
+ cfg_device %s
+ detach
+ quit""" % (name)
+ self.run(cmds)
+
+ def set_security(self, name, key, value):
cmds = """
- newdev
- attach %s
+ cfg_device %s
+ set_security %s %s
+ quit""" % (name, key, value)
+ self.run(cmds)
+
+ def setup(self, name, setup = ""):
+ cmds = """
+ cfg_device %s
setup %s
- quit""" % (attach, setup)
+ quit""" % (name, setup)
+ self.run(cmds)
+
+ def add_conn(self, name, conn_uuid):
+ cmds = """
+ cfg_device %s
+ add_conn %s
+ quit""" % (name, conn_uuid)
+ self.run(cmds)
+
+ def start(self, name, conf_name):
+ cmds = """
+ device $%s
+ start %s
+ quit""" % (name, conf_name)
self.run(cmds)
+ # create a new device with lctl
+ def newdev(self, type, name, uuid, setup = ""):
+ if type != 'mds':
+ self.attach(type, name, uuid);
+ try:
+ self.setup(name, setup)
+ except CommandError, e:
+ self.cleanup(name, uuid, 0)
+ raise e
+
# cleanup a device
- def cleanup(self, name, uuid):
+ def cleanup(self, name, uuid, force, failover = 0):
+ if failover: force = 1
cmds = """
ignore_errors
- device $%s
- cleanup
+ cfg_device $%s
+ cleanup %s %s
detach
- quit""" % (name)
+ quit""" % (name, ('', 'force')[force],
+ ('', 'failover')[failover])
self.run(cmds)
# create an lov
- def lovconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist):
+ def lov_setup(self, name, uuid, desc_uuid, stripe_cnt,
+ stripe_sz, stripe_off, pattern, devlist = None):
+ cmds = """
+ attach lov %s %s
+ lov_setup %s %d %d %d %s %s
+ quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off,
+ pattern, devlist)
+ self.run(cmds)
+
+ # add an OBD to a LOV
+ def lov_add_obd(self, name, uuid, obd_uuid, index, gen):
+ cmds = """
+ lov_modify_tgts add %s %s %s %s
+ quit""" % (name, obd_uuid, index, gen)
+ self.run(cmds)
+
+ # create an lmv
+ def lmv_setup(self, name, uuid, desc_uuid, devlist):
+ cmds = """
+ attach lmv %s %s
+ lmv_setup %s %s
+ quit""" % (name, uuid, desc_uuid, devlist)
+ self.run(cmds)
+
+ # delete an OBD from a LOV
+ def lov_del_obd(self, name, uuid, obd_uuid, index, gen):
+ cmds = """
+ lov_modify_tgts del %s %s %s %s
+ quit""" % (name, obd_uuid, index, gen)
+ self.run(cmds)
+
+ # deactivate an OBD
+ def deactivate(self, name):
cmds = """
device $%s
- probe
- lovconfig %s %d %d %d %s %s
- quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
+ deactivate
+ quit""" % (name)
self.run(cmds)
# dump the log file
quit""" % (dump_file)
self.run(cmds)
+ # get list of devices
+ def device_list(self):
+ devices = '/proc/fs/lustre/devices'
+ ret = []
+ if os.access(devices, os.R_OK):
+ try:
+ fp = open(devices, 'r')
+ ret = fp.readlines()
+ fp.close()
+ except IOError, e:
+ log(e)
+ return ret
+
+ # get lustre version
+ def lustre_version(self):
+ rc, out = self.runcmd('version')
+ return out
+
+ # dump mount options
+ def mount_option(self, profile, osc, mdc):
+ cmds = """
+ mount_option %s %s %s
+ quit""" % (profile, osc, mdc)
+ self.run(cmds)
+
+ # delete mount options
+ def del_mount_option(self, profile):
+ cmds = """
+ del_mount_option %s
+ quit""" % (profile,)
+ self.run(cmds)
+
+ def set_timeout(self, timeout):
+ cmds = """
+ set_timeout %s
+ quit""" % (timeout,)
+ self.run(cmds)
+
+ def set_lustre_upcall(self, upcall):
+ cmds = """
+ set_lustre_upcall %s
+ quit""" % (upcall,)
+ self.run(cmds)
# ============================================================
# Various system-level functions
# (ideally moved to their own module)
# Run a command and return the output and status.
# stderr is sent to /dev/null, could use popen3 to
# save it if necessary
-def run(*args):
- cmd = string.join(map(str,args))
+def runcmd(cmd):
debug ("+", cmd)
- if config.noexec(): return (0, [])
+ if config.noexec: return (0, [])
f = os.popen(cmd + ' 2>&1')
out = f.readlines()
ret = f.close()
ret = 0
return (ret, out)
+def run(*args):
+ cmd = string.join(map(str,args))
+ return runcmd(cmd)
+
# Run a command in the background.
def run_daemon(*args):
cmd = string.join(map(str,args))
debug ("+", cmd)
- if config.noexec(): return 0
+ if config.noexec: return 0
f = os.popen(cmd + ' 2>&1')
ret = f.close()
if ret:
syspath = string.split(os.environ['PATH'], ':')
cmdpath = os.path.dirname(sys.argv[0])
syspath.insert(0, cmdpath);
- syspath.insert(0, os.path.join(cmdpath, '../../portals/linux/utils/'))
+ if config.portals:
+ syspath.insert(0, os.path.join(config.portals, 'utils/'))
for d in syspath:
prog = os.path.join(d,cmd)
if os.access(prog, os.X_OK):
if module:
return module
-def find_module(src_dir, dev_dir, modname):
- mod = '%s.o' % (modname)
- module = src_dir +'/'+ dev_dir +'/'+ mod
- try:
- if os.access(module, os.R_OK):
- return module
- except OSError:
- pass
- return None
-
# is the path a block device?
def is_block(path):
s = ()
return 0
return stat.S_ISBLK(s[stat.ST_MODE])
+# find the journal device from mkfs options
+def jdev(opts):
+ if opts == None:
+ return ''
+ x=string.split(opts)
+ i=0
+ while i < len(x) - 1:
+ if x[i] == '-J' and x[i+1].startswith('device='):
+ str=x[i+1]
+ return str[7:]
+ i=i+1
+ return ''
+
# build fs according to type
# fixme: dangerous
-def mkfs(fstype, dev):
- if(fstype in ('ext3', 'extN')):
- mkfs = 'mkfs.ext2 -j -b 4096'
- else:
- print 'unsupported fs type: ', fstype
- if not is_block(dev):
- force = '-F'
+def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
+ block_cnt = ''
+ jopt = ''
+ iopt = ''
+ if devsize:
+ if devsize < 8000:
+ panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
+ (dev, devsize))
+ # devsize is in 1k, and fs block count is in 4k
+ block_cnt = devsize/4
+
+ if fstype in ('ext3', 'extN', 'ldiskfs'):
+ # ext3 journal size is in megabytes
+ # but don't set jsize if mkfsoptions indicates a separate journal device
+ if jsize == 0 and jdev(mkfsoptions) == '':
+ if devsize == 0:
+ if not is_block(dev):
+ ret, out = runcmd("ls -l %s" %dev)
+ devsize = int(string.split(out[0])[4]) / 1024
+ else:
+ # sfdisk works for symlink, hardlink, and realdev
+ ret, out = runcmd("sfdisk -s %s" %dev)
+ if not ret:
+ devsize = int(out[0])
+ else:
+ # sfdisk -s will fail for too large block device,
+ # then, read the size of partition from /proc/partitions
+
+ # get the realpath of the device
+ # it may be the real device, such as /dev/hda7
+ # or the hardlink created via mknod for a device
+ if 'realpath' in dir(os.path):
+ real_dev = os.path.realpath(dev)
+ else:
+ real_dev = dev
+ link_count = 0
+ while os.path.islink(real_dev) and (link_count < 20):
+ link_count = link_count + 1
+ dev_link = os.readlink(real_dev)
+ if os.path.isabs(dev_link):
+ real_dev = dev_link
+ else:
+ real_dev = os.path.join(os.path.dirname(real_dev), dev_link)
+ if link_count > 19:
+                        panic("Encountered too many symbolic links resolving block device:", dev)
+
+ # get the major and minor number of the realpath via ls
+ # it seems python(os.stat) does not return
+ # the st_rdev member of the stat structure
+ ret, out = runcmd("ls -l %s" %real_dev)
+ major = string.split(string.split(out[0])[4], ",")[0]
+ minor = string.split(out[0])[5]
+
+ # get the devsize from /proc/partitions with the major and minor number
+ ret, out = runcmd("cat /proc/partitions")
+ for line in out:
+ if len(line) > 1:
+ if string.split(line)[0] == major and string.split(line)[1] == minor:
+ devsize = int(string.split(line)[2])
+ break
+
+ if devsize > 1024 * 1024:
+ jsize = ((devsize / 102400) * 4)
+ if jsize > 400:
+ jsize = 400
+ if jsize: jopt = "-J size=%d" %(jsize,)
+ if isize: iopt = "-I %d" %(isize,)
+ mkfs = 'mkfs.ext2 -j -b 4096 '
+ if not isblock or config.force:
+ mkfs = mkfs + ' -F '
+ if jdev(mkfsoptions) != '':
+ jmkfs = 'mkfs.ext2 -b 4096 -O journal_dev '
+ if config.force:
+ jmkfs = jmkfs + '-F '
+ jmkfs = jmkfs + jdev(mkfsoptions)
+ (ret, out) = run (jmkfs)
+ if ret:
+            panic("Unable to format journal device:", jdev(mkfsoptions), string.join(out))
+ elif fstype == 'reiserfs':
+ # reiserfs journal size is in blocks
+ if jsize: jopt = "--journal_size %d" %(jsize,)
+ mkfs = 'mkreiserfs -ff'
else:
- force = ''
- (ret, out) = run (mkfs, force, dev)
+ panic('unsupported fs type: ', fstype)
+
+ if config.mkfsoptions != None:
+ mkfs = mkfs + ' ' + config.mkfsoptions
+ if mkfsoptions != None:
+ mkfs = mkfs + ' ' + mkfsoptions
+ (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt)
if ret:
- panic("Unable to build fs:", dev)
- # enable hash tree indexing on fs
- if fstype == 'extN':
+ panic("Unable to build fs:", dev, string.join(out))
+    # enable hash tree indexing on fs
+ if fstype in ('ext3', 'extN', 'ldiskfs'):
htree = 'echo "feature FEATURE_C5" | debugfs -w'
(ret, out) = run (htree, dev)
if ret:
panic ("can't access loop devices")
return loop
-# find loop device assigned to thefile
-def find_loop(file):
+# find loop device assigned to the file
+def find_assigned_loop(file):
loop = loop_base()
for n in xrange(0, MAX_LOOP_DEVICES):
dev = loop + str(n)
if os.access(dev, os.R_OK):
(stat, out) = run('losetup', dev)
- if (out and stat == 0):
+ if out and stat == 0:
m = re.search(r'\((.*)\)', out[0])
if m and file == m.group(1):
return dev
- else:
- break
return ''
-# create file if necessary and assign the first free loop device
-def init_loop(file, size, fstype):
- dev = find_loop(file)
- if dev:
- print 'WARNING file:', file, 'already mapped to', dev
- return dev
- if config.reformat() or not os.access(file, os.R_OK | os.W_OK):
- run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, file))
+# find free loop device
+def find_free_loop(file):
loop = loop_base()
+
# find next free loop
for n in xrange(0, MAX_LOOP_DEVICES):
dev = loop + str(n)
if os.access(dev, os.R_OK):
(stat, out) = run('losetup', dev)
- if (stat):
- run('losetup', dev, file)
+ if stat:
return dev
- else:
- print "out of loop devices"
- return ''
- print "out of loop devices"
return ''
-# undo loop assignment
-def clean_loop(file):
- dev = find_loop(file)
+# create file if necessary and assign the first free loop device
+def init_loop(file, size, fstype, journal_size, inode_size,
+ mkfsoptions, reformat, autoformat, backfstype, backfile):
+ if fstype == 'smfs':
+ realfile = backfile
+ realfstype = backfstype
+ if is_block(backfile):
+ if reformat or (need_format(realfstype, backfile) and autoformat == 'yes'):
+ mkfs(realfile, size, realfstype, journal_size, inode_size, mkfsoptions, isblock=0)
+ return realfile
+ else:
+ realfile = file
+ realfstype = fstype
+
+ dev = find_assigned_loop(realfile)
if dev:
- ret, out = run('losetup -d', dev)
+ print 'WARNING: file', realfile, 'already mapped to', dev
+ return dev
+
+ if reformat or not os.access(realfile, os.R_OK | os.W_OK):
+ (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, realfile))
if ret:
- log('unable to clean loop device:', dev, 'for file:', file)
- logall(out)
+ panic("Unable to create backing store:", realfile)
+ mkfs(realfile, size, realfstype, journal_size, inode_size,
+ mkfsoptions, isblock=0)
+
+ dev = find_free_loop(realfile)
+ if dev:
+ print "attach " + realfile + " <-> " + dev
+ run('losetup', dev, realfile)
+ return dev
+
+ print "out of loop devices"
+ return ''
+# undo loop assignment
+def clean_loop(dev, fstype, backfstype, backdev):
+ if fstype == 'smfs':
+ realfile = backdev
+ else:
+ realfile = dev
+ if not is_block(realfile):
+ dev = find_assigned_loop(realfile)
+ if dev:
+ print "detach " + dev + " <-> " + realfile
+ ret, out = run('losetup -d', dev)
+ if ret:
+ log('unable to clean loop device', dev, 'for file', realfile)
+ logall(out)
+
+# finalizes passed device
+def clean_dev(dev, fstype, backfstype, backdev):
+ if fstype == 'smfs' or not is_block(dev):
+ clean_loop(dev, fstype, backfstype, backdev)
+
# determine if dev is formatted as a <fstype> filesystem
def need_format(fstype, dev):
# FIXME don't know how to implement this
return 0
# initialize a block device if needed
-def block_dev(dev, size, fstype, format):
- if config.noexec(): return dev
- if not is_block(dev):
- dev = init_loop(dev, size, fstype)
- if config.reformat() or (need_format(fstype, dev) and format == 'yes'):
- mkfs(fstype, dev)
-
+def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
+ inode_size, mkfsoptions, backfstype, backdev):
+ if config.noexec:
+ return dev
+
+ if fstype == 'smfs' or not is_block(dev):
+ dev = init_loop(dev, size, fstype, journal_size, inode_size,
+ mkfsoptions, reformat, autoformat, backfstype, backdev)
+ elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
+ mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
+ isblock=0)
# else:
# panic("device:", dev,
# "not prepared, and autoformat is not set.\n",
ip = string.split(addr, ':')[1]
return ip
-def get_local_address(net_type, wildcard):
+def def_mount_options(fstype, target):
+    """returns default mount options for passed fstype and target (mds, ost)"""
+ if fstype == 'ext3' or fstype == 'ldiskfs':
+ mountfsoptions = "errors=remount-ro"
+ if target == 'ost' and sys_get_branch() == '2.4':
+ mountfsoptions = "%s,asyncdel" % (mountfsoptions)
+ if target == 'ost' and sys_get_branch() == '2.6':
+ mountfsoptions = "%s,extents,mballoc" % (mountfsoptions)
+ return mountfsoptions
+ return ""
+
+def sys_get_elan_position_file():
+ procfiles = ["/proc/elan/device0/position",
+ "/proc/qsnet/elan4/device0/position",
+ "/proc/qsnet/elan3/device0/position"]
+ for p in procfiles:
+ if os.access(p, os.R_OK):
+ return p
+ return ""
+
+def sys_get_local_nid(net_type, wildcard, cluster_id):
+ """Return the local nid."""
+ local = ""
+ if sys_get_elan_position_file():
+ local = sys_get_local_address('elan', '*', cluster_id)
+ else:
+ local = sys_get_local_address(net_type, wildcard, cluster_id)
+ return local
+
+def sys_get_local_address(net_type, wildcard, cluster_id):
"""Return the local address for the network type."""
local = ""
- if net_type == 'tcp':
+ if net_type in ('tcp','openib','iib','vib','ra'):
if ':' in wildcard:
iface, star = string.split(wildcard, ':')
local = if2addr(iface)
host = socket.gethostname()
local = socket.gethostbyname(host)
elif net_type == 'elan':
- # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
+ # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()'
+ f = sys_get_elan_position_file()
+ if not f:
+ panic ("unable to determine local Elan ID")
try:
- fp = open('/proc/elan/device0/position', 'r')
+ fp = open(f, 'r')
lines = fp.readlines()
fp.close()
for l in lines:
a = string.split(l)
if a[0] == 'NodeId':
- local = a[1]
+ elan_id = a[1]
break
+ try:
+ nid = my_int(cluster_id) + my_int(elan_id)
+ local = "%d" % (nid)
+ except ValueError, e:
+ local = elan_id
except IOError, e:
log(e)
+ elif net_type == 'lo':
+ fixme("automatic local address for loopback")
elif net_type == 'gm':
fixme("automatic local address for GM")
+
return local
-
-
-# ============================================================
-# Classes to prepare and cleanup the various objects
-#
-class Module:
- """ Base class for the rest of the modules. The default cleanup method is
- defined here, as well as some utilitiy funcs.
- """
- def __init__(self, module_name, dom_node):
- self.dom_node = dom_node
- self.module_name = module_name
- self.name = get_attr(dom_node, 'name')
- self.uuid = get_attr(dom_node, 'uuid')
- self.kmodule_list = []
- self._server = None
- self._connected = 0
+def sys_get_branch():
+ """Returns kernel release"""
+ try:
+ fp = open('/proc/sys/kernel/osrelease')
+ lines = fp.readlines()
+ fp.close()
- def info(self, *args):
- msg = string.join(map(str,args))
- print self.module_name + ":", self.name, self.uuid, msg
-
-
- def lookup_server(self, srv_uuid):
- """ Lookup a server's network information """
- net = get_ost_net(self.dom_node.parentNode, srv_uuid)
- if not net:
- panic ("Unable to find a server for:", srv_uuid)
- self._server = Network(net)
-
- def get_server(self):
- return self._server
+ for l in lines:
+ version = string.split(l)
+ a = string.split(version[0], '.')
+ return a[0] + '.' + a[1]
+ except IOError, e:
+ log(e)
+ return ""
- def cleanup(self):
- """ default cleanup, used for most modules """
- self.info()
- srv = self.get_server()
- if srv and local_net(srv):
- try:
- lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
- except CommandError, e:
- log(self.module_name, "disconnect failed: ", self.name)
- e.dump()
- cleanup_error(e.rc)
- try:
- lctl.cleanup(self.name, self.uuid)
- except CommandError, e:
- log(self.module_name, "cleanup failed: ", self.name)
- e.dump()
- cleanup_error(e.rc)
+# XXX: instead of device_list, ask for $name and see what we get
+def is_prepared(name):
+ """Return true if a device exists for the name"""
+ if config.lctl_dump:
+ return 0
+ if (config.noexec or config.record) and config.cleanup:
+ return 1
+ try:
+ # expect this format:
+ # 1 UP ldlm ldlm ldlm_UUID 2
+ out = lctl.device_list()
+ for s in out:
+ if name == string.split(s)[3]:
+ return 1
+ except CommandError, e:
+ e.dump()
+ return 0
- def add_module(self, dev_dir, modname):
- """Append a module to list of modules to load."""
- self.kmodule_list.append((dev_dir, modname))
+def net_is_prepared():
+    """If any device exists, then assume that all networking
+ has been configured"""
+ out = lctl.device_list()
+ return len(out) > 0
- def mod_loaded(self, modname):
- """Check if a module is already loaded. Look in /proc/modules for it."""
- fp = open('/proc/modules')
+def fs_is_mounted(path):
+ """Return true if path is a mounted lustre filesystem"""
+ try:
+ fp = open('/proc/mounts')
lines = fp.readlines()
fp.close()
- # please forgive my tired fingers for this one
- ret = filter(lambda word, mod=modname: word == mod,
- map(lambda line: string.split(line)[0], lines))
- return ret
+ for l in lines:
+ a = string.split(l)
+ if a[1] == path and a[2] == 'lustre_lite':
+ return 1
+ except IOError, e:
+ log(e)
+ return 0
- def load_module(self):
- """Load all the modules in the list in the order they appear."""
- for dev_dir, mod in self.kmodule_list:
- # (rc, out) = run ('/sbin/lsmod | grep -s', mod)
- if self.mod_loaded(mod) and not config.noexec():
- continue
- log ('loading module:', mod)
- if config.src_dir():
- module = find_module(config.src_dir(),dev_dir, mod)
- if not module:
- panic('module not found:', mod)
- (rc, out) = run('/sbin/insmod', module)
- if rc:
+def kmod_find(src_dir, dev_dir, modname):
+ modbase = src_dir +'/'+ dev_dir +'/'+ modname
+ for modext in '.ko', '.o':
+ module = modbase + modext
+ try:
+ if os.access(module, os.R_OK):
+ return module
+ except OSError:
+ pass
+ return None
+
+def kmod_info(modname):
+ """Returns reference count for passed module name."""
+ try:
+ fp = open('/proc/modules')
+ lines = fp.readlines()
+ fp.close()
+
+ # please forgive my tired fingers for this one
+ ret = filter(lambda word, mod = modname: word[0] == mod,
+ map(lambda line: string.split(line), lines))
+ if not ret:
+ return ''
+ return ret[0]
+ except Exception, e:
+ return 0
+
+class kmod:
+ """Presents kernel module"""
+ def __init__(self, src_dir, dev_dir, name):
+ self.src_dir = src_dir
+ self.dev_dir = dev_dir
+ self.name = name
+
+    # FIXME: we ignore failure to load the gss module, because we might
+    # not need it at all.
+ def load(self):
+ """Load module"""
+ log ('loading module:', self.name, 'srcdir',
+ self.src_dir, 'devdir', self.dev_dir)
+ if self.src_dir:
+ module = kmod_find(self.src_dir, self.dev_dir,
+ self.name)
+ if not module and self.name != 'ptlrpcs_gss':
+ panic('module not found:', self.name)
+ (rc, out) = run('/sbin/insmod', module)
+ if rc:
+ if self.name == 'ptlrpcs_gss':
+ print "Warning: not support gss security!"
+ else:
raise CommandError('insmod', out, rc)
- else:
- (rc, out) = run('/sbin/modprobe', mod)
- if rc:
+ else:
+ (rc, out) = run('/sbin/modprobe', self.name)
+ if rc:
+ if self.name == 'ptlrpcs_gss':
+ print "Warning: not support gss security!"
+ else:
raise CommandError('modprobe', out, rc)
-
- def cleanup_module(self):
+
+ def cleanup(self):
+ """Unload module"""
+ log('unloading module:', self.name)
+ (rc, out) = run('/sbin/rmmod', self.name)
+ if rc:
+ log('unable to unload module:', self.name +
+ "(" + self.refcount() + ")")
+ logall(out)
+
+ def info(self):
+ """Returns module info if any."""
+ return kmod_info(self.name)
+
+ def loaded(self):
+ """Returns 1 if module is loaded. Otherwise 0 is returned."""
+ if self.info():
+ return 1
+ else:
+ return 0
+
+ def refcount(self):
+ """Returns module refcount."""
+ info = self.info()
+ if not info:
+ return ''
+ return info[2]
+
+ def used(self):
+ """Returns 1 if module is used, otherwise 0 is returned."""
+ info = self.info()
+ if not info:
+ return 0
+ if len(info) > 3:
+ users = info[3]
+ if users and users != '(unused)' and users != '-':
+ return 1
+ else:
+ return 0
+ else:
+ return 0
+
+ def busy(self):
+ """Returns 1 if module is busy, otherwise 0 is returned."""
+ if self.loaded() and (self.used() or self.refcount() != '0'):
+ return 1
+ else:
+ return 0
+
+class kmod_manager:
+ """Manage kernel modules"""
+ def __init__(self, lustre_dir, portals_dir):
+ self.lustre_dir = lustre_dir
+ self.portals_dir = portals_dir
+ self.kmodule_list = []
+
+ def find_module(self, modname):
+ """Find module by module name"""
+ for mod in self.kmodule_list:
+ if mod.name == modname:
+ return mod
+ return ''
+
+ def add_portals_module(self, dev_dir, modname):
+ """Append a module to list of modules to load."""
+
+ mod = self.find_module(modname)
+ if not mod:
+ mod = kmod(self.portals_dir, dev_dir, modname)
+ self.kmodule_list.append(mod)
+
+ def add_lustre_module(self, dev_dir, modname):
+ """Append a module to list of modules to load."""
+
+ mod = self.find_module(modname)
+ if not mod:
+ mod = kmod(self.lustre_dir, dev_dir, modname)
+ self.kmodule_list.append(mod)
+
+ def load_modules(self):
+ """Load all the modules in the list in the order they appear."""
+ for mod in self.kmodule_list:
+ if mod.loaded() and not config.noexec:
+ continue
+ mod.load()
+
+ def cleanup_modules(self):
"""Unload the modules in the list in reverse order."""
rev = self.kmodule_list
rev.reverse()
- for dev_dir, mod in rev:
- if not self.mod_loaded(mod):
+ for mod in rev:
+ if (not mod.loaded() or mod.busy()) and not config.noexec:
continue
# debug hack
- if mod == 'portals' and config.dump_file():
- lctl.dump(config.dump_file())
- log('unloading module:', mod)
- if config.noexec():
- continue
- (rc, out) = run('/sbin/rmmod', mod)
- if rc:
- log('! unable to unload module:', mod)
- logall(out)
-
+ if mod.name == 'portals' and config.dump:
+ lctl.dump(config.dump)
+ mod.cleanup()
+
+# ============================================================
+# Classes to prepare and cleanup the various objects
+#
+class Module:
+ """ Base class for the rest of the modules. The default cleanup method is
+    defined here, as well as some utility funcs.
+ """
+ def __init__(self, module_name, db):
+ self.db = db
+ self.module_name = module_name
+ self.name = self.db.getName()
+ self.uuid = self.db.getUUID()
+ self._server = None
+ self._connected = 0
+
+ def info(self, *args):
+ msg = string.join(map(str,args))
+ print self.module_name + ":", self.name, self.uuid, msg
+ def cleanup(self):
+ """ default cleanup, used for most modules """
+ self.info()
+ try:
+ lctl.cleanup(self.name, self.uuid, config.force)
+ except CommandError, e:
+ log(self.module_name, "cleanup failed: ", self.name)
+ e.dump()
+ cleanup_error(e.rc)
+
+ def add_module(self, manager):
+ """Adds all needed modules in the order they appear."""
+ return
+
+ def safe_to_clean(self):
+ return 1
+
+ def safe_to_clean_modules(self):
+ return self.safe_to_clean()
+
class Network(Module):
- def __init__(self,dom_node):
- Module.__init__(self, 'NETWORK', dom_node)
- self.net_type = get_attr(dom_node,'type')
- self.nid = get_text(dom_node, 'server', '*')
- self.port = get_text_int(dom_node, 'port', 0)
- self.send_mem = get_text_int(dom_node, 'send_mem', DEFAULT_TCPBUF)
- self.recv_mem = get_text_int(dom_node, 'recv_mem', DEFAULT_TCPBUF)
+ def __init__(self,db):
+ Module.__init__(self, 'NETWORK', db)
+ self.net_type = self.db.get_val('nettype')
+ self.nid = self.db.get_val('nid', '*')
+ self.cluster_id = self.db.get_val('clusterid', "0")
+ self.port = self.db.get_val_int('port', 0)
+
if '*' in self.nid:
- self.nid = get_local_address(self.net_type, self.nid)
+ self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
if not self.nid:
- panic("unable to set nid for", self.net_type, self.nid)
+                panic("unable to set nid for", self.net_type, self.nid, self.cluster_id)
+ self.generic_nid = 1
debug("nid:", self.nid)
-
- self.add_module('portals/linux/oslib/', 'portals')
- if node_needs_router():
- self.add_module('portals/linux/router', 'kptlrouter')
+ else:
+ self.generic_nid = 0
+
+ self.nid_uuid = self.nid_to_uuid(self.nid)
+ self.hostaddr = self.db.get_hostaddr()
+ if len(self.hostaddr) == 0:
+ self.hostaddr.append(self.nid)
+ if '*' in self.hostaddr[0]:
+ self.hostaddr[0] = sys_get_local_address(self.net_type, self.hostaddr[0], self.cluster_id)
+ if not self.hostaddr[0]:
+ panic("unable to set hostaddr for", self.net_type, self.hostaddr[0], self.cluster_id)
+ debug("hostaddr:", self.hostaddr[0])
+
+ def add_module(self, manager):
+ manager.add_portals_module("libcfs", 'libcfs')
+ manager.add_portals_module("portals", 'portals')
+
+ if node_needs_router():
+ manager.add_portals_module("router", 'kptlrouter')
if self.net_type == 'tcp':
- self.add_module('portals/linux/socknal', 'ksocknal')
+ manager.add_portals_module("knals/socknal", 'ksocknal')
if self.net_type == 'elan':
- self.add_module('portals/linux/rqswnal', 'kqswnal')
+ manager.add_portals_module("knals/qswnal", 'kqswnal')
if self.net_type == 'gm':
- self.add_module('portals/linux/gmnal', 'kgmnal')
- self.add_module('lustre/obdclass', 'obdclass')
- self.add_module('lustre/ptlrpc', 'ptlrpc')
+ manager.add_portals_module("knals/gmnal", 'kgmnal')
+ if self.net_type == 'openib':
+ manager.add_portals_module("knals/openibnal", 'kopenibnal')
+ if self.net_type == 'iib':
+ manager.add_portals_module("knals/iibnal", 'kiibnal')
+ if self.net_type == 'vib':
+            manager.add_portals_module("knals/vibnal", 'kvibnal')
+ if self.net_type == 'lo':
+ manager.add_portals_module("knals/lonal", 'klonal')
+ if self.net_type == 'ra':
+ manager.add_portals_module("knals/ranal", 'kranal')
+
+ def nid_to_uuid(self, nid):
+ return "NID_%s_UUID" %(nid,)
def prepare(self):
+ if not config.record and net_is_prepared():
+ return
self.info(self.net_type, self.nid, self.port)
+ if not (config.record and self.generic_nid):
+ lctl.network(self.net_type, self.nid)
if self.net_type == 'tcp':
- ret, out = run(TCP_ACCEPTOR, '-s', self.send_mem, '-r', self.recv_mem, self.port)
- if ret:
- raise CommandError(TCP_ACCEPTOR, out, ret)
- ret = self.dom_node.getElementsByTagName('route_tbl')
- for a in ret:
- for r in a.getElementsByTagName('route'):
- net_type = get_attr(r, 'type')
- gw = get_attr(r, 'gw')
- lo = get_attr(r, 'lo')
- hi = get_attr(r,'hi', '')
- lctl.add_route(net_type, gw, lo, hi)
- if net_type == 'tcp' and net_type == self.net_type and hi == '':
- srv = nid2server(self.dom_node.parentNode.parentNode, lo)
- if not srv:
- panic("no server for nid", lo)
- else:
- lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
-
-
- lctl.network(self.net_type, self.nid)
- lctl.newdev(attach = "ptlrpc RPCDEV")
+ sys_tweak_socknal()
+ for hostaddr in self.db.get_hostaddr():
+ ip = string.split(hostaddr, '/')[0]
+ if len(string.split(hostaddr, '/')) == 2:
+ netmask = string.split(hostaddr, '/')[1]
+ else:
+ netmask = ""
+ lctl.add_interface(self.net_type, ip, netmask)
+ if self.net_type == 'elan':
+ sys_optimize_elan()
+ if self.port and node_is_router():
+ run_one_acceptor(self.port)
+ self.connect_peer_gateways()
+
+ def connect_peer_gateways(self):
+ for router in self.db.lookup_class('node'):
+ if router.get_val_int('router', 0):
+ for netuuid in router.get_networks():
+ net = self.db.lookup(netuuid)
+ gw = Network(net)
+ if (gw.cluster_id == self.cluster_id and
+ gw.net_type == self.net_type):
+ if gw.nid != self.nid:
+ lctl.connect(gw)
+
+ def disconnect_peer_gateways(self):
+ for router in self.db.lookup_class('node'):
+ if router.get_val_int('router', 0):
+ for netuuid in router.get_networks():
+ net = self.db.lookup(netuuid)
+ gw = Network(net)
+ if (gw.cluster_id == self.cluster_id and
+ gw.net_type == self.net_type):
+ if gw.nid != self.nid:
+ try:
+ lctl.disconnect(gw)
+ except CommandError, e:
+ print "disconnect failed: ", self.name
+ e.dump()
+ cleanup_error(e.rc)
+
+ def safe_to_clean(self):
+ return not net_is_prepared()
def cleanup(self):
self.info(self.net_type, self.nid, self.port)
- ret = self.dom_node.getElementsByTagName('route_tbl')
- for a in ret:
- for r in a.getElementsByTagName('route'):
- lo = get_attr(r, 'lo')
- hi = get_attr(r,'hi', '')
- if self.net_type == 'tcp' and hi == '':
- srv = nid2server(self.dom_node.parentNode.parentNode, lo)
- if not srv:
- panic("no server for nid", lo)
- else:
- try:
- lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
- except CommandError, e:
- print "disconnect failed: ", self.name
- e.dump()
- cleanup_error(e.rc)
+ if self.port:
+ stop_acceptor(self.port)
+ if node_is_router():
+ self.disconnect_peer_gateways()
+ if self.net_type == 'tcp':
+ for hostaddr in self.db.get_hostaddr():
+ ip = string.split(hostaddr, '/')[0]
+ lctl.del_interface(self.net_type, ip)
+
+ def correct_level(self, level, op=None):
+ return level
+
+class RouteTable(Module):
+ def __init__(self,db):
+ Module.__init__(self, 'ROUTES', db)
+
+ def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
+ lo, hi):
+        # only set up connections for tcp, openib, iib, vib and ra NALs
+ srvdb = None
+ if not net_type in ('tcp','openib','iib','vib','ra'):
+ return None
+
+ # connect to target if route is to single node and this node is the gw
+ if lo == hi and local_interface(net_type, gw_cluster_id, gw):
+ if not local_cluster(net_type, tgt_cluster_id):
+ panic("target", lo, " not on the local cluster")
+ srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
+ # connect to gateway if this node is not the gw
+ elif (local_cluster(net_type, gw_cluster_id)
+ and not local_interface(net_type, gw_cluster_id, gw)):
+ srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
+ else:
+ return None
+
+ if not srvdb:
+ panic("no server for nid", lo)
+ return None
+
+ return Network(srvdb)
+
+ def prepare(self):
+ if not config.record and net_is_prepared():
+ return
+ self.info()
+ for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
+ lctl.add_route(net_type, gw, lo, hi)
+ srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
+ if srv:
+ lctl.connect(srv)
+
+ def safe_to_clean(self):
+ return not net_is_prepared()
+
+ def cleanup(self):
+ if net_is_prepared():
+ # the network is still being used, don't clean it up
+ return
+ for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
+ srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
+ if srv:
try:
- lctl.del_route(self.net_type, self.nid, lo, hi)
+ lctl.disconnect(srv)
except CommandError, e:
- print "del_route failed: ", self.name
+ print "disconnect failed: ", self.name
e.dump()
cleanup_error(e.rc)
-
- try:
- lctl.cleanup("RPCDEV", "")
- except CommandError, e:
- print "cleanup failed: ", self.name
- e.dump()
- cleanup_error(e.rc)
- try:
- lctl.disconnectAll(self.net_type)
- except CommandError, e:
- print "disconnectAll failed: ", self.name
- e.dump()
- cleanup_error(e.rc)
- if self.net_type == 'tcp':
- # yikes, this ugly! need to save pid in /var/something
- run("killall acceptor")
-class LDLM(Module):
- def __init__(self,dom_node):
- Module.__init__(self, 'LDLM', dom_node)
- self.add_module('lustre/ldlm', 'ldlm')
+ try:
+ lctl.del_route(net_type, gw, lo, hi)
+ except CommandError, e:
+ print "del_route failed: ", self.name
+ e.dump()
+ cleanup_error(e.rc)
+
+class Management(Module):
+ def __init__(self, db):
+ Module.__init__(self, 'MGMT', db)
+
+ def add_module(self, manager):
+ manager.add_lustre_module('lvfs', 'lvfs')
+ manager.add_lustre_module('obdclass', 'obdclass')
+ manager.add_lustre_module('ptlrpc', 'ptlrpc')
+ manager.add_lustre_module('mgmt', 'mgmt_svc')
+
def prepare(self):
+ if not config.record and is_prepared(self.name):
+ return
self.info()
- lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid),
- setup ="")
+ lctl.newdev("mgmt", self.name, self.uuid)
-class LOV(Module):
- def __init__(self,dom_node):
- Module.__init__(self, 'LOV', dom_node)
- self.mds_uuid = get_first_ref(dom_node, 'mds')
- mds= lookup(dom_node.parentNode, self.mds_uuid)
- self.mds_name = getName(mds)
- devs = dom_node.getElementsByTagName('devices')
- if len(devs) > 0:
- dev_node = devs[0]
- self.stripe_sz = get_attr_int(dev_node, 'stripesize', 65536)
- self.stripe_off = get_attr_int(dev_node, 'stripeoffset', 0)
- self.pattern = get_attr_int(dev_node, 'pattern', 0)
- self.devlist = get_all_refs(dev_node, 'osc')
- self.stripe_cnt = get_attr_int(dev_node, 'stripecount', len(self.devlist))
- self.add_module('lustre/mdc', 'mdc')
- self.add_module('lustre/lov', 'lov')
+ def safe_to_clean(self):
+ return 1
+
+ def cleanup(self):
+ if is_prepared(self.name):
+ Module.cleanup(self)
+
+ def correct_level(self, level, op=None):
+ return level
+
+# This is only needed to load the modules; the LDLM device
+# is now created automatically.
+class LDLM(Module):
+ def __init__(self,db):
+ Module.__init__(self, 'LDLM', db)
+
+ def add_module(self, manager):
+ manager.add_lustre_module('lvfs', 'lvfs')
+ manager.add_lustre_module('obdclass', 'obdclass')
+ manager.add_lustre_module('sec', 'ptlrpcs')
+ manager.add_lustre_module('ptlrpc', 'ptlrpc')
+ manager.add_lustre_module('sec/gss', 'ptlrpcs_gss')
def prepare(self):
- for osc_uuid in self.devlist:
- osc = lookup(self.dom_node.parentNode, osc_uuid)
+ return
+
+ def cleanup(self):
+ return
+
+ def correct_level(self, level, op=None):
+ return level
+
+class LOV(Module):
+ def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
+ Module.__init__(self, 'LOV', db)
+ if name_override != None:
+ self.name = "lov_%s" % name_override
+ self.mds_uuid = self.db.get_first_ref('mds')
+ self.stripe_sz = self.db.get_val_int('stripesize', 1048576)
+ self.stripe_off = self.db.get_val_int('stripeoffset', 0)
+ self.pattern = self.db.get_val_int('stripepattern', 0)
+ self.devlist = self.db.get_lov_tgts('lov_tgt')
+ self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
+ self.osclist = []
+ self.obdlist = []
+ self.desc_uuid = self.uuid
+ self.uuid = generate_client_uuid(self.name)
+ self.fs_name = fs_name
+ if config_only:
+ self.config_only = 1
+ return
+ self.config_only = None
+ mds = self.db.lookup(self.mds_uuid)
+ self.mds_name = mds.getName()
+ for (obd_uuid, index, gen, active) in self.devlist:
+ if obd_uuid == '':
+ continue
+ self.obdlist.append(obd_uuid)
+ obd = self.db.lookup(obd_uuid)
+ osc = get_osc(obd, self.uuid, fs_name)
if osc:
- n = OSC(osc)
- n.prepare()
+ self.osclist.append((osc, index, gen, active))
else:
- panic('osc not found:', osc_uuid)
- mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
+ panic('osc not found:', obd_uuid)
+ def get_uuid(self):
+ return self.uuid
+ def get_name(self):
+ return self.name
+ def prepare(self):
+ if not config.record and is_prepared(self.name):
+ return
self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
- self.stripe_off, self.pattern, self.devlist, self.mds_name)
- lctl.newdev(attach="lov %s %s" % (self.name, self.uuid),
- setup ="%s" % (mdc_uuid))
+ self.stripe_off, self.pattern, self.devlist,
+ self.mds_name)
+ lctl.lov_setup(self.name, self.uuid, self.desc_uuid, self.stripe_cnt,
+ self.stripe_sz, self.stripe_off, self.pattern,
+ string.join(self.obdlist))
+ for (osc, index, gen, active) in self.osclist:
+ target_uuid = osc.target_uuid
+ try:
+ # Only ignore connect failures with --force, which
+ # isn't implemented here yet.
+ osc.active = active
+ osc.prepare(ignore_connect_failure=0)
+ except CommandError, e:
+ print "Error preparing OSC %s\n" % osc.uuid
+ raise e
+ lctl.lov_add_obd(self.name, self.uuid, target_uuid, index, gen)
def cleanup(self):
- for osc_uuid in self.devlist:
- osc = lookup(self.dom_node.parentNode, osc_uuid)
- if osc:
- n = OSC(osc)
- n.cleanup()
+ for (osc, index, gen, active) in self.osclist:
+ target_uuid = osc.target_uuid
+ osc.cleanup()
+ if is_prepared(self.name):
+ Module.cleanup(self)
+ if self.config_only:
+ panic("Can't clean up config_only LOV ", self.name)
+
+ def add_module(self, manager):
+ if self.config_only:
+ panic("Can't load modules for config_only LOV ", self.name)
+ for (osc, index, gen, active) in self.osclist:
+ osc.add_module(manager)
+ break
+ manager.add_lustre_module('lov', 'lov')
+
+ def correct_level(self, level, op=None):
+ return level
+
+class LMV(Module):
+ def __init__(self, db, uuid, fs_name, name_override = None):
+ Module.__init__(self, 'LMV', db)
+ if name_override != None:
+ self.name = "lmv_%s" % name_override
+
+ self.devlist = self.db.get_lmv_tgts('lmv_tgt')
+ if self.devlist == None:
+ self.devlist = self.db.get_refs('mds')
+
+ self.mdclist = []
+ self.desc_uuid = self.uuid
+ self.uuid = uuid
+ self.fs_name = fs_name
+ for mds_uuid in self.devlist:
+ mds = self.db.lookup(mds_uuid)
+ if not mds:
+ panic("MDS not found!")
+ mdc = MDC(mds, self.uuid, fs_name)
+ if mdc:
+ self.mdclist.append(mdc)
else:
- panic('osc not found:', osc_uuid)
- Module.cleanup(self)
- cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
+ panic('mdc not found:', mds_uuid)
+ def prepare(self):
+ if is_prepared(self.name):
+ return
+
+ self.info();
+ for mdc in self.mdclist:
+ try:
+ # Only ignore connect failures with --force, which
+ # isn't implemented here yet.
+ mdc.prepare(ignore_connect_failure=0)
+ except CommandError, e:
+ print "Error preparing LMV %s\n" % mdc.uuid
+ raise e
+
+ lctl.lmv_setup(self.name, self.uuid, self.desc_uuid,
+ string.join(self.devlist))
- def load_module(self):
- for osc_uuid in self.devlist:
- osc = lookup(self.dom_node.parentNode, osc_uuid)
- if osc:
- n = OSC(osc)
- n.load_module()
- break
- else:
- panic('osc not found:', osc_uuid)
- Module.load_module(self)
-
+ def cleanup(self):
+ for mdc in self.mdclist:
+ mdc.cleanup()
+ if is_prepared(self.name):
+ Module.cleanup(self)
- def cleanup_module(self):
- Module.cleanup_module(self)
- for osc_uuid in self.devlist:
- osc = lookup(self.dom_node.parentNode, osc_uuid)
- if osc:
- n = OSC(osc)
- n.cleanup_module()
- break
+ def add_module(self, manager):
+ for mdc in self.mdclist:
+ mdc.add_module(manager)
+ break
+ manager.add_lustre_module('lmv', 'lmv')
+
+ def correct_level(self, level, op=None):
+ return level
+
+class CONFDEV(Module):
+ def __init__(self, db, name, target_uuid, uuid):
+ Module.__init__(self, 'CONFDEV', db)
+ self.devpath = self.db.get_val('devpath','')
+        self.backdevpath = self.db.get_val('backdevpath','')
+ self.size = self.db.get_val_int('devsize', 0)
+ self.journal_size = self.db.get_val_int('journalsize', 0)
+ self.fstype = self.db.get_val('fstype', '')
+ self.backfstype = self.db.get_val('backfstype', '')
+ self.mkfsoptions = self.db.get_val('mkfsoptions', '')
+ self.mountfsoptions = self.db.get_val('mountfsoptions', '')
+ self.target = self.db.lookup(target_uuid)
+ self.name = "conf_%s" % self.target.getName()
+ self.client_uuids = self.target.get_refs('client')
+ self.obdtype = self.db.get_val('obdtype', '')
+
+ self.mds_sec = self.db.get_val('mds_sec', '')
+ self.oss_sec = self.db.get_val('oss_sec', '')
+ self.deny_sec = self.db.get_val('deny_sec', '')
+
+ if config.mds_mds_sec:
+ self.mds_sec = config.mds_mds_sec
+ if config.mds_oss_sec:
+ self.oss_sec = config.mds_oss_sec
+ if config.mds_deny_sec:
+ if self.deny_sec:
+ self.deny_sec = "%s,%s" %(self.deny_sec, config.mds_deny_sec)
+ else:
+ self.deny_sec = config.mds_deny_sec
+
+ if self.obdtype == None:
+ self.obdtype = 'dumb'
+
+ self.conf_name = name
+ self.conf_uuid = uuid
+ self.realdev = self.devpath
+
+ self.lmv = None
+ self.master = None
+
+ lmv_uuid = self.db.get_first_ref('lmv')
+ if lmv_uuid != None:
+ self.lmv = self.db.lookup(lmv_uuid)
+ if self.lmv != None:
+ self.client_uuids = self.lmv.get_refs('client')
+
+ if self.target.get_class() == 'mds':
+ if self.target.get_val('failover', 0):
+ self.failover_mds = 'f'
+ else:
+ self.failover_mds = 'n'
+ self.format = self.db.get_val('autoformat', "no")
+ else:
+ self.format = self.db.get_val('autoformat', "yes")
+ self.osdtype = self.db.get_val('osdtype')
+ ost = self.db.lookup(target_uuid)
+ if ost.get_val('failover', 0):
+ self.failover_ost = 'f'
else:
- panic('osc not found:', osc_uuid)
-
-class LOVConfig(Module):
- def __init__(self,dom_node):
- Module.__init__(self, 'LOVConfig', dom_node)
- self.lov_uuid = get_first_ref(dom_node, 'lov')
- l = lookup(dom_node.parentNode, self.lov_uuid)
- self.lov = LOV(l)
+ self.failover_ost = 'n'
+
+ self.inode_size = self.get_inode_size()
+
+ if self.lmv != None:
+ client_uuid = self.name + "_lmv_UUID"
+ self.master = LMV(self.lmv, client_uuid,
+ self.conf_name, self.conf_name)
+
+ def get_inode_size(self):
+ inode_size = self.db.get_val_int('inodesize', 0)
+ if inode_size == 0 and self.target.get_class() == 'mds':
+
+            # default inode size for the case when neither LOV nor
+            # LMV is accessible.
+            inode_size = 256
+
+ # find the LOV for this MDS
+ lovconfig_uuid = self.target.get_first_ref('lovconfig')
+ if lovconfig_uuid or self.lmv != None:
+ if self.lmv != None:
+ lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
+ lovconfig = self.lmv.lookup(lovconfig_uuid)
+ lov_uuid = lovconfig.get_first_ref('lov')
+ if lov_uuid == None:
+ panic(self.target.getName() + ": No LOV found for lovconfig ",
+ lovconfig.name)
+ else:
+ lovconfig = self.target.lookup(lovconfig_uuid)
+ lov_uuid = lovconfig.get_first_ref('lov')
+ if lov_uuid == None:
+ panic(self.target.getName() + ": No LOV found for lovconfig ",
+ lovconfig.name)
+ if self.lmv != None:
+ lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
+ lovconfig = self.lmv.lookup(lovconfig_uuid)
+ lov_uuid = lovconfig.get_first_ref('lov')
+
+ lov = LOV(self.db.lookup(lov_uuid), lov_uuid, self.name,
+ config_only = 1)
+
+ # default stripe count controls default inode_size
+ if lov.stripe_cnt > 0:
+ stripe_count = lov.stripe_cnt
+ else:
+ stripe_count = len(lov.devlist)
+ if stripe_count > 77:
+ inode_size = 4096
+ elif stripe_count > 35:
+ inode_size = 2048
+ elif stripe_count > 13:
+ inode_size = 1024
+ elif stripe_count > 3:
+ inode_size = 512
+ else:
+ inode_size = 256
+
+ return inode_size
+
+ def get_mount_options(self, blkdev):
+ options = def_mount_options(self.fstype,
+ self.target.get_class())
+
+ if config.mountfsoptions:
+ if options:
+ options = "%s,%s" %(options, config.mountfsoptions)
+ else:
+ options = config.mountfsoptions
+ if self.mountfsoptions:
+ options = "%s,%s" %(options, self.mountfsoptions)
+ else:
+ if self.mountfsoptions:
+ if options:
+ options = "%s,%s" %(options, self.mountfsoptions)
+ else:
+ options = self.mountfsoptions
+
+ if self.fstype == 'smfs':
+ if options:
+ options = "%s,type=%s,dev=%s" %(options, self.backfstype,
+ blkdev)
+ else:
+ options = "type=%s,dev=%s" %(self.backfstype,
+ blkdev)
+ if self.target.get_class() == 'mds':
+ if options:
+ options = "%s,acl,user_xattr,iopen_nopriv" %(options)
+ else:
+ options = "iopen_nopriv"
+
+ return options
+
def prepare(self):
- lov = self.lov
- self.info(lov.mds_uuid, lov.stripe_cnt, lov.stripe_sz, lov.stripe_off,
- lov.pattern, lov.devlist, lov.mds_name)
- lctl.lovconfig(lov.uuid, lov.mds_name, lov.stripe_cnt,
- lov.stripe_sz, lov.stripe_off, lov.pattern,
- string.join(lov.devlist))
+ if is_prepared(self.name):
+ return
+
+ blkdev = block_dev(self.devpath, self.size, self.fstype,
+ config.reformat, self.format, self.journal_size,
+ self.inode_size, self.mkfsoptions, self.backfstype,
+ self.backdevpath)
+
+ if self.fstype == 'smfs':
+ realdev = blkdev
+ else:
+ realdev = blkdev
+
+ mountfsoptions = self.get_mount_options(blkdev)
+
+ self.info(self.target.get_class(), realdev, mountfsoptions,
+ self.fstype, self.size, self.format)
+
+ lctl.newdev("confobd", self.name, self.uuid,
+ setup ="%s %s %s" %(realdev, self.fstype,
+ mountfsoptions))
+
+ self.mountfsoptions = mountfsoptions
+ self.realdev = realdev
+
+ def add_module(self, manager):
+ manager.add_lustre_module('obdclass', 'confobd')
+
+ def write_conf(self):
+ if self.target.get_class() == 'ost':
+ config.record = 1
+ lctl.clear_log(self.name, self.target.getName() + '-conf')
+ lctl.record(self.name, self.target.getName() + '-conf')
+ lctl.newdev(self.osdtype, self.conf_name, self.conf_uuid,
+ setup ="%s %s %s %s" %(self.realdev, self.fstype,
+ self.failover_ost,
+ self.mountfsoptions))
+ lctl.end_record()
+ lctl.clear_log(self.name, 'OSS-conf')
+ lctl.record(self.name, 'OSS-conf')
+ lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
+ lctl.end_record()
+ config.record = 0
+ return
+
+ if self.target.get_class() == 'mds':
+ if self.master != None:
+ master_name = self.master.name
+ else:
+ master_name = 'dumb'
+
+ config.record = 1
+ lctl.clear_log(self.name, self.target.getName() + '-conf')
+ lctl.record(self.name, self.target.getName() + '-conf')
+ lctl.attach("mds", self.conf_name, self.conf_uuid)
+ if self.mds_sec:
+ lctl.set_security(self.conf_name, "mds_sec", self.mds_sec)
+ if self.oss_sec:
+ lctl.set_security(self.conf_name, "oss_sec", self.oss_sec)
+ if self.deny_sec:
+ for flavor in string.split(self.deny_sec, ','):
+ lctl.set_security(self.conf_name, "deny_sec", flavor)
+ lctl.newdev("mds", self.conf_name, self.conf_uuid,
+ setup ="%s %s %s %s %s %s" %(self.realdev, self.fstype,
+ self.conf_name, self.mountfsoptions,
+ master_name, self.obdtype))
+ lctl.end_record()
+ config.record = 0
+
+ if not self.client_uuids:
+ return 0
+
+ for uuid in self.client_uuids:
+ log("recording client:", uuid)
+ client_uuid = generate_client_uuid(self.name)
+ client = VOSC(self.db.lookup(uuid), client_uuid,
+ self.target.getName(), self.name)
+ config.record = 1
+ lctl.clear_log(self.name, self.target.getName())
+ lctl.record(self.name, self.target.getName())
+ client.prepare()
+ lctl.mount_option(self.target.getName(), client.get_name(), "")
+ lctl.end_record()
+
+ config.cleanup = 1
+ lctl.clear_log(self.name, self.target.getName() + '-clean')
+ lctl.record(self.name, self.target.getName() + '-clean')
+ client.cleanup()
+ lctl.del_mount_option(self.target.getName())
+ lctl.end_record()
+ config.cleanup = 0
+ config.record = 0
+
+ if config.record:
+ return
+
+ # record logs for each client
+ if config.ldapurl:
+ config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
+ else:
+ config_options = CONFIG_FILE
+
+ for node_db in self.db.lookup_class('node'):
+ client_name = node_db.getName()
+ for prof_uuid in node_db.get_refs('profile'):
+ prof_db = node_db.lookup(prof_uuid)
+            # refactor this into a function to test "clientness"
+ # of a node.
+ for ref_class, ref_uuid in prof_db.get_all_refs():
+ if ref_class in ('mountpoint','echoclient'):
+ debug("recording", client_name)
+ old_noexec = config.noexec
+ config.noexec = 0
+ noexec_opt = ('', '-n')
+ ret, out = run (sys.argv[0],
+ noexec_opt[old_noexec == 1],
+ " -v --record --nomod",
+ "--record_log", client_name,
+ "--record_device", self.name,
+ "--node", client_name,
+ config_options)
+ if config.verbose:
+ for s in out: log("record> ", string.strip(s))
+ ret, out = run (sys.argv[0],
+ noexec_opt[old_noexec == 1],
+ "--cleanup -v --record --nomod",
+ "--record_log", client_name + "-clean",
+ "--record_device", self.name,
+ "--node", client_name,
+ config_options)
+ if config.verbose:
+ for s in out: log("record> ", string.strip(s))
+ config.noexec = old_noexec
+
+ def start(self):
+ try:
+ lctl.start(self.name, self.conf_name)
+ except CommandError, e:
+ raise e
+ if self.target.get_class() == 'ost':
+ if not is_prepared('OSS'):
+ try:
+ lctl.start(self.name, 'OSS')
+ except CommandError, e:
+ raise e
def cleanup(self):
- #nothing to do here
- pass
-
-
-class MDS(Module):
- def __init__(self,dom_node):
- Module.__init__(self, 'MDS', dom_node)
- self.devname, self.size = get_device(dom_node)
- self.fstype = get_text(dom_node, 'fstype')
- self.format = get_text(dom_node, 'autoformat', "no")
- if self.fstype == 'extN':
- self.add_module('lustre/extN', 'extN')
- self.add_module('lustre/mds', 'mds')
- self.add_module('lustre/mds', 'mds_%s' % (self.fstype))
+ if is_prepared(self.name):
+ try:
+ lctl.cleanup(self.name, self.uuid, 0, 0)
+ clean_dev(self.devpath, self.fstype,
+ self.backfstype, self.backdevpath)
+ except CommandError, e:
+ log(self.module_name, "cleanup failed: ", self.name)
+ e.dump()
+ cleanup_error(e.rc)
+ Module.cleanup(self)
+
+class MDSDEV(Module):
+ def __init__(self,db):
+ Module.__init__(self, 'MDSDEV', db)
+ self.devpath = self.db.get_val('devpath','')
+ self.backdevpath = self.db.get_val('devpath','')
+ self.size = self.db.get_val_int('devsize', 0)
+ self.journal_size = self.db.get_val_int('journalsize', 0)
+ self.fstype = self.db.get_val('fstype', '')
+ self.backfstype = self.db.get_val('backfstype', '')
+ self.nspath = self.db.get_val('nspath', '')
+ self.mkfsoptions = self.db.get_val('mkfsoptions', '')
+ self.mountfsoptions = self.db.get_val('mountfsoptions', '')
+ self.obdtype = self.db.get_val('obdtype', '')
+ self.root_squash = self.db.get_val('root_squash', '')
+ self.no_root_squash = self.db.get_val('no_root_squash', '')
+
+ target_uuid = self.db.get_first_ref('target')
+ self.target = self.db.lookup(target_uuid)
+ self.name = self.target.getName()
+ self.master = None
+ self.lmv = None
+
+ lmv_uuid = self.db.get_first_ref('lmv')
+ if lmv_uuid != None:
+ self.lmv = self.db.lookup(lmv_uuid)
+
+ active_uuid = get_active_target(self.target)
+ if not active_uuid:
+ panic("No target device found:", target_uuid)
+ if active_uuid == self.uuid:
+ self.active = 1
+ group = self.target.get_val('group')
+ if config.group and config.group != group:
+ self.active = 0
+ else:
+ self.active = 0
+
+ self.uuid = target_uuid
+
+ # setup LMV
+ if self.lmv != None:
+ client_uuid = self.name + "_lmv_UUID"
+ self.master = LMV(self.lmv, client_uuid,
+ self.name, self.name)
+
+ self.confobd = CONFDEV(self.db, self.name,
+ target_uuid, self.uuid)
+
+ def add_module(self, manager):
+ if self.active:
+ manager.add_lustre_module('mdc', 'mdc')
+ manager.add_lustre_module('osc', 'osc')
+ manager.add_lustre_module('ost', 'ost')
+ manager.add_lustre_module('lov', 'lov')
+ manager.add_lustre_module('mds', 'mds')
+
+ if self.fstype == 'smfs' or self.fstype == 'ldiskfs':
+ manager.add_lustre_module(self.fstype, self.fstype)
+
+ if self.fstype:
+ manager.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
+ # if fstype is smfs, then we should also take care about backing
+ # store fs.
+ if self.fstype == 'smfs':
+ manager.add_lustre_module(self.backfstype, self.backfstype)
+ manager.add_lustre_module('lvfs', 'fsfilt_%s' % (self.backfstype))
+
+ for option in string.split(self.mountfsoptions, ','):
+ if option == 'snap':
+ if not self.fstype == 'smfs':
+ panic("mountoptions has 'snap', but fstype is not smfs.")
+ manager.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.fstype))
+ manager.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.backfstype))
+
+ # add LMV modules
+ if self.master != None:
+ self.master.add_module(manager)
+
+ # add CONFOBD modules
+ if self.confobd != None:
+ self.confobd.add_module(manager)
+
+ def write_conf(self):
+ if is_prepared(self.name):
+ return
+ if not self.active:
+ debug(self.uuid, "not active")
+ return
+ run_acceptors()
+ self.confobd.prepare()
+ self.confobd.write_conf()
+ self.confobd.cleanup()
+
def prepare(self):
- self.info(self.devname, self.fstype, self.format)
- blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
- lctl.newdev(attach="mds %s %s" % (self.name, self.uuid),
- setup ="%s %s" %(blkdev, self.fstype))
+ if is_prepared(self.name):
+ return
+ if not self.active:
+ debug(self.uuid, "not active")
+ return
+ run_acceptors()
+
+ self.confobd.prepare()
+ if config.reformat:
+ self.confobd.write_conf()
+
+ # prepare LMV
+ if self.master != None:
+ self.master.prepare()
+
+ if not config.record:
+ self.confobd.start()
+
+ if not is_prepared('MDT'):
+ lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
+
+ if development_mode():
+ procentry = "/proc/fs/lustre/mds/lsd_upcall"
+ upcall = os.path.abspath(os.path.dirname(sys.argv[0]) + "/lsd_upcall")
+ if not (os.access(procentry, os.R_OK) and os.access(upcall, os.R_OK)):
+ print "MDS Warning: failed to set lsd cache upcall"
+ else:
+ run("echo ", upcall, " > ", procentry)
+
+ if config.root_squash == None:
+ config.root_squash = self.root_squash
+ if config.no_root_squash == None:
+ config.no_root_squash = self.no_root_squash
+ if config.root_squash:
+ if config.no_root_squash:
+ nsnid = config.no_root_squash
+ else:
+ nsnid = "0"
+ lctl.root_squash(self.name, config.root_squash, nsnid)
+
+ def msd_remaining(self):
+ out = lctl.device_list()
+ for s in out:
+ if string.split(s)[2] in ('mds',):
+ return 1
+
+ def safe_to_clean(self):
+ return self.active
+
+ def safe_to_clean_modules(self):
+ return not self.msd_remaining()
+
def cleanup(self):
- Module.cleanup(self)
- clean_loop(self.devname)
-
-# Very unusual case, as there is no MDC element in the XML anymore
-# Builds itself from an MDS node
-class MDC(Module):
- def __init__(self,dom_node):
- self.mds = MDS(dom_node)
- self.dom_node = dom_node
- self.module_name = 'MDC'
- self.kmodule_list = []
- self._server = None
- self._connected = 0
+ if not self.active:
+ debug(self.uuid, "not active")
+ return
+ self.info()
+ if is_prepared(self.name):
+ try:
+ lctl.cleanup(self.name, self.uuid, config.force,
+ config.failover)
+ except CommandError, e:
+ log(self.module_name, "cleanup failed: ", self.name)
+ e.dump()
+ cleanup_error(e.rc)
+ Module.cleanup(self)
+ # cleanup LMV
+ if self.master != None:
+ self.master.cleanup()
+ if not self.msd_remaining() and is_prepared('MDT'):
+ try:
+ lctl.cleanup("MDT", "MDT_UUID", config.force,
+ config.failover)
+ except CommandError, e:
+ print "cleanup failed: ", self.name
+ e.dump()
+ cleanup_error(e.rc)
+
+ if self.confobd:
+ self.confobd.cleanup()
+
+ def correct_level(self, level, op=None):
+ #if self.master != None:
+ # level = level + 2
+ return level
+
+class OSD(Module):
+ def __init__(self, db):
+ Module.__init__(self, 'OSD', db)
+ self.osdtype = self.db.get_val('osdtype')
+ self.devpath = self.db.get_val('devpath', '')
+ self.backdevpath = self.db.get_val('devpath', '')
+ self.size = self.db.get_val_int('devsize', 0)
+ self.journal_size = self.db.get_val_int('journalsize', 0)
+ self.inode_size = self.db.get_val_int('inodesize', 0)
+ self.mkfsoptions = self.db.get_val('mkfsoptions', '')
+ self.mountfsoptions = self.db.get_val('mountfsoptions', '')
+ self.fstype = self.db.get_val('fstype', '')
+ self.backfstype = self.db.get_val('backfstype', '')
+ self.nspath = self.db.get_val('nspath', '')
+ target_uuid = self.db.get_first_ref('target')
+ ost = self.db.lookup(target_uuid)
+ self.name = ost.getName()
+ self.format = self.db.get_val('autoformat', 'yes')
+ if ost.get_val('failover', 0):
+ self.failover_ost = 'f'
+ else:
+ self.failover_ost = 'n'
- host = socket.gethostname()
- self.name = 'MDC_'+host
- self.uuid = self.name+'_UUID'
+ self.deny_sec = self.db.get_val('deny_sec', '')
- self.lookup_server(self.mds.uuid)
- self.add_module('lustre/mdc', 'mdc')
+ if config.ost_deny_sec:
+ if self.deny_sec:
+ self.deny_sec = "%s,%s" %(self.deny_sec, config.ost_deny_sec)
+ else:
+ self.deny_sec = config.ost_deny_sec
+
+ active_uuid = get_active_target(ost)
+ if not active_uuid:
+ panic("No target device found:", target_uuid)
+ if active_uuid == self.uuid:
+ self.active = 1
+ group = ost.get_val('group')
+ if config.group and config.group != group:
+ self.active = 0
+ else:
+ self.active = 0
- def prepare(self):
- self.info(self.mds.uuid)
- srv = self.get_server()
- lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
- lctl.newdev(attach="mdc %s %s" % (self.name, self.uuid),
- setup ="%s %s" %(self.mds.uuid, srv.uuid))
+ self.uuid = target_uuid
+ self.confobd = CONFDEV(self.db, self.name,
+ target_uuid, self.uuid)
+
+ def add_module(self, manager):
+ if not self.active:
+ return
+ manager.add_lustre_module('ost', 'ost')
-class OBD(Module):
- def __init__(self, dom_node):
- Module.__init__(self, 'OBD', dom_node)
- self.obdtype = get_attr(dom_node, 'type')
- self.devname, self.size = get_device(dom_node)
- self.fstype = get_text(dom_node, 'fstype')
- self.format = get_text(dom_node, 'autoformat', 'yes')
- if self.fstype == 'extN':
- self.add_module('lustre/extN', 'extN')
- self.add_module('lustre/' + self.obdtype, self.obdtype)
-
- # need to check /proc/mounts and /etc/mtab before
- # formatting anything.
- # FIXME: check if device is already formatted.
+ if self.fstype == 'smfs' or self.fstype == 'ldiskfs':
+ manager.add_lustre_module(self.fstype, self.fstype)
+
+ if self.fstype:
+ manager.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
+
+ if self.fstype == 'smfs':
+ manager.add_lustre_module(self.backfstype, self.backfstype)
+ manager.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.backfstype))
+
+        for option in string.split(self.mountfsoptions, ','):
+ if option == 'snap':
+ if not self.fstype == 'smfs':
+ panic("mountoptions with snap, but fstype is not smfs\n")
+ manager.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.fstype))
+ manager.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.backfstype))
+
+ manager.add_lustre_module(self.osdtype, self.osdtype)
+
+ # add CONFOBD modules
+ if self.confobd != None:
+ self.confobd.add_module(manager)
+
def prepare(self):
- self.info(self.obdtype, self.devname, self.size, self.fstype, self.format)
- if self.obdtype == 'obdecho':
- blkdev = ''
+ if is_prepared(self.name):
+ return
+ if not self.active:
+ debug(self.uuid, "not active")
+ return
+
+ run_acceptors()
+
+ if self.osdtype == 'obdecho':
+ self.info(self.osdtype)
+ lctl.newdev("obdecho", self.name, self.uuid)
+ if not is_prepared('OSS'):
+ lctl.newdev("ost", 'OSS', 'OSS_UUID', setup="")
+ else:
+ self.confobd.prepare()
+ if config.reformat:
+ self.confobd.write_conf()
+ if not config.record:
+ self.confobd.start()
+
+ if self.deny_sec:
+ for flavor in string.split(self.deny_sec, ','):
+ lctl.set_security(self.name, "deny_sec", flavor)
+
+ def write_conf(self):
+ if is_prepared(self.name):
+ return
+ if not self.active:
+ debug(self.uuid, "not active")
+ return
+
+ run_acceptors()
+ if self.osdtype != 'obdecho':
+ self.confobd.prepare()
+ self.confobd.write_conf()
+ if not config.write_conf:
+ self.confobd.start()
+ self.confobd.cleanup()
+
+ def osd_remaining(self):
+ out = lctl.device_list()
+ for s in out:
+ if string.split(s)[2] in ('obdfilter', 'obdecho'):
+ return 1
+
+ def safe_to_clean(self):
+ return self.active
+
+ def safe_to_clean_modules(self):
+ return not self.osd_remaining()
+
+ def cleanup(self):
+ if not self.active:
+ debug(self.uuid, "not active")
+ return
+
+ if is_prepared(self.name):
+ self.info()
+ try:
+ lctl.cleanup(self.name, self.uuid, config.force,
+ config.failover)
+ except CommandError, e:
+ log(self.module_name, "cleanup failed: ", self.name)
+ e.dump()
+ cleanup_error(e.rc)
+ if not self.osd_remaining() and is_prepared('OSS'):
+ try:
+ lctl.cleanup("OSS", "OSS_UUID", config.force,
+ config.failover)
+ except CommandError, e:
+ print "cleanup failed: ", self.name
+ e.dump()
+ cleanup_error(e.rc)
+
+ if self.osdtype != 'obdecho':
+ if self.confobd:
+ self.confobd.cleanup()
+
+ def correct_level(self, level, op=None):
+ return level
+
+# Generic client module, used by OSC and MDC
+class Client(Module):
+ def __init__(self, tgtdb, uuid, module, fs_name,
+ self_name=None, module_dir=None):
+ self.target_name = tgtdb.getName()
+ self.target_uuid = tgtdb.getUUID()
+ self.module_dir = module_dir
+ self.backup_targets = []
+ self.module = module
+ self.db = tgtdb
+
+ self.tgt_dev_uuid = get_active_target(tgtdb)
+ if not self.tgt_dev_uuid:
+ panic("No target device found for target(1):", self.target_name)
+
+ self._server = None
+ self._connected = 0
+
+ self.module = module
+ self.module_name = string.upper(module)
+ if not self_name:
+ self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
+ self.target_name, fs_name)
else:
- blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
- lctl.newdev(attach="%s %s %s" % (self.obdtype, self.name, self.uuid),
- setup ="%s %s" %(blkdev, self.fstype))
+ self.name = self_name
+ self.uuid = uuid
+ self.lookup_server(self.tgt_dev_uuid)
+ self.lookup_backup_targets()
+ self.fs_name = fs_name
+ if not self.module_dir:
+ self.module_dir = module
+
+ def add_module(self, manager):
+ manager.add_lustre_module(self.module_dir, self.module)
+
+ def lookup_server(self, srv_uuid):
+ """ Lookup a server's network information """
+ self._server_nets = get_ost_net(self.db, srv_uuid)
+ if len(self._server_nets) == 0:
+ panic ("Unable to find a server for:", srv_uuid)
+
+ def get_name(self):
+ return self.name
+
+ def get_servers(self):
+ return self._server_nets
+
+ def lookup_backup_targets(self):
+ """ Lookup alternative network information """
+ prof_list = toplustreDB.get_refs('profile')
+ for prof_uuid in prof_list:
+ prof_db = toplustreDB.lookup(prof_uuid)
+ if not prof_db:
+ panic("profile:", prof_uuid, "not found.")
+ for ref_class, ref_uuid in prof_db.get_all_refs():
+ if ref_class in ('osd', 'mdsdev'):
+ devdb = toplustreDB.lookup(ref_uuid)
+ uuid = devdb.get_first_ref('target')
+ if self.target_uuid == uuid and self.tgt_dev_uuid != ref_uuid:
+ self.backup_targets.append(ref_uuid)
+
+ def prepare(self, ignore_connect_failure = 0):
+ self.info(self.target_uuid)
+ if not config.record and is_prepared(self.name):
+ self.cleanup()
+ try:
+ srv = choose_local_server(self.get_servers())
+ if srv:
+ lctl.connect(srv)
+ else:
+ routes = find_route(self.get_servers())
+ if len(routes) == 0:
+ panic ("no route to", self.target_uuid)
+ for (srv, r) in routes:
+ lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
+ except CommandError, e:
+ if not ignore_connect_failure:
+ raise e
+
+ if srv:
+ if self.target_uuid in config.inactive and self.permits_inactive():
+ debug("%s inactive" % self.target_uuid)
+ inactive_p = "inactive"
+ else:
+ debug("%s active" % self.target_uuid)
+ inactive_p = ""
+ lctl.newdev(self.module, self.name, self.uuid,
+ setup ="%s %s %s" % (self.target_uuid, srv.nid_uuid,
+ inactive_p))
+ for tgt_dev_uuid in self.backup_targets:
+ this_nets = get_ost_net(toplustreDB, tgt_dev_uuid)
+ if len(this_nets) == 0:
+ panic ("Unable to find a server for:", tgt_dev_uuid)
+ srv = choose_local_server(this_nets)
+ if srv:
+ lctl.connect(srv)
+ else:
+ routes = find_route(this_nets);
+ if len(routes) == 0:
+ panic("no route to", tgt_dev_uuid)
+ for (srv, r) in routes:
+                        lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
+ if srv:
+ lctl.add_conn(self.name, srv.nid_uuid);
+
def cleanup(self):
- Module.cleanup(self)
- if not self.obdtype == 'obdecho':
- clean_loop(self.devname)
+ if is_prepared(self.name):
+ Module.cleanup(self)
+ try:
+ srv = choose_local_server(self.get_servers())
+ if srv:
+ lctl.disconnect(srv)
+ else:
+ for (srv, r) in find_route(self.get_servers()):
+ lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
+ except CommandError, e:
+ log(self.module_name, "cleanup failed: ", self.name)
+ e.dump()
+ cleanup_error(e.rc)
+
+ for tgt_dev_uuid in self.backup_targets:
+ this_net = get_ost_net(toplustreDB, tgt_dev_uuid)
+ srv = choose_local_server(this_net)
+ if srv:
+ lctl.disconnect(srv)
+ else:
+ for (srv, r) in find_route(this_net):
+                    lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
+
+ def correct_level(self, level, op=None):
+ return level
+
+ def deactivate(self):
+ try:
+ lctl.deactivate(self.name)
+ except CommandError, e:
+ log(self.module_name, "deactivate failed: ", self.name)
+ e.dump()
+ cleanup_error(e.rc)
+
+class MDC(Client):
+ def __init__(self, db, uuid, fs_name):
+ Client.__init__(self, db, uuid, 'mdc', fs_name)
-class OST(Module):
- def __init__(self,dom_node):
- Module.__init__(self, 'OST', dom_node)
- self.obd_uuid = get_first_ref(dom_node, 'obd')
- self.add_module('lustre/ost', 'ost')
+ def permits_inactive(self):
+ return 0
+
+class OSC(Client):
+ def __init__(self, db, uuid, fs_name):
+ Client.__init__(self, db, uuid, 'osc', fs_name)
+
+ def permits_inactive(self):
+ return 1
+
+class CMOBD(Module):
+ def __init__(self, db):
+ Module.__init__(self, 'CMOBD', db)
+ self.name = self.db.getName();
+ self.uuid = generate_client_uuid(self.name)
+ self.master_uuid = self.db.get_first_ref('masterobd')
+ self.cache_uuid = self.db.get_first_ref('cacheobd')
+
+ master_obd = self.db.lookup(self.master_uuid)
+ if not master_obd:
+ panic('master obd not found:', self.master_uuid)
+
+ cache_obd = self.db.lookup(self.cache_uuid)
+ if not cache_obd:
+ panic('cache obd not found:', self.cache_uuid)
+
+ self.master = None
+ self.cache = None
+
+ master_class = master_obd.get_class()
+ cache_class = cache_obd.get_class()
+
+ if master_class == 'ost' or master_class == 'lov':
+ client_uuid = "%s_lov_master_UUID" % (self.name)
+ self.master = LOV(master_obd, client_uuid, self.name);
+ elif master_class == 'mds':
+ self.master = get_mdc(db, self.name, self.master_uuid)
+ elif master_class == 'lmv':
+ #tmp fix: cobd and cmobd will use same uuid, so use const name here
+ client_uuid = "%s_lmv_master_UUID" % "master"
+ self.master = LMV(master_obd, client_uuid, self.name);
+ else:
+ panic("unknown master obd class '%s'" %(master_class))
+
+ if cache_class == 'ost' or cache_class == 'lov':
+ client_uuid = "%s_lov_cache_UUID" % (self.name)
+ self.cache = LOV(cache_obd, client_uuid, self.name);
+ elif cache_class == 'mds':
+ self.cache = get_mdc(db, self.name, self.cache_uuid)
+ elif cache_class == 'lmv':
+ client_uuid = "%s_lmv_cache_UUID" % (self.name)
+ self.cache = LMV(cache_obd, client_uuid, self.name);
+ else:
+ panic("unknown cache obd class '%s'" %(cache_class))
def prepare(self):
- self.info(self.obd_uuid)
- lctl.newdev(attach="ost %s %s" % (self.name, self.uuid),
- setup ="%s" % (self.obd_uuid))
+ self.master.prepare()
+ if not config.record and is_prepared(self.name):
+ return
+ self.info(self.master_uuid, self.cache_uuid)
+ lctl.newdev("cmobd", self.name, self.uuid,
+ setup ="%s %s" %(self.master.uuid,
+ self.cache.uuid))
+
+ def get_uuid(self):
+ return self.uuid
+
+ def get_name(self):
+ return self.name
+
+ def get_master_name(self):
+ return self.master.name
+
+ def get_cache_name(self):
+ return self.cache.name
+ def cleanup(self):
+ if is_prepared(self.name):
+ Module.cleanup(self)
+ if self.master:
+ self.master.cleanup()
+
+ def add_module(self, manager):
+ manager.add_lustre_module('smfs', 'smfs')
+ manager.add_lustre_module('cmobd', 'cmobd')
+ self.master.add_module(manager)
+
+ def correct_level(self, level, op=None):
+ return level
+
+class COBD(Module):
+ def __init__(self, db, uuid, name):
+ Module.__init__(self, 'COBD', db)
+ self.name = self.db.getName();
+ self.uuid = generate_client_uuid(self.name)
+ self.master_uuid = self.db.get_first_ref('masterobd')
+ self.cache_uuid = self.db.get_first_ref('cacheobd')
+
+ master_obd = self.db.lookup(self.master_uuid)
+ if not master_obd:
+ panic('master obd not found:', self.master_uuid)
+
+ cache_obd = self.db.lookup(self.cache_uuid)
+ if not cache_obd:
+ panic('cache obd not found:', self.cache_uuid)
+
+ self.master = None
+ self.cache = None
+
+ master_class = master_obd.get_class()
+ cache_class = cache_obd.get_class()
+
+ if master_class == 'ost' or master_class == 'lov':
+ client_uuid = "%s_lov_master_UUID" % (self.name)
+ self.master = LOV(master_obd, client_uuid, name);
+ elif master_class == 'mds':
+ self.master = get_mdc(db, name, self.master_uuid)
+ elif master_class == 'lmv':
+ #tmp fix: cobd and cmobd will use same uuid, so use const name here
+ client_uuid = "%s_lmv_master_UUID" % "master"
+ self.master = LMV(master_obd, client_uuid, self.name);
+ else:
+ panic("unknown master obd class '%s'" %(master_class))
+
+ if cache_class == 'ost' or cache_class == 'lov':
+ client_uuid = "%s_lov_cache_UUID" % (self.name)
+ self.cache = LOV(cache_obd, client_uuid, name);
+ elif cache_class == 'mds':
+ self.cache = get_mdc(db, name, self.cache_uuid)
+ elif cache_class == 'lmv':
+ client_uuid = "%s_lmv_cache_UUID" % "cache"
+ self.cache = LMV(cache_obd, client_uuid, self.name);
+ else:
+ panic("unknown cache obd class '%s'" %(cache_class))
+
+ def get_uuid(self):
+ return self.uuid
+
+ def get_name(self):
+ return self.name
+
+ def get_master_name(self):
+ return self.master.name
+
+ def get_cache_name(self):
+ return self.cache.name
+
+ def prepare(self):
+ if not config.record and is_prepared(self.name):
+ return
+ self.master.prepare()
+ self.cache.prepare()
+ self.info(self.master_uuid, self.cache_uuid)
+ lctl.newdev("cobd", self.name, self.uuid,
+ setup ="%s %s" %(self.master.name,
+ self.cache.name))
+
+ def cleanup(self):
+ if is_prepared(self.name):
+ Module.cleanup(self)
+ self.master.cleanup()
+ self.cache.cleanup()
+
+ def add_module(self, manager):
+ manager.add_lustre_module('cobd', 'cobd')
+ self.master.add_module(manager)
# virtual interface for OSC and LOV
class VOSC(Module):
- def __init__(self,dom_node):
- Module.__init__(self, 'VOSC', dom_node)
- if dom_node.nodeName == 'lov':
- self.osc = LOV(dom_node)
+ def __init__(self, db, client_uuid, name, name_override = None):
+ Module.__init__(self, 'VOSC', db)
+ if db.get_class() == 'lov':
+ self.osc = LOV(db, client_uuid, name, name_override)
+ self.type = 'lov'
+ elif db.get_class() == 'cobd':
+ self.osc = COBD(db, client_uuid, name)
+ self.type = 'cobd'
else:
- self.osc = OSC(dom_node)
+ self.osc = OSC(db, client_uuid, name)
+ self.type = 'osc'
+
+ def get_uuid(self):
+ return self.osc.get_uuid()
+
+ def get_name(self):
+ return self.osc.get_name()
+
def prepare(self):
self.osc.prepare()
+
def cleanup(self):
self.osc.cleanup()
- def load_module(self):
- self.osc.load_module()
- def cleanup_module(self):
- self.osc.cleanup_module()
-
+
+ def add_module(self, manager):
+ self.osc.add_module(manager)
+
+ def correct_level(self, level, op=None):
+ return self.osc.correct_level(level, op)
+
+# virtual interface for MDC and LMV
+class VMDC(Module):
+ def __init__(self, db, client_uuid, name, name_override = None):
+ Module.__init__(self, 'VMDC', db)
+ if db.get_class() == 'lmv':
+ self.mdc = LMV(db, client_uuid, name, name_override)
+ elif db.get_class() == 'cobd':
+ self.mdc = COBD(db, client_uuid, name)
+ else:
+ self.mdc = MDC(db, client_uuid, name)
+
+ def get_uuid(self):
+ return self.mdc.uuid
-class OSC(Module):
- def __init__(self,dom_node):
- Module.__init__(self, 'OSC', dom_node)
- self.obd_uuid = get_first_ref(dom_node, 'obd')
- self.ost_uuid = get_first_ref(dom_node, 'ost')
- self.lookup_server(self.ost_uuid)
- self.add_module('lustre/osc', 'osc')
+ def get_name(self):
+ return self.mdc.name
def prepare(self):
- self.info(self.obd_uuid, self.ost_uuid)
- srv = self.get_server()
- if local_net(srv):
- lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
- else:
- r = find_route(srv)
- if r:
- lctl.add_route_host(r[0], srv.uuid, r[1], r[2])
- else:
- panic ("no route to", srv.nid)
-
- lctl.newdev(attach="osc %s %s" % (self.name, self.uuid),
- setup ="%s %s" %(self.obd_uuid, srv.uuid))
+ self.mdc.prepare()
+
+ def cleanup(self):
+ self.mdc.cleanup()
+
+ def add_module(self, manager):
+ self.mdc.add_module(manager)
+
+ def correct_level(self, level, op=None):
+ return self.mdc.correct_level(level, op)
+
+class ECHO_CLIENT(Module):
+ def __init__(self,db):
+ Module.__init__(self, 'ECHO_CLIENT', db)
+ self.obd_uuid = self.db.get_first_ref('obd')
+ obd = self.db.lookup(self.obd_uuid)
+ self.uuid = generate_client_uuid(self.name)
+ self.osc = VOSC(obd, self.uuid, self.name)
+
+ def prepare(self):
+ if not config.record and is_prepared(self.name):
+ return
+ run_acceptors()
+ self.osc.prepare() # XXX This is so cheating. -p
+ self.info(self.obd_uuid)
+
+ lctl.newdev("echo_client", self.name, self.uuid,
+ setup = self.osc.get_name())
def cleanup(self):
- srv = self.get_server()
- if local_net(srv):
- Module.cleanup(self)
- else:
- self.info(self.obd_uuid, self.ost_uuid)
- r = find_route(srv)
- if r:
- try:
- lctl.del_route_host(r[0], srv.uuid, r[1], r[2])
- except CommandError, e:
- print "del_route failed: ", self.name
- e.dump()
- cleanup_error(e.rc)
+ if is_prepared(self.name):
Module.cleanup(self)
-
+ self.osc.cleanup()
-class Mountpoint(Module):
- def __init__(self,dom_node):
- Module.__init__(self, 'MTPT', dom_node)
- self.path = get_text(dom_node, 'path')
- self.mds_uuid = get_first_ref(dom_node, 'mds')
- self.lov_uuid = get_first_ref(dom_node, 'osc')
- self.add_module('lustre/mdc', 'mdc')
- self.add_module('lustre/llite', 'llite')
- l = lookup(self.dom_node.parentNode, self.lov_uuid)
- self.osc = VOSC(l)
+ def add_module(self, manager):
+ self.osc.add_module(manager)
+ manager.add_lustre_module('obdecho', 'obdecho')
+
+ def correct_level(self, level, op=None):
+ return level
+
+def generate_client_uuid(name):
+ client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
+ name,
+ int(random.random() * 1048576),
+ int(random.random() * 1048576))
+ return client_uuid[:36]
+class Mountpoint(Module):
+ def __init__(self,db):
+ Module.__init__(self, 'MTPT', db)
+ self.path = self.db.get_val('path')
+ self.clientoptions = self.db.get_val('clientoptions', '')
+ self.fs_uuid = self.db.get_first_ref('filesystem')
+ fs = self.db.lookup(self.fs_uuid)
+ self.mds_uuid = fs.get_first_ref('lmv')
+ if not self.mds_uuid:
+ self.mds_uuid = fs.get_first_ref('mds')
+ self.obd_uuid = fs.get_first_ref('obd')
+ client_uuid = generate_client_uuid(self.name)
+
+ self.oss_sec = self.db.get_val('oss_sec','null')
+ self.mds_sec = self.db.get_val('mds_sec','null')
+ if config.mds_sec:
+ self.mds_sec = config.mds_sec
+ if config.oss_sec:
+ self.oss_sec = config.oss_sec
+
+ ost = self.db.lookup(self.obd_uuid)
+ if not ost:
+ panic("no ost: ", self.obd_uuid)
+
+ mds = self.db.lookup(self.mds_uuid)
+ if not mds:
+ panic("no mds: ", self.mds_uuid)
+
+ self.vosc = VOSC(ost, client_uuid, self.name, self.name)
+ self.vmdc = VMDC(mds, client_uuid, self.name, self.name)
+
def prepare(self):
- self.osc.prepare()
- mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
+ if not config.record and fs_is_mounted(self.path):
+ log(self.path, "already mounted.")
+ return
+ run_acceptors()
+
+ self.vosc.prepare()
+ self.vmdc.prepare()
+
+ self.info(self.path, self.mds_uuid, self.obd_uuid)
+ if config.record or config.lctl_dump:
+ lctl.mount_option(local_node_name, self.vosc.get_name(),
+ self.vmdc.get_name())
+ return
- self.info(self.path, self.mds_uuid,self.lov_uuid)
- cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \
- (self.lov_uuid, mdc_uuid, self.path)
+ if config.clientoptions:
+ if self.clientoptions:
+ self.clientoptions = self.clientoptions + ',' + config.clientoptions
+ else:
+ self.clientoptions = config.clientoptions
+ if self.clientoptions:
+ self.clientoptions = ',' + self.clientoptions
+ # Linux kernel will deal with async and not pass it to ll_fill_super,
+ # so replace it with Lustre async
+ self.clientoptions = string.replace(self.clientoptions, "async", "lasync")
+
+ cmd = "mount -t lustre_lite -o osc=%s,mdc=%s,mds_sec=%s,oss_sec=%s%s %s %s" % \
+ (self.vosc.get_name(), self.vmdc.get_name(), self.mds_sec,
+ self.oss_sec, self.clientoptions, config.config, self.path)
run("mkdir", self.path)
ret, val = run(cmd)
if ret:
- panic("mount failed:", self.path)
+ self.vmdc.cleanup()
+ self.vosc.cleanup()
+ panic("mount failed:", self.path, ":", string.join(val))
def cleanup(self):
- self.info(self.path, self.mds_uuid,self.lov_uuid)
- if config.force():
- (rc, out) = run("umount -f", self.path)
- else:
- (rc, out) = run("umount", self.path)
- if rc:
- log("umount failed, cleanup will most likely not work.")
- l = lookup(self.dom_node.parentNode, self.lov_uuid)
- self.osc.cleanup()
- cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
-
- def load_module(self):
- self.osc.load_module()
- Module.load_module(self)
- def cleanup_module(self):
- Module.cleanup_module(self)
- self.osc.cleanup_module()
+ self.info(self.path, self.mds_uuid,self.obd_uuid)
+ if config.record or config.lctl_dump:
+ lctl.del_mount_option(local_node_name)
+ else:
+ if fs_is_mounted(self.path):
+ if config.force:
+ (rc, out) = run("umount", "-f", self.path)
+ else:
+ (rc, out) = run("umount", self.path)
+ if rc:
+ raise CommandError('umount', out, rc)
-# ============================================================
-# XML processing and query
-# TODO: Change query funcs to use XPath, which is muc cleaner
-
-def get_device(obd):
- list = obd.getElementsByTagName('device')
- if len(list) > 0:
- dev = list[0]
- dev.normalize();
- size = get_attr_int(dev, 'size', 0)
- return dev.firstChild.data, size
- return '', 0
-
-# Get the text content from the first matching child
-# If there is no content (or it is all whitespace), return
-# the default
-def get_text(dom_node, tag, default=""):
- list = dom_node.getElementsByTagName(tag)
- if len(list) > 0:
- dom_node = list[0]
- dom_node.normalize()
- if dom_node.firstChild:
- txt = string.strip(dom_node.firstChild.data)
- if txt:
- return txt
- return default
-
-def get_text_int(dom_node, tag, default=0):
- list = dom_node.getElementsByTagName(tag)
- n = default
- if len(list) > 0:
- dom_node = list[0]
- dom_node.normalize()
- if dom_node.firstChild:
- txt = string.strip(dom_node.firstChild.data)
- if txt:
- try:
- n = int(txt)
- except ValueError:
- panic("text value is not integer:", txt)
- return n
-
-def get_attr(dom_node, attr, default=""):
- v = dom_node.getAttribute(attr)
- if v:
- return v
- return default
-
-def get_attr_int(dom_node, attr, default=0):
- n = default
- v = dom_node.getAttribute(attr)
- if v:
- try:
- n = int(v)
- except ValueError:
- panic("attr value is not integer", v)
- return n
+ if fs_is_mounted(self.path):
+ panic("fs is still mounted:", self.path)
-def get_first_ref(dom_node, tag):
- """ Get the first uuidref of the type TAG. Used one only
- one is expected. Returns the uuid."""
- uuid = None
- refname = '%s_ref' % tag
- list = dom_node.getElementsByTagName(refname)
- if len(list) > 0:
- uuid = getRef(list[0])
- return uuid
-
-def get_all_refs(dom_node, tag):
- """ Get all the refs of type TAG. Returns list of uuids. """
- uuids = []
- refname = '%s_ref' % tag
- list = dom_node.getElementsByTagName(refname)
- if len(list) > 0:
- for i in list:
- uuids.append(getRef(i))
- return uuids
-
-def get_ost_net(dom_node, uuid):
- ost = lookup(dom_node, uuid)
- uuid = get_first_ref(ost, 'network')
- if not uuid:
- return None
- return lookup(dom_node, uuid)
-
-def nid2server(dom_node, nid):
- netlist = dom_node.getElementsByTagName('network')
- for net_node in netlist:
- if get_text(net_node, 'server') == nid:
- return Network(net_node)
- return None
-
-def lookup(dom_node, uuid):
- for n in dom_node.childNodes:
- if n.nodeType == n.ELEMENT_NODE:
- if getUUID(n) == uuid:
- return n
- else:
- n = lookup(n, uuid)
- if n: return n
- return None
-
-# Get name attribute of dom_node
-def getName(dom_node):
- return dom_node.getAttribute('name')
+ self.vmdc.cleanup()
+ self.vosc.cleanup()
-def getRef(dom_node):
- return dom_node.getAttribute('uuidref')
+ def add_module(self, manager):
+ self.vosc.add_module(manager)
+ self.vmdc.add_module(manager)
+ manager.add_lustre_module('llite', 'llite')
-# Get name attribute of dom_node
-def getUUID(dom_node):
- return dom_node.getAttribute('uuid')
+ def correct_level(self, level, op=None):
+ return level
-# the tag name is the service type
-# fixme: this should do some checks to make sure the dom_node is a service
-def getServiceType(dom_node):
- return dom_node.nodeName
+# ============================================================
+# misc query functions
+
+def get_ost_net(self, osd_uuid):
+ srv_list = []
+ if not osd_uuid:
+ return srv_list
+ osd = self.lookup(osd_uuid)
+ node_uuid = osd.get_first_ref('node')
+ node = self.lookup(node_uuid)
+ if not node:
+        panic("unable to find node for osd_uuid:", osd_uuid,
+              " node_ref:", node_uuid)
+ for net_uuid in node.get_networks():
+ db = node.lookup(net_uuid)
+ srv_list.append(Network(db))
+ return srv_list
-#
-# determine what "level" a particular node is at.
# the order of iniitailization is based on level.
-def getServiceLevel(dom_node):
- type = getServiceType(dom_node)
+def getServiceLevel(self):
+ type = self.get_class()
+ ret=0;
if type in ('network',):
- return 10
- elif type in ('device', 'ldlm'):
- return 20
- elif type in ('obd', 'mdd'):
- return 30
- elif type in ('mds','ost'):
- return 40
- elif type in ('mdc','osc'):
- return 50
- elif type in ('lov', 'lovconfig'):
- return 60
- elif type in ('mountpoint',):
- return 70
- return 0
+ ret = 5
+ elif type in ('routetbl',):
+ ret = 6
+ elif type in ('ldlm',):
+ ret = 20
+ elif type in ('osd', 'cobd'):
+ ret = 30
+ elif type in ('mdsdev',):
+ ret = 40
+ elif type in ('lmv',):
+ ret = 45
+ elif type in ('mountpoint', 'echoclient'):
+ ret = 60
+ elif type in ('cmobd',):
+ ret = 70
+ else:
+ panic("Unknown type: ", type)
+
+ if ret < config.minlevel or ret > config.maxlevel:
+ ret = 0
+ return ret
#
# return list of services in a profile. list is a list of tuples
-# [(level, dom_node),]
-def getServices(lustreNode, profileNode):
+# [(level, db_object),]
+def getServices(self):
list = []
- for n in profileNode.childNodes:
- if n.nodeType == n.ELEMENT_NODE:
- servNode = lookup(lustreNode, getRef(n))
- if not servNode:
- print n
- panic('service not found: ' + getRef(n))
- level = getServiceLevel(servNode)
- list.append((level, servNode))
+ for ref_class, ref_uuid in self.get_all_refs():
+ servdb = self.lookup(ref_uuid)
+ if servdb:
+ level = getServiceLevel(servdb)
+ if level > 0:
+ list.append((level, servdb))
+ else:
+ panic('service not found: ' + ref_uuid)
+
list.sort()
return list
-def getByName(lustreNode, name, tag):
- ndList = lustreNode.getElementsByTagName(tag)
- for nd in ndList:
- if getName(nd) == name:
- return nd
- return None
-
############################################################
-# MDC UUID hack -
+# MDC UUID hack -
# FIXME: clean this mess up!
#
-mdc_uuid = None
-def prepare_mdc(dom_node, mds_uuid):
- global mdc_uuid
- mds_node = lookup(dom_node, mds_uuid);
- if not mds_node:
- panic("no mds:", mds_uuid)
- if mdc_uuid:
- return mdc_uuid
- mdc = MDC(mds_node)
- mdc.prepare()
- mdc_uuid = mdc.uuid
- return mdc_uuid
-
-mdc_cleaned = None
-def cleanup_mdc(dom_node, mds_uuid):
- global mdc_cleaned
- mds_node = lookup(dom_node, mds_uuid);
- if not mds_node:
- panic("no mds:", mds_uuid)
- if not mdc_cleaned:
- mdc = MDC(mds_node)
- mdc.cleanup()
- mdc_uuid = None
- mdc_cleaned = 'yes'
-
+# OSC is no longer in the xml, so we have to fake it.
+# this is getting ugly and begging for another refactoring
+def get_osc(ost_db, uuid, fs_name):
+ osc = OSC(ost_db, uuid, fs_name)
+ return osc
+
+def get_mdc(db, fs_name, mds_uuid):
+ mds_db = db.lookup(mds_uuid);
+ if not mds_db:
+ error("no mds:", mds_uuid)
+ mdc = MDC(mds_db, mds_uuid, fs_name)
+ return mdc
############################################################
# routing ("rooting")
-#
-routes = []
-local_node = []
-router_flag = 0
-
-def init_node(dom_node):
- global local_node, router_flag
- netlist = dom_node.getElementsByTagName('network')
- for dom_net in netlist:
- type = get_attr(dom_net, 'type')
- gw = get_text(dom_net, 'server')
- local_node.append((type, gw))
+# list of (nettype, cluster_id, nid)
+local_clusters = []
+
+def find_local_clusters(node_db):
+ global local_clusters
+ for netuuid in node_db.get_networks():
+ net = node_db.lookup(netuuid)
+ srv = Network(net)
+ debug("add_local", netuuid)
+ local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
+ if srv.port > 0:
+ if not acceptors.has_key(srv.port):
+ acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type)
+
+# This node is a gateway.
+is_router = 0
+def node_is_router():
+ return is_router
+
+# If there are any routers found in the config, then this will be true
+# and all nodes will load kptlrouter.
+needs_router = 0
def node_needs_router():
- return router_flag
-
-def get_routes(type, gw, dom_net):
- """ Return the routes as a list of tuples of the form:
- [(type, gw, lo, hi),]"""
- res = []
- tbl = dom_net.getElementsByTagName('route_tbl')
- for t in tbl:
- routes = t.getElementsByTagName('route')
- for r in routes:
- lo = get_attr(r, 'lo')
- hi = get_attr(r, 'hi', '')
- res.append((type, gw, lo, hi))
- return res
-
+ return needs_router or is_router
+
+# list of (nettype, gw, tgt_cluster_id, lo, hi)
+# Currently, these local routes are only added to kptlrouter route
+# table if they are needed to connect to a specific server. This
+# should be changed so all available routes are loaded, and the
+# ptlrouter can make all the decisions.
+local_routes = []
-def init_route_config(lustre):
- """ Scan the lustre config looking for routers. Build list of
+def find_local_routes(lustre):
+    """ Scan the lustre config looking for routers. Build list of
routes. """
- global routes, router_flag
- routes = []
- list = lustre.getElementsByTagName('node')
- for node in list:
- if get_attr(node, 'router'):
- router_flag = 1
- for (local_type, local_nid) in local_node:
+ global local_routes, needs_router
+ local_routes = []
+ list = lustre.lookup_class('node')
+ for router in list:
+ if router.get_val_int('router', 0):
+ needs_router = 1
+ for (local_type, local_cluster_id, local_nid) in local_clusters:
gw = None
- netlist = node.getElementsByTagName('network')
- for dom_net in netlist:
- if local_type == get_attr(dom_net, 'type'):
- gw = get_text(dom_net, 'server')
+ for netuuid in router.get_networks():
+ db = router.lookup(netuuid)
+ if (local_type == db.get_val('nettype') and
+ local_cluster_id == db.get_val('clusterid')):
+ gw = db.get_val('nid')
break
- if not gw:
- continue
- for dom_net in netlist:
- if local_type != get_attr(dom_net, 'type'):
- for route in get_routes(local_type, gw, dom_net):
- routes.append(route)
-
+ if gw:
+ debug("find_local_routes: gw is", gw)
+ for route in router.get_local_routes(local_type, gw):
+ local_routes.append(route)
+ debug("find_local_routes:", local_routes)
+
-def local_net(net):
- global local_node
- for iface in local_node:
- if net.net_type == iface[0]:
+def choose_local_server(srv_list):
+ for srv in srv_list:
+ if local_cluster(srv.net_type, srv.cluster_id):
+ return srv
+
+def local_cluster(net_type, cluster_id):
+ for cluster in local_clusters:
+ if net_type == cluster[0] and cluster_id == cluster[1]:
return 1
return 0
-def find_route(net):
- global local_node, routes
- frm_type = local_node[0][0]
- to_type = net.net_type
- to = net.nid
- debug ('looking for route to', to_type,to)
- for r in routes:
- if r[2] == to:
- return r
- return None
+def local_interface(net_type, cluster_id, nid):
+ for cluster in local_clusters:
+ if (net_type == cluster[0] and cluster_id == cluster[1]
+ and nid == cluster[2]):
+ return 1
+ return 0
+
+def find_route(srv_list):
+ result = []
+ frm_type = local_clusters[0][0]
+ for srv in srv_list:
+ debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
+ to_type = srv.net_type
+ to = srv.nid
+ cluster_id = srv.cluster_id
+ debug ('looking for route to', to_type, to)
+ for r in local_routes:
+ debug("find_route: ", r)
+ if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
+ result.append((srv, r))
+ return result
-
+def get_active_target(db):
+ target_uuid = db.getUUID()
+ target_name = db.getName()
+ node_name = get_select(target_name)
+ if node_name:
+ tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
+ else:
+ tgt_dev_uuid = db.get_first_ref('active')
+ return tgt_dev_uuid
+
+def get_server_by_nid_uuid(db, nid_uuid):
+ for n in db.lookup_class("network"):
+ net = Network(n)
+ if net.nid_uuid == nid_uuid:
+ return net
############################################################
# lconf level logic
# Start a service.
-def startService(dom_node, module_flag):
- type = getServiceType(dom_node)
- debug('Service:', type, getName(dom_node), getUUID(dom_node))
- # there must be a more dynamic way of doing this...
+def newService(db):
+ type = db.get_class()
+ debug('Service:', type, db.getName(), db.getUUID())
n = None
if type == 'ldlm':
- n = LDLM(dom_node)
+ n = LDLM(db)
elif type == 'lov':
- n = LOV(dom_node)
- elif type == 'lovconfig':
- n = LOVConfig(dom_node)
+ n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
elif type == 'network':
- n = Network(dom_node)
- elif type == 'obd':
- n = OBD(dom_node)
- elif type == 'ost':
- n = OST(dom_node)
- elif type == 'mds':
- n = MDS(dom_node)
- elif type == 'osc':
- n = VOSC(dom_node)
- elif type == 'mdc':
- n = MDC(dom_node)
+ n = Network(db)
+ elif type == 'routetbl':
+ n = RouteTable(db)
+ elif type == 'osd':
+ n = OSD(db)
+ elif type == 'cobd':
+ n = COBD(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
+ elif type == 'cmobd':
+ n = CMOBD(db)
+ elif type == 'mdsdev':
+ n = MDSDEV(db)
elif type == 'mountpoint':
- n = Mountpoint(dom_node)
+ n = Mountpoint(db)
+ elif type == 'echoclient':
+ n = ECHO_CLIENT(db)
+ elif type == 'lmv':
+ n = LMV(db)
else:
panic ("unknown service type:", type)
-
- if module_flag:
- if config.nomod():
- return
- if config.cleanup():
- n.cleanup_module()
- else:
- n.load_module()
- else:
- if config.nosetup():
- return
- if config.cleanup():
- n.cleanup()
- else:
- n.prepare()
+ return n
#
# Prepare the system to run lustre using a particular profile
# * make sure partitions are in place and prepared
# * initialize devices with lctl
# Levels is important, and needs to be enforced.
-def startProfile(lustreNode, profileNode, module_flag):
- if not profileNode:
- panic("profile:", profile, "not found.")
- services = getServices(lustreNode, profileNode)
- if config.cleanup():
- services.reverse()
+def for_each_profile(db, prof_list, operation):
+ for prof_uuid in prof_list:
+ prof_db = db.lookup(prof_uuid)
+ if not prof_db:
+ panic("profile:", prof_uuid, "not found.")
+ services = getServices(prof_db)
+ operation(services)
+
+def magic_get_osc(db, rec, lov):
+ if lov:
+ lov_uuid = lov.get_uuid()
+ lov_name = lov.osc.fs_name
+ else:
+ lov_uuid = rec.getAttribute('lov_uuidref')
+ # FIXME: better way to find the mountpoint?
+ filesystems = db.root_node.getElementsByTagName('filesystem')
+ fsuuid = None
+ for fs in filesystems:
+ ref = fs.getElementsByTagName('obd_ref')
+ if ref[0].getAttribute('uuidref') == lov_uuid:
+ fsuuid = fs.getAttribute('uuid')
+ break
+
+ if not fsuuid:
+ panic("malformed xml: lov uuid '" + lov_uuid + "' referenced in 'add' record is not used by any filesystems.")
+
+ mtpts = db.root_node.getElementsByTagName('mountpoint')
+ lov_name = None
+ for fs in mtpts:
+ ref = fs.getElementsByTagName('filesystem_ref')
+ if ref[0].getAttribute('uuidref') == fsuuid:
+ lov_name = fs.getAttribute('name')
+ break
+
+ if not lov_name:
+ panic("malformed xml: 'add' record references lov uuid '" + lov_uuid + "', which references filesystem uuid '" + fsuuid + "', which does not reference a mountpoint.")
+
+ print "lov_uuid: " + lov_uuid + "; lov_name: " + lov_name
+
+ ost_uuid = rec.getAttribute('ost_uuidref')
+ obd = db.lookup(ost_uuid)
+
+ if not obd:
+ panic("malformed xml: 'add' record references ost uuid '" + ost_uuid + "' which cannot be found.")
+
+ osc = get_osc(obd, lov_uuid, lov_name)
+ if not osc:
+        panic('osc not found:', ost_uuid)
+ return osc
+
+# write logs for update records. sadly, logs of all types -- and updates in
+# particular -- are something of an afterthought. lconf needs rewritten with
+# these as core concepts. so this is a pretty big hack.
+def process_update_record(db, update, lov):
+ for rec in update.childNodes:
+ if rec.nodeType != rec.ELEMENT_NODE:
+ continue
+
+ log("found "+rec.nodeName+" record in update version " +
+ str(update.getAttribute('version')))
+
+ lov_uuid = rec.getAttribute('lov_uuidref')
+ ost_uuid = rec.getAttribute('ost_uuidref')
+ index = rec.getAttribute('index')
+ gen = rec.getAttribute('generation')
+
+ if not lov_uuid or not ost_uuid or not index or not gen:
+ panic("malformed xml: 'update' record requires lov_uuid, ost_uuid, index, and generation.")
+
+ if not lov:
+ tmplov = db.lookup(lov_uuid)
+ if not tmplov:
+ panic("malformed xml: 'delete' record contains lov UUID '" + lov_uuid + "', which cannot be located.")
+ lov_name = tmplov.getName()
+ else:
+ lov_name = lov.osc.name
+
+ # ------------------------------------------------------------- add
+ if rec.nodeName == 'add':
+ if config.cleanup:
+ lctl.lov_del_obd(lov_name, lov_uuid, ost_uuid, index, gen)
+ continue
+
+ osc = magic_get_osc(db, rec, lov)
+
+ try:
+ # Only ignore connect failures with --force, which
+ # isn't implemented here yet.
+ osc.prepare(ignore_connect_failure=0)
+ except CommandError, e:
+ print "Error preparing OSC %s\n" % osc.uuid
+ raise e
+
+ lctl.lov_add_obd(lov_name, lov_uuid, ost_uuid, index, gen)
+
+ # ------------------------------------------------------ deactivate
+ elif rec.nodeName == 'deactivate':
+ if config.cleanup:
+ continue
+
+ osc = magic_get_osc(db, rec, lov)
+
+ try:
+ osc.deactivate()
+ except CommandError, e:
+ print "Error deactivating OSC %s\n" % osc.uuid
+ raise e
+
+ # ---------------------------------------------------------- delete
+ elif rec.nodeName == 'delete':
+ if config.cleanup:
+ continue
+
+ osc = magic_get_osc(db, rec, lov)
+
+ try:
+ config.cleanup = 1
+ osc.cleanup()
+ config.cleanup = 0
+ except CommandError, e:
+ print "Error cleaning up OSC %s\n" % osc.uuid
+ raise e
+
+ lctl.lov_del_obd(lov_name, lov_uuid, ost_uuid, index, gen)
+
+def process_updates(db, log_device, log_name, lov = None):
+ updates = db.root_node.getElementsByTagName('update')
+ for u in updates:
+ if not u.childNodes:
+ log("ignoring empty update record (version " +
+ str(u.getAttribute('version')) + ")")
+ continue
+
+ version = u.getAttribute('version')
+ real_name = "%s-%s" % (log_name, version)
+ lctl.clear_log(log_device, real_name)
+ lctl.record(log_device, real_name)
+
+ process_update_record(db, u, lov)
+
+ lctl.end_record()
+
+def doWriteconf(services):
+ #if config.nosetup:
+ # return
+ for s in services:
+ if s[1].get_class() == 'mdsdev' or s[1].get_class() == 'osd':
+ n = newService(s[1])
+ n.write_conf()
+ n.cleanup()
+
+def doSetup(services):
+ if config.nosetup:
+ return
+ slist = []
+ for s in services:
+ n = newService(s[1])
+ n.level = s[0]
+ slist.append((n.level, n))
+ nlist = []
+ for n in slist:
+ nl = n[1].correct_level(n[0])
+ nlist.append((nl, n[1]))
+ nlist.sort()
+ for n in nlist:
+ n[1].prepare()
+
+def doLoadModules(services):
+ if config.nomod:
+ return
+
+ # adding all needed modules from all services
+ for s in services:
+ n = newService(s[1])
+ n.add_module(mod_manager)
+
+ # loading all registered modules
+ mod_manager.load_modules()
+
+def doUnloadModules(services):
+ if config.nomod:
+ return
+
+ # adding all needed modules from all services
for s in services:
- startService(s[1], module_flag)
+ n = newService(s[1])
+ if n.safe_to_clean_modules():
+ n.add_module(mod_manager)
+
+ # unloading all registered modules
+ mod_manager.cleanup_modules()
+
+def doCleanup(services):
+ if config.nosetup:
+ return
+ slist = []
+ for s in services:
+ n = newService(s[1])
+ n.level = s[0]
+ slist.append((n.level, n))
+ nlist = []
+ for n in slist:
+ nl = n[1].correct_level(n[0])
+ nlist.append((nl, n[1]))
+ nlist.sort()
+ nlist.reverse()
+
+ for n in nlist:
+ if n[1].safe_to_clean():
+ n[1].cleanup()
#
# Load profile for
-def doHost(lustreNode, hosts):
- global routes
- dom_node = None
+def doHost(lustreDB, hosts):
+ global is_router, local_node_name
+ node_db = None
for h in hosts:
- dom_node = getByName(lustreNode, h, 'node')
- if dom_node:
+ node_db = lustreDB.lookup_name(h, 'node')
+ if node_db:
break
+ if not node_db:
+ panic('No host entry found.')
+
+ local_node_name = node_db.get_val('name', 0)
+ is_router = node_db.get_val_int('router', 0)
+ lustre_upcall = node_db.get_val('lustreUpcall', '')
+ portals_upcall = node_db.get_val('portalsUpcall', '')
+ timeout = node_db.get_val_int('timeout', 0)
+ ptldebug = node_db.get_val('ptldebug', '')
+ subsystem = node_db.get_val('subsystem', '')
+
+ find_local_clusters(node_db)
+ if not is_router:
+ find_local_routes(lustreDB)
- if not dom_node:
- print 'No host entry found.'
- return
+ # Two step process: (1) load modules, (2) setup lustre
+ # if not cleaning, load modules first.
+ prof_list = node_db.get_refs('profile')
+
+ if config.write_conf:
+ for_each_profile(node_db, prof_list, doLoadModules)
+ sys_make_devices()
+ for_each_profile(node_db, prof_list, doWriteconf)
+ for_each_profile(node_db, prof_list, doUnloadModules)
+ lustreDB.close()
+
+ elif config.recover:
+ if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
+ raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
+ "--client_uuid <UUID> --conn_uuid <UUID>")
+ doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
+ config.conn_uuid)
+ elif config.cleanup:
+ if config.force:
+ # the command line can override this value
+ timeout = 5
+ # ugly hack, only need to run lctl commands for --dump
+ if config.lctl_dump or config.record:
+ for_each_profile(node_db, prof_list, doCleanup)
+ return
+
+ sys_set_timeout(timeout)
+ sys_set_ptldebug(ptldebug)
+ sys_set_subsystem(subsystem)
+ sys_set_lustre_upcall(lustre_upcall)
+ sys_set_portals_upcall(portals_upcall)
+
+ for_each_profile(node_db, prof_list, doCleanup)
+ for_each_profile(node_db, prof_list, doUnloadModules)
+ lustreDB.close()
- if not get_attr(dom_node, 'router'):
- init_node(dom_node)
- init_route_config(lustreNode)
else:
- global router_flag
- router_flag = 1
+ # ugly hack, only need to run lctl commands for --dump
+ if config.lctl_dump or config.record:
+ sys_set_timeout(timeout)
+ sys_set_lustre_upcall(lustre_upcall)
+ for_each_profile(node_db, prof_list, doSetup)
+ return
- # Two step process: (1) load modules, (2) setup lustre
- # if not cleaning, load modules first.
- module_flag = not config.cleanup()
- reflist = dom_node.getElementsByTagName('profile')
- for profile in reflist:
- startProfile(lustreNode, profile, module_flag)
+ sys_make_devices()
+ sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
+ sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
+
+ for_each_profile(node_db, prof_list, doLoadModules)
- if not config.cleanup():
sys_set_debug_path()
- script = config.gdb_script()
+ sys_set_ptldebug(ptldebug)
+ sys_set_subsystem(subsystem)
+ script = config.gdb_script
run(lctl.lctl, ' modules >', script)
- if config.gdb():
- # dump /tmp/ogdb and sleep/pause here
+ if config.gdb:
log ("The GDB module script is in", script)
+ # pause, so user has time to break and
+ # load the script
time.sleep(5)
-
- module_flag = not module_flag
- for profile in reflist:
- startProfile(lustreNode, profile, module_flag)
-
-############################################################
-# Command line processing
-#
-def parse_cmdline(argv):
- short_opts = "hdnvf"
- long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb",
- "portals=", "makeldiff", "cleanup", "noexec",
- "help", "node=", "nomod", "nosetup",
- "dump=", "force"]
- opts = []
- args = []
+ sys_set_timeout(timeout)
+ sys_set_lustre_upcall(lustre_upcall)
+ sys_set_portals_upcall(portals_upcall)
+
+ for_each_profile(node_db, prof_list, doSetup)
+ lustreDB.close()
+
+def doRecovery(lustreDB, lctl, tgt_uuid, client_uuid, nid_uuid):
+ tgt = lustreDB.lookup(tgt_uuid)
+ if not tgt:
+ raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
+ new_uuid = get_active_target(tgt)
+ if not new_uuid:
+ raise Lustre.LconfError("doRecovery: no active target found for: " +
+ tgt_uuid)
+ net = choose_local_server(get_ost_net(lustreDB, new_uuid))
+ if not net:
+ raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
+
+ log("Reconnecting", tgt_uuid, " to ", net.nid_uuid);
try:
- opts, args = getopt.getopt(argv, short_opts, long_opts)
- except getopt.error:
- print "invalid opt"
- usage()
-
- for o, a in opts:
- if o in ("-h", "--help"):
- usage()
- if o in ("-d","--cleanup"):
- config.cleanup(1)
- if o in ("-v", "--verbose"):
- config.verbose(1)
- if o in ("-n", "--noexec"):
- config.noexec(1)
- config.verbose(1)
- if o == "--portals":
- config.portals = a
- if o == "--lustre":
- config.lustre = a
- if o == "--reformat":
- config.reformat(1)
- if o == "--node":
- config.node(a)
- if o == "--gdb":
- config.gdb(1)
- if o == "--nomod":
- config.nomod(1)
- if o == "--nosetup":
- config.nosetup(1)
- if o == "--dump":
- config.dump_file(a)
- if o in ("-f", "--force"):
- config.force(1)
- return args
-
-def fetch(url):
- import urllib
- data = ""
+ oldnet = get_server_by_nid_uuid(lustreDB, nid_uuid)
+ lustreDB.close()
+ if oldnet:
+ lctl.disconnect(oldnet)
+ except CommandError, e:
+ log("recover: disconnect", nid_uuid, "failed: ")
+ e.dump()
+
try:
- s = urllib.urlopen(url)
- data = s.read()
- except:
- usage()
- return data
+ lctl.connect(net)
+ except CommandError, e:
+ log("recover: connect failed")
+ e.dump()
-def setupModulePath(cmd):
- base = os.path.dirname(cmd)
- if os.access(base+"/Makefile", os.R_OK):
- config.src_dir(base + "/../../")
+ lctl.recover(client_uuid, net.nid_uuid)
-def sys_set_debug_path():
- debug("debug path: ", config.debug_path())
- if config.noexec():
+
+def setupModulePath(cmd, portals_dir = PORTALS_DIR):
+ base = os.path.dirname(cmd)
+ if development_mode():
+ if not config.lustre:
+ debug('using objdir module paths')
+ config.lustre = (os.path.join(base, ".."))
+ # normalize the portals dir, using command line arg if set
+ if config.portals:
+ portals_dir = config.portals
+ dir = os.path.join(config.lustre, portals_dir)
+ config.portals = dir
+ debug('config.portals', config.portals)
+ elif config.lustre and config.portals:
+ # production mode
+ # if --lustre and --portals, normalize portals
+        # can ignore PORTALS_DIR here, since it is probably useless here
+ config.portals = os.path.join(config.lustre, config.portals)
+ debug('config.portals B', config.portals)
+
+def sysctl(path, val):
+ debug("+ sysctl", path, val)
+ if config.noexec:
return
try:
- fp = open('/proc/sys/portals/debug_path', 'w')
- fp.write(config.debug_path())
+ fp = open(os.path.join('/proc/sys', path), 'w')
+ fp.write(str(val))
fp.close()
except IOError, e:
- print e
-
-#/proc/sys/net/core/rmem_max
-#/proc/sys/net/core/wmem_max
+ panic(str(e))
+
+
+def sys_set_debug_path():
+ sysctl('portals/debug_path', config.debug_path)
+
+def sys_set_lustre_upcall(upcall):
+ # the command overrides the value in the node config
+ if config.lustre_upcall:
+ upcall = config.lustre_upcall
+ elif config.upcall:
+ upcall = config.upcall
+ if upcall:
+ lctl.set_lustre_upcall(upcall)
+
+def sys_set_portals_upcall(upcall):
+ # the command overrides the value in the node config
+ if config.portals_upcall:
+ upcall = config.portals_upcall
+ elif config.upcall:
+ upcall = config.upcall
+ if upcall:
+ sysctl('portals/upcall', upcall)
+
+def sys_set_timeout(timeout):
+ # the command overrides the value in the node config
+ if config.timeout and config.timeout > 0:
+ timeout = config.timeout
+ if timeout != None and timeout > 0:
+ lctl.set_timeout(timeout)
+
+def sys_tweak_socknal ():
+ # reserve at least 8MB, or we run out of RAM in skb_alloc under read
+ if sys_get_branch() == '2.6':
+ fp = open('/proc/meminfo')
+ lines = fp.readlines()
+ fp.close()
+ memtotal = 131072
+ for l in lines:
+ a = string.split(l)
+ if a[0] == 'MemTotal:':
+ memtotal = a[1]
+ debug("memtotal" + memtotal)
+ if int(memtotal) < 262144:
+ minfree = int(memtotal) / 16
+ else:
+ minfree = 32768
+ debug("+ minfree ", minfree)
+ sysctl("vm/min_free_kbytes", minfree)
+ if config.single_socket:
+ sysctl("socknal/typed", 0)
+
+def sys_optimize_elan ():
+ procfiles = ["/proc/elan/config/eventint_punt_loops",
+ "/proc/qsnet/elan3/config/eventint_punt_loops",
+ "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]
+ for p in procfiles:
+ if os.access(p, os.W_OK):
+ run ("echo 1 > " + p)
+
+def sys_set_ptldebug(ptldebug):
+ if config.ptldebug:
+ ptldebug = config.ptldebug
+ if ptldebug:
+ try:
+ val = eval(ptldebug, ptldebug_names)
+ val = "0x%x" % (val & 0xffffffffL)
+ sysctl('portals/debug', val)
+ except NameError, e:
+ panic(str(e))
+
+def sys_set_subsystem(subsystem):
+ if config.subsystem:
+ subsystem = config.subsystem
+ if subsystem:
+ try:
+ val = eval(subsystem, subsystem_names)
+ val = "0x%x" % (val & 0xffffffffL)
+ sysctl('portals/subsystem_debug', val)
+ except NameError, e:
+ panic(str(e))
+
def sys_set_netmem_max(path, max):
debug("setting", path, "to at least", max)
- if config.noexec():
+ if config.noexec:
return
fp = open(path)
str = fp.readline()
- fp.close
+ fp.close()
cur = int(str)
if max > cur:
fp = open(path, 'w')
fp.write('%d\n' %(max))
fp.close()
-
def sys_make_devices():
if not os.access('/dev/portals', os.R_OK):
run('mknod /dev/portals c 10 240')
if not os.access('/dev/obd', os.R_OK):
run('mknod /dev/obd c 10 241')
-
# Add dir to the global PATH, if not already there.
def add_to_path(new_dir):
syspath = string.split(os.environ['PATH'], ':')
return
os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
+def default_debug_path():
+ path = '/tmp/lustre-log'
+ if os.path.isdir('/r'):
+ return '/r' + path
+ else:
+ return path
+
+def default_gdb_script():
+ script = '/tmp/ogdb'
+ if os.path.isdir('/r'):
+ return '/r' + script
+ else:
+ return script
DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
# ensure basic elements are in the system path
for dir in DEFAULT_PATH:
add_to_path(dir)
-# Initialize or shutdown lustre according to a configuration file
-# * prepare the system for lustre
-# * configure devices with lctl
-# Shutdown does steps in reverse
-#
+# global hack for the --select handling
+tgt_select = {}
+def init_select(args):
+ # args = [service=nodeA,service2=nodeB service3=nodeC]
+ global tgt_select
+ for arg in args:
+ list = string.split(arg, ',')
+ for entry in list:
+ srv, node = string.split(entry, '=')
+ tgt_select[srv] = node
+
+def get_select(srv):
+ if tgt_select.has_key(srv):
+ return tgt_select[srv]
+ return None
+
+
+FLAG = Lustre.Options.FLAG
+PARAM = Lustre.Options.PARAM
+INTPARAM = Lustre.Options.INTPARAM
+PARAMLIST = Lustre.Options.PARAMLIST
+lconf_options = [
+ ('verbose,v', "Print system commands as they are run"),
+ ('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM),
+ ('config', "Cluster config name used for LDAP query", PARAM),
+ ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
+ ('node', "Load config for <nodename>", PARAM),
+ ('sec',"security flavor <null|krb5i|krb5p> between this client with mds", PARAM),
+ ('mds_sec',"security flavor <null|krb5i|krb5p> between this client with mds", PARAM),
+ ('oss_sec',"security flavor <null|krb5i|krb5p> between this client with ost", PARAM),
+ ('mds_mds_sec',"security flavor <null|krb5i|krb5p> between this mds with other mds", PARAM),
+ ('mds_oss_sec',"security flavor <null|krb5i|krb5p> between this mds with ost", PARAM),
+ ('mds_deny_sec', "security flavor <null|krb5i|krb5p> denied by this mds", PARAM),
+ ('ost_deny_sec', "security flavor <null|krb5i|krb5p> denied by this ost", PARAM),
+ ('cleanup,d', "Cleans up config. (Shutdown)"),
+ ('force,f', "Forced unmounting and/or obd detach during cleanup",
+ FLAG, 0),
+ ('single_socket', "socknal option: only use one socket instead of bundle",
+ FLAG, 0),
+ ('failover',"""Used to shut down without saving state.
+ This will allow this node to "give up" a service to a
+ another node for failover purposes. This will not
+ be a clean shutdown.""",
+ FLAG, 0),
+ ('gdb', """Prints message after creating gdb module script
+ and sleeps for 5 seconds."""),
+ ('noexec,n', """Prints the commands and steps that will be run for a
+ config without executing them. This can used to check if a
+ config file is doing what it should be doing"""),
+ ('nomod', "Skip load/unload module step."),
+ ('nosetup', "Skip device setup/cleanup step."),
+ ('reformat', "Reformat all devices (without question)"),
+ ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
+ ('mountfsoptions', "Additional options for mount fs command line", PARAM),
+ ('clientoptions', "Additional options for Lustre", PARAM),
+ ('dump', "Dump the kernel debug log to file before portals is unloaded",
+ PARAM),
+ ('write_conf', "Save all the client config information on mds."),
+ ('record', "Write config information on mds."),
+ ('record_log', "Name of config record log.", PARAM),
+ ('record_device', "MDS device name that will record the config commands",
+ PARAM),
+ ('root_squash', "MDS squash root to appointed uid",
+ PARAM),
+ ('no_root_squash', "Don't squash root for appointed nid",
+ PARAM),
+ ('minlevel', "Minimum level of services to configure/cleanup",
+ INTPARAM, 0),
+ ('maxlevel', """Maximum level of services to configure/cleanup
+                 Levels are approximately like:
+                 10 - network
+ 20 - device, ldlm
+ 30 - osd, mdd
+ 40 - mds, ost
+ 70 - mountpoint, echo_client, osc, mdc, lov""",
+ INTPARAM, 100),
+ ('lustre', """Base directory of lustre sources. This parameter will
+ cause lconf to load modules from a source tree.""", PARAM),
+ ('portals', """Portals source directory. If this is a relative path,
+ then it is assumed to be relative to lustre. """, PARAM),
+ ('timeout', "Set recovery timeout", INTPARAM),
+ ('upcall', "Set both portals and lustre upcall script", PARAM),
+ ('lustre_upcall', "Set lustre upcall script", PARAM),
+ ('portals_upcall', "Set portals upcall script", PARAM),
+ ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
+ ('ptldebug', "Set the portals debug level", PARAM),
+ ('subsystem', "Set the portals debug subsystem", PARAM),
+ ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
+ ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
+# Client recovery options
+ ('recover', "Recover a device"),
+ ('group', "The group of devices to configure or cleanup", PARAM),
+ ('tgt_uuid', "The failed target (required for recovery)", PARAM),
+ ('client_uuid', "The failed client (required for recovery)", PARAM),
+ ('conn_uuid', "The failed connection (required for recovery)", PARAM),
+
+ ('inactive', """The name of an inactive service, to be ignored during
+ mounting (currently OST-only). Can be repeated.""",
+ PARAMLIST),
+ ]
+
def main():
- global TCP_ACCEPTOR, lctl, MAXTCPBUF
+ global lctl, config, toplustreDB, CONFIG_FILE, mod_manager
+
+ # in the upcall this is set to SIG_IGN
+ signal.signal(signal.SIGCHLD, signal.SIG_DFL)
+
+ cl = Lustre.Options("lconf", "config.xml", lconf_options)
+ try:
+ config, args = cl.parse(sys.argv[1:])
+ except Lustre.OptionError, e:
+ print e
+ sys.exit(1)
+
+ setupModulePath(sys.argv[0])
+
host = socket.gethostname()
+ # the PRNG is normally seeded with time(), which is not so good for starting
+ # time-synchronized clusters
+ input = open('/dev/urandom', 'r')
+ if not input:
+ print 'Unable to open /dev/urandom!'
+ sys.exit(1)
+ seed = input.read(32)
+ input.close()
+ random.seed(seed)
+
sanitise_path()
+
+ init_select(config.select)
- args = parse_cmdline(sys.argv[1:])
if len(args) > 0:
- if not os.access(args[0], os.R_OK):
+ # allow config to be fetched via HTTP, but only with python2
+ if sys.version[0] != '1' and args[0].startswith('http://'):
+ import urllib2
+ try:
+ config_file = urllib2.urlopen(args[0])
+ except (urllib2.URLError, socket.error), err:
+ if hasattr(err, 'args'):
+ err = err.args[1]
+ print "Could not access '%s': %s" %(args[0], err)
+ sys.exit(1)
+ elif not os.access(args[0], os.R_OK):
print 'File not found or readable:', args[0]
sys.exit(1)
- dom = xml.dom.minidom.parse(args[0])
- elif config.url():
- xmldata = fetch(config.url())
- dom = xml.dom.minidom.parseString(xmldata)
+ else:
+ # regular file
+ config_file = open(args[0], 'r')
+ try:
+ dom = xml.dom.minidom.parse(config_file)
+ except Exception:
+ panic("%s does not appear to be a config file." % (args[0]))
+ sys.exit(1) # make sure to die here, even in debug mode.
+ config_file.close()
+ CONFIG_FILE = args[0]
+ lustreDB = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
+ if not config.config:
+ config.config = os.path.basename(args[0])# use full path?
+ if config.config[-4:] == '.xml':
+ config.config = config.config[:-4]
+ elif config.ldapurl:
+ if not config.config:
+ panic("--ldapurl requires --config name")
+ dn = "config=%s,fs=lustre" % (config.config)
+ lustreDB = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
+ elif config.ptldebug or config.subsystem:
+ sys_set_ptldebug(None)
+ sys_set_subsystem(None)
+ sys.exit(0)
else:
- usage()
+ print 'Missing config file or ldap URL.'
+ print 'see lconf --help for command summary'
+ sys.exit(1)
+
+ toplustreDB = lustreDB
+
+ ver = lustreDB.get_version()
+ if not ver:
+ panic("No version found in config data, please recreate.")
+ if ver != Lustre.CONFIG_VERSION:
+ panic("Config version", ver, "does not match lconf version",
+ Lustre.CONFIG_VERSION)
node_list = []
- if config.node():
- node_list.append(config.node())
+ if config.node:
+ node_list.append(config.node)
else:
if len(host) > 0:
node_list.append(host)
node_list.append('localhost')
+
debug("configuring for host: ", node_list)
if len(host) > 0:
- config._debug_path = config._debug_path + '-' + host
- config._gdb_script = config._gdb_script + '-' + host
-
- TCP_ACCEPTOR = find_prog('acceptor')
- if not TCP_ACCEPTOR:
- if config.noexec():
- TCP_ACCEPTOR = 'acceptor'
- debug('! acceptor not found')
- else:
- panic('acceptor not found')
+ config.debug_path = config.debug_path + '-' + host
+ config.gdb_script = config.gdb_script + '-' + host
lctl = LCTLInterface('lctl')
- setupModulePath(sys.argv[0])
- sys_make_devices()
- sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
- sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
- doHost(dom.documentElement, node_list)
+ if config.lctl_dump:
+ lctl.use_save_file(config.lctl_dump)
+
+ if config.record:
+ if not (config.record_device and config.record_log):
+ panic("When recording, both --record_log and --record_device must be specified.")
+ lctl.clear_log(config.record_device, config.record_log)
+ lctl.record(config.record_device, config.record_log)
+
+ # init module manager
+ mod_manager = kmod_manager(config.lustre, config.portals)
+
+ doHost(lustreDB, node_list)
+
+ if not config.record:
+ return
+
+ lctl.end_record()
+
+ process_updates(lustreDB, config.record_device, config.record_log)
if __name__ == "__main__":
try:
main()
- except LconfError, e:
+ except Lustre.LconfError, e:
print e
+# traceback.print_exc(file=sys.stdout)
+ sys.exit(1)
except CommandError, e:
e.dump()
sys.exit(e.rc)
if first_cleanup_error:
sys.exit(first_cleanup_error)
-