# Copyright (C) 2002-2003 Cluster File Systems, Inc.
# Authors: Robert Read <rread@clusterfs.com>
#          Mike Shaver <shaver@clusterfs.com>
#
# This file is part of Lustre, http://www.lustre.org.
#
# Lustre is free software; you can redistribute it and/or
# modify it under the terms of version 2 of the GNU General Public
# License as published by the Free Software Foundation.
#
# Lustre is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Lustre; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# lconf - lustre configuration tool
#
# lconf is the main driver script for starting and stopping
# lustre filesystem services.
#
# Based in part on the XML obdctl modifications done by Brian Behlendorf
import sys, getopt, types
import string, os, stat, popen2, socket, time, random, fcntl, select
import re, exceptions, signal, traceback
import xml.dom.minidom

if sys.version[0] == '1':
    from FCNTL import F_GETFL, F_SETFL
else:
    from fcntl import F_GETFL, F_SETFL

PYMOD_DIR = "/usr/lib/lustre/python"

def development_mode():
    base = os.path.dirname(sys.argv[0])
    if os.access(base+"/Makefile", os.R_OK):
        return 1
    return 0

if development_mode():
    sys.path.append('../utils')
else:
    sys.path.append(PYMOD_DIR)

import Lustre
# Maximum number of devices to search for.
# (the /dev/loop* nodes need to be created beforehand)
MAX_LOOP_DEVICES = 256
PORTALS_DIR = '../portals'

# Needed to call lconf --record
CONFIG_FILE = ""
64 # Please keep these in sync with the values in portals/kp30.h
76 "warning" : (1 << 10),
80 "portals" : (1 << 14),
82 "dlmtrace" : (1 << 16),
86 "rpctrace" : (1 << 20),
87 "vfstrace" : (1 << 21),
91 "console" : (1 << 25),
95 "undefined" : (1 << 0),
105 "portals" : (1 << 10),
107 "pinger" : (1 << 12),
108 "filter" : (1 << 13),
113 "ptlrouter" : (1 << 18),
117 "confobd" : (1 << 22),
first_cleanup_error = 0
def cleanup_error(rc):
    global first_cleanup_error
    if not first_cleanup_error:
        first_cleanup_error = rc
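# cleanup_error() records only the first nonzero status seen during cleanup,
# so a later, less interesting failure does not mask the original one.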
# ============================================================
# debugging and error funcs

def fixme(msg = "this feature"):
    raise Lustre.LconfError, msg + ' not implemented yet.'
def panic(*args):
    msg = string.join(map(str,args))
    if not config.noexec:
        raise Lustre.LconfError(msg)
    else:
        print "! " + msg
143 msg = string.join(map(str,args))
148 print string.strip(s)
152 msg = string.join(map(str,args))
# ack, python's builtin int() does not support '0x123' syntax.
# eval can do it, although what a hack!
def my_int(s):
    try:
        if s[0:2] == '0x':
            return eval(s, {}, {})
        else:
            return int(s)
    except SyntaxError, e:
        raise ValueError("not a number")
    except TypeError, e:
        raise ValueError("not a number")
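# e.g. my_int("0x1f") -> 31 and my_int("42") -> 42; anything else raises ValueError.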
# ============================================================
# locally defined exceptions
class CommandError (exceptions.Exception):
    def __init__(self, cmd_name, cmd_err, rc=None):
        self.cmd_name = cmd_name
        self.cmd_err = cmd_err
        self.rc = rc

    def dump(self):
        if type(self.cmd_err) == types.StringType:
            if self.rc:
                print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
            else:
                print "! %s: %s" % (self.cmd_name, self.cmd_err)
        elif type(self.cmd_err) == types.ListType:
            if self.rc:
                print "! %s (error %d):" % (self.cmd_name, self.rc)
            else:
                print "! %s:" % (self.cmd_name)
            for s in self.cmd_err:
                print "> %s" %(string.strip(s))
        else:
            print self.cmd_err
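# CommandError carries the failing command name, its captured output and the
# return code; callers generally catch it and call dump() for a readable report.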
# ============================================================
# handle daemons, like the acceptor
class DaemonHandler:
    """ Manage starting and stopping a daemon. Assumes daemon manages
    its own pid file. """
200 def __init__(self, cmd):
206 log(self.command, "already running.")
208 self.path = find_prog(self.command)
210 panic(self.command, "not found.")
211 ret, out = runcmd(self.path +' '+ self.command_line())
# FIXME: adding this check only narrows the race window, it cannot avoid it
# completely, so the same method is not applied when inserting modules.
215 if ret and not self.running():
216 raise CommandError(self.path, out, ret)
220 pid = self.read_pidfile()
222 log ("killing process", pid)
224 #time.sleep(1) # let daemon die
226 log("unable to kill", self.command, e)
228 log("unable to kill", self.command)
231 pid = self.read_pidfile()
241 def read_pidfile(self):
243 fp = open(self.pidfile(), 'r')
248 print "WARNING: invalid pid in %s, removed" % self.pidfile()
249 print "WARNING: You may need to stop acceptor by yourself"
250 os.unlink(self.pidfile())
255 def clean_pidfile(self):
256 """ Remove a stale pidfile """
257 log("removing stale pidfile:", self.pidfile())
259 os.unlink(self.pidfile())
261 log(self.pidfile(), e)
263 class AcceptorHandler(DaemonHandler):
264 def __init__(self, port, net_type):
265 DaemonHandler.__init__(self, "acceptor")
267 self.net_type = net_type
271 return "/var/run/%s-%d.pid" % (self.command, self.port)
273 def command_line(self):
274 return string.join(map(str,(self.flags, self.port)))
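# One acceptor daemon is started per TCP port; its pidfile lives in
# /var/run/<command>-<port>.pid and its command line is just "<flags> <port>".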
278 # start the acceptors
280 if config.lctl_dump or config.record:
282 for port in acceptors.keys():
283 daemon = acceptors[port]
284 if daemon.net_type == 'tcp' and not daemon.running():
287 def run_one_acceptor(port):
288 if config.lctl_dump or config.record:
290 if acceptors.has_key(port):
291 daemon = acceptors[port]
292 if daemon.net_type == 'tcp' and not daemon.running():
295 panic("run_one_acceptor: No acceptor defined for port:", port)
297 def stop_acceptor(port):
298 if acceptors.has_key(port):
299 daemon = acceptors[port]
300 if daemon.net_type == 'tcp' and daemon.running():
304 # ============================================================
305 # handle lctl interface
308 Manage communication with lctl
311 def __init__(self, cmd):
Initialize the class by finding the lctl binary.
315 self.lctl = find_prog(cmd)
317 self.record_device = ''
320 debug('! lctl not found')
323 raise CommandError('lctl', "unable to find lctl binary.")
325 def use_save_file(self, file):
326 self.save_file = file
328 def record(self, dev_name, logname):
329 log("Recording log", logname, "on", dev_name)
330 self.record_device = dev_name
331 self.record_log = logname
333 def end_record(self):
334 log("End recording log", self.record_log, "on", self.record_device)
335 self.record_device = None
336 self.record_log = None
338 def set_nonblock(self, fd):
339 fl = fcntl.fcntl(fd, F_GETFL)
340 fcntl.fcntl(fd, F_SETFL, fl | os.O_NDELAY)
345 the cmds are written to stdin of lctl
346 lctl doesn't return errors when run in script mode, so
348 should modify command line to accept multiple commands, or
349 create complex command line options
353 cmds = '\n dump ' + self.save_file + '\n' + cmds
354 elif self.record_device:
358 %s""" % (self.record_device, self.record_log, cmds)
360 debug("+", cmd_line, cmds)
361 if config.noexec: return (0, [])
363 child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
364 child.tochild.write(cmds + "\n")
365 child.tochild.close()
367 # From "Python Cookbook" from O'Reilly
368 outfile = child.fromchild
369 outfd = outfile.fileno()
370 self.set_nonblock(outfd)
371 errfile = child.childerr
372 errfd = errfile.fileno()
373 self.set_nonblock(errfd)
375 outdata = errdata = ''
378 ready = select.select([outfd,errfd],[],[]) # Wait for input
379 if outfd in ready[0]:
380 outchunk = outfile.read()
381 if outchunk == '': outeof = 1
382 outdata = outdata + outchunk
383 if errfd in ready[0]:
384 errchunk = errfile.read()
385 if errchunk == '': erreof = 1
386 errdata = errdata + errchunk
387 if outeof and erreof: break
388 # end of "borrowed" code
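# Both pipes are drained with select() and non-blocking reads so that lctl
# cannot deadlock by filling one of stdout/stderr while we block on the other.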
391 if os.WIFEXITED(ret):
392 rc = os.WEXITSTATUS(ret)
395 if rc or len(errdata):
396 raise CommandError(self.lctl, errdata, rc)
399 def runcmd(self, *args):
401 run lctl using the command line
403 cmd = string.join(map(str,args))
404 debug("+", self.lctl, cmd)
405 rc, out = run(self.lctl, cmd)
407 raise CommandError(self.lctl, out, rc)
411 def clear_log(self, dev, log):
412 """ clear an existing log """
417 quit """ % (dev, log)
420 def network(self, net, nid):
425 quit """ % (net, nid)
429 def add_interface(self, net, ip, netmask = ""):
430 """ add an interface """
434 quit """ % (net, ip, netmask)
437 # delete an interface
438 def del_interface(self, net, ip):
439 """ delete an interface """
446 # create a new connection
447 def add_uuid(self, net_type, uuid, nid):
448 cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type)
451 def add_peer(self, net_type, nid, hostaddr, port):
452 if net_type in ('tcp','openib','ra') and not config.lctl_dump:
457 nid, hostaddr, port )
459 elif net_type in ('iib',) and not config.lctl_dump:
466 elif net_type in ('vib',) and not config.lctl_dump:
474 def connect(self, srv):
475 self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
476 if srv.net_type in ('tcp','openib','iib','vib','ra') and not config.lctl_dump:
478 hostaddr = string.split(srv.hostaddr[0], '/')[0]
479 self.add_peer(srv.net_type, srv.nid, hostaddr, srv.port)
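# connect() registers the server's NID uuid and, for connection-oriented
# network types (tcp, openib, iib, vib, ra), adds a peer using the first
# hostaddr with any "/netmask" suffix stripped.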
482 def recover(self, dev_name, new_conn):
485 recover %s""" %(dev_name, new_conn)
488 # add a route to a range
489 def add_route(self, net, gw, lo, hi):
497 except CommandError, e:
501 def del_route(self, net, gw, lo, hi):
506 quit """ % (net, gw, lo, hi)
509 # add a route to a host
510 def add_route_host(self, net, uuid, gw, tgt):
511 self.add_uuid(net, uuid, tgt)
519 except CommandError, e:
523 # add a route to a range
524 def del_route_host(self, net, uuid, gw, tgt):
530 quit """ % (net, gw, tgt)
534 def del_peer(self, net_type, nid, hostaddr):
535 if net_type in ('tcp',) and not config.lctl_dump:
539 del_peer %s %s single_share
543 elif net_type in ('openib','iib','vib','ra') and not config.lctl_dump:
547 del_peer %s single_share
552 # disconnect one connection
553 def disconnect(self, srv):
554 self.del_uuid(srv.nid_uuid)
555 if srv.net_type in ('tcp','openib','iib','vib','ra') and not config.lctl_dump:
557 hostaddr = string.split(srv.hostaddr[0], '/')[0]
558 self.del_peer(srv.net_type, srv.nid, hostaddr)
560 def del_uuid(self, uuid):
568 def disconnectAll(self, net):
576 def attach(self, type, name, uuid):
579 quit""" % (type, name, uuid)
582 def setup(self, name, setup = ""):
586 quit""" % (name, setup)
590 # create a new device with lctl
591 def newdev(self, type, name, uuid, setup = ""):
592 self.attach(type, name, uuid);
594 self.setup(name, setup)
595 except CommandError, e:
596 self.cleanup(name, uuid, 0)
601 def cleanup(self, name, uuid, force, failover = 0):
602 if failover: force = 1
608 quit""" % (name, ('', 'force')[force],
609 ('', 'failover')[failover])
613 def lov_setup(self, name, uuid, desc_uuid, mdsuuid, stripe_cnt,
614 stripe_sz, stripe_off,
618 lov_setup %s %d %d %d %s %s
619 quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off,
624 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off,
628 lov_setconfig %s %d %d %d %s %s
629 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
633 def dump(self, dump_file):
636 quit""" % (dump_file)
639 # get list of devices
640 def device_list(self):
641 devices = '/proc/fs/lustre/devices'
643 if os.access(devices, os.R_OK):
645 fp = open(devices, 'r')
653 def lustre_version(self):
654 rc, out = self.runcmd('version')
658 def mount_option(self, profile, osc, mdc):
660 mount_option %s %s %s
661 quit""" % (profile, osc, mdc)
664 # delete mount options
665 def del_mount_option(self, profile):
671 def set_timeout(self, timeout):
# set the lustre upcall path
678 def set_lustre_upcall(self, upcall):
683 # ============================================================
684 # Various system-level functions
685 # (ideally moved to their own module)
# Run a command and return the output and status.
# stderr is merged into stdout (2>&1) so error text is captured as well;
# popen3 could be used instead to keep the streams separate if necessary.
692 if config.noexec: return (0, [])
693 f = os.popen(cmd + ' 2>&1')
703 cmd = string.join(map(str,args))
706 # Run a command in the background.
707 def run_daemon(*args):
708 cmd = string.join(map(str,args))
710 if config.noexec: return 0
711 f = os.popen(cmd + ' 2>&1')
719 # Determine full path to use for an external command
720 # searches dirname(argv[0]) first, then PATH
722 syspath = string.split(os.environ['PATH'], ':')
723 cmdpath = os.path.dirname(sys.argv[0])
724 syspath.insert(0, cmdpath);
726 syspath.insert(0, os.path.join(config.portals, 'utils/'))
728 prog = os.path.join(d,cmd)
729 if os.access(prog, os.X_OK):
733 # Recursively look for file starting at base dir
734 def do_find_file(base, mod):
735 fullname = os.path.join(base, mod)
736 if os.access(fullname, os.R_OK):
738 for d in os.listdir(base):
739 dir = os.path.join(base,d)
740 if os.path.isdir(dir):
741 module = do_find_file(dir, mod)
745 def find_module(src_dir, dev_dir, modname):
746 modbase = src_dir +'/'+ dev_dir +'/'+ modname
747 for modext in '.ko', '.o':
748 module = modbase + modext
750 if os.access(module, os.R_OK):
756 # is the path a block device?
763 return stat.S_ISBLK(s[stat.ST_MODE])
765 # build fs according to type
767 def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
773 panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
775 # devsize is in 1k, and fs block count is in 4k
776 block_cnt = devsize/4
778 if fstype in ('ext3', 'extN', 'ldiskfs'):
779 # ext3 journal size is in megabytes
782 if not is_block(dev):
783 ret, out = runcmd("ls -l %s" %dev)
784 devsize = int(string.split(out[0])[4]) / 1024
786 # sfdisk works for symlink, hardlink, and realdev
787 ret, out = runcmd("sfdisk -s %s" %dev)
789 devsize = int(out[0])
791 # sfdisk -s will fail for too large block device,
792 # then, read the size of partition from /proc/partitions
794 # get the realpath of the device
795 # it may be the real device, such as /dev/hda7
796 # or the hardlink created via mknod for a device
797 if 'realpath' in dir(os.path):
798 real_dev = os.path.realpath(dev)
802 while os.path.islink(real_dev) and (link_count < 20):
803 link_count = link_count + 1
804 dev_link = os.readlink(real_dev)
805 if os.path.isabs(dev_link):
808 real_dev = os.path.join(os.path.dirname(real_dev), dev_link)
810 panic("Encountered too many symbolic links resolving block device:", dev)
812 # get the major and minor number of the realpath via ls
813 # it seems python(os.stat) does not return
814 # the st_rdev member of the stat structure
815 ret, out = runcmd("ls -l %s" %real_dev)
816 major = string.split(string.split(out[0])[4], ",")[0]
817 minor = string.split(out[0])[5]
819 # get the devsize from /proc/partitions with the major and minor number
820 ret, out = runcmd("cat /proc/partitions")
823 if string.split(line)[0] == major and string.split(line)[1] == minor:
824 devsize = int(string.split(line)[2])
827 if devsize > 1024 * 1024:
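# heuristic default: devsize is in 1k blocks, so this gives roughly 4MB of
# journal per 100MB of device, and only kicks in for devices larger than 1GB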
828 jsize = ((devsize / 102400) * 4)
831 if jsize: jopt = "-J size=%d" %(jsize,)
832 if isize: iopt = "-I %d" %(isize,)
833 mkfs = 'mkfs.ext2 -j -b 4096 '
834 if not isblock or config.force:
836 elif fstype == 'reiserfs':
837 # reiserfs journal size is in blocks
838 if jsize: jopt = "--journal_size %d" %(jsize,)
839 mkfs = 'mkreiserfs -ff'
841 panic('unsupported fs type: ', fstype)
843 if config.mkfsoptions != None:
844 mkfs = mkfs + ' ' + config.mkfsoptions
845 if mkfsoptions != None:
846 mkfs = mkfs + ' ' + mkfsoptions
847 (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt)
849 panic("Unable to build fs:", dev, string.join(out))
# enable hash tree indexing on the filesystem
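# (FEATURE_C5 is the ext2/ext3 compat feature bit 5, i.e. dir_index; setting
# it with debugfs is what actually turns on htree directories here.)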
851 if fstype in ('ext3', 'extN', 'ldiskfs'):
852 htree = 'echo "feature FEATURE_C5" | debugfs -w'
853 (ret, out) = run (htree, dev)
855 panic("Unable to enable htree:", dev)
857 # some systems use /dev/loopN, some /dev/loop/N
861 if not os.access(loop + str(0), os.R_OK):
863 if not os.access(loop + str(0), os.R_OK):
864 panic("can't access loop devices")
# find the loop device assigned to the file
870 for n in xrange(0, MAX_LOOP_DEVICES):
872 if os.access(dev, os.R_OK):
873 (stat, out) = run('losetup', dev)
874 if out and stat == 0:
875 m = re.search(r'\((.*)\)', out[0])
876 if m and file == m.group(1):
882 # create file if necessary and assign the first free loop device
883 def init_loop(file, size, fstype, journal_size, inode_size, mkfsoptions, reformat):
884 dev = find_loop(file)
886 print 'WARNING file:', file, 'already mapped to', dev
888 if reformat or not os.access(file, os.R_OK | os.W_OK):
890 panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (file,size))
891 (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size,
894 panic("Unable to create backing store:", file)
895 mkfs(file, size, fstype, journal_size, inode_size, mkfsoptions, isblock=0)
898 # find next free loop
899 for n in xrange(0, MAX_LOOP_DEVICES):
901 if os.access(dev, os.R_OK):
902 (stat, out) = run('losetup', dev)
904 run('losetup', dev, file)
907 print "out of loop devices"
909 print "out of loop devices"
912 # undo loop assignment
913 def clean_loop(file):
914 dev = find_loop(file)
916 ret, out = run('losetup -d', dev)
918 log('unable to clean loop device:', dev, 'for file:', file)
921 # determine if dev is formatted as a <fstype> filesystem
922 def need_format(fstype, dev):
923 # FIXME don't know how to implement this
926 # initialize a block device if needed
927 def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
928 inode_size, mkfsoptions):
929 if config.noexec: return dev
930 if not is_block(dev):
931 dev = init_loop(dev, size, fstype, journal_size, inode_size,
932 mkfsoptions, reformat)
933 elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
934 mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
937 # panic("device:", dev,
938 # "not prepared, and autoformat is not set.\n",
939 # "Rerun with --reformat option to format ALL filesystems")
944 """lookup IP address for an interface"""
945 rc, out = run("/sbin/ifconfig", iface)
948 addr = string.split(out[1])[1]
949 ip = string.split(addr, ':')[1]
952 def def_mount_options(fstype, target):
953 """returns deafult mount options for passed fstype and target (mds, ost)"""
954 if fstype == 'ext3' or fstype == 'ldiskfs':
955 mountfsoptions = "errors=remount-ro"
956 if target == 'ost' and sys_get_branch() == '2.4':
957 mountfsoptions = "%s,asyncdel" % (mountfsoptions)
958 return mountfsoptions
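# e.g. def_mount_options('ext3', 'ost') is "errors=remount-ro", with
# ",asyncdel" appended only when running on a 2.4 kernel.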
961 def sys_get_elan_position_file():
962 procfiles = ["/proc/elan/device0/position",
963 "/proc/qsnet/elan4/device0/position",
964 "/proc/qsnet/elan3/device0/position"]
966 if os.access(p, os.R_OK):
970 def sys_get_local_nid(net_type, wildcard, cluster_id):
971 """Return the local nid."""
973 if sys_get_elan_position_file() and net_type == 'elan':
974 local = sys_get_local_address('elan', '*', cluster_id)
976 local = sys_get_local_address(net_type, wildcard, cluster_id)
979 def sys_get_local_address(net_type, wildcard, cluster_id):
980 """Return the local address for the network type."""
982 if net_type in ('tcp','openib','iib','vib','ra'):
984 iface, star = string.split(wildcard, ':')
985 local = if2addr(iface)
987 panic("unable to determine ip for:", wildcard)
989 host = socket.gethostname()
990 local = socket.gethostbyname(host)
991 elif net_type == 'elan':
992 # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()'
993 f = sys_get_elan_position_file()
995 panic("unable to determine local Elan ID")
998 lines = fp.readlines()
1002 if a[0] == 'NodeId':
1006 nid = my_int(cluster_id) + my_int(elan_id)
1007 local = "%d" % (nid)
1008 except ValueError, e:
1012 elif net_type == 'lo':
1013 fixme("automatic local address for loopback")
1014 elif net_type == 'gm':
1015 fixme("automatic local address for GM")
1019 def sys_get_branch():
1020 """Returns kernel release"""
1022 fp = open('/proc/sys/kernel/osrelease')
1023 lines = fp.readlines()
1027 version = string.split(l)
1028 a = string.split(version[0], '.')
1029 return a[0] + '.' + a[1]
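# e.g. an osrelease of "2.4.21-27.0.2" yields "2.4"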
1034 def mod_loaded(modname):
1035 """Check if a module is already loaded. Look in /proc/modules for it."""
1037 fp = open('/proc/modules')
1038 lines = fp.readlines()
1040 # please forgive my tired fingers for this one
1041 ret = filter(lambda word, mod=modname: word == mod,
1042 map(lambda line: string.split(line)[0], lines))
1044 except Exception, e:
1047 # XXX: instead of device_list, ask for $name and see what we get
1048 def is_prepared(name):
1049 """Return true if a device exists for the name"""
1050 if config.lctl_dump:
1052 if (config.noexec or config.record) and config.cleanup:
1055 # expect this format:
1056 # 1 UP ldlm ldlm ldlm_UUID 2
1057 out = lctl.device_list()
1059 if name == string.split(s)[3]:
1061 except CommandError, e:
1065 def is_network_prepared():
1066 """If the any device exists, then assume that all networking
1067 has been configured"""
1068 out = lctl.device_list()
1071 def fs_is_mounted(path):
1072 """Return true if path is a mounted lustre filesystem"""
1074 fp = open('/proc/mounts')
1075 lines = fp.readlines()
1079 if a[1] == path and a[2] == 'lustre_lite':
1087 """Manage kernel modules"""
1088 def __init__(self, lustre_dir, portals_dir):
1089 self.lustre_dir = lustre_dir
1090 self.portals_dir = portals_dir
1091 self.kmodule_list = []
1093 def add_portals_module(self, dev_dir, modname):
1094 """Append a module to list of modules to load."""
1095 self.kmodule_list.append((self.portals_dir, dev_dir, modname))
1097 def add_lustre_module(self, dev_dir, modname):
1098 """Append a module to list of modules to load."""
1099 self.kmodule_list.append((self.lustre_dir, dev_dir, modname))
1101 def load_module(self):
1102 """Load all the modules in the list in the order they appear."""
1103 for src_dir, dev_dir, mod in self.kmodule_list:
1104 if mod_loaded(mod) and not config.noexec:
1106 log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
1108 module = find_module(src_dir, dev_dir, mod)
1110 panic('module not found:', mod)
1111 (rc, out) = run('/sbin/insmod', module)
1112 if rc and not mod_loaded(mod):
1113 raise CommandError('insmod', out, rc)
1115 (rc, out) = run('/sbin/modprobe', mod)
1116 if rc and not mod_loaded(mod):
1117 raise CommandError('modprobe', out, rc)
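# Depending on configuration, a module is loaded either with insmod (using the
# path found under the source tree) or with modprobe from the installed tree;
# a load only counts as failed if the module really did not end up loaded.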
1119 def cleanup_module(self):
1120 """Unload the modules in the list in reverse order."""
1122 rev = self.kmodule_list[:] # make *copy* of list
1124 for src_dir, dev_dir, mod in rev:
1125 if not mod_loaded(mod) and not config.noexec:
1128 if mod == 'portals' and config.dump:
1129 lctl.dump(config.dump)
1130 log('unloading module:', mod)
1131 (rc, out) = run('/sbin/rmmod', mod)
1133 log('! unable to unload module:', mod)
1136 # ============================================================
1137 # Classes to prepare and cleanup the various objects
1140 """ Base class for the rest of the modules. The default cleanup method is
1141 defined here, as well as some utilitiy funcs.
1143 def __init__(self, module_name, db):
1145 self.module_name = module_name
1146 self.name = self.db.getName()
1147 self.uuid = self.db.getUUID()
1150 self.kmod = kmod(config.lustre, config.portals)
1152 def info(self, *args):
1153 msg = string.join(map(str,args))
1154 print self.module_name + ":", self.name, self.uuid, msg
1157 """ default cleanup, used for most modules """
1160 lctl.cleanup(self.name, self.uuid, config.force)
1161 except CommandError, e:
1162 log(self.module_name, "cleanup failed: ", self.name)
1166 def add_portals_module(self, dev_dir, modname):
1167 """Append a module to list of modules to load."""
1168 self.kmod.add_portals_module(dev_dir, modname)
1170 def add_lustre_module(self, dev_dir, modname):
1171 """Append a module to list of modules to load."""
1172 self.kmod.add_lustre_module(dev_dir, modname)
1174 def load_module(self):
1175 """Load all the modules in the list in the order they appear."""
1176 self.kmod.load_module()
1178 def cleanup_module(self):
1179 """Unload the modules in the list in reverse order."""
1180 if self.safe_to_clean():
1181 self.kmod.cleanup_module()
1183 def safe_to_clean(self):
1186 def safe_to_clean_modules(self):
1187 return self.safe_to_clean()
1189 class Network(Module):
1190 def __init__(self,db):
1191 Module.__init__(self, 'NETWORK', db)
1192 self.net_type = self.db.get_val('nettype')
1193 self.nid = self.db.get_val('nid', '*')
1194 self.cluster_id = self.db.get_val('clusterid', "0")
1195 self.port = self.db.get_val_int('port', 0)
1198 self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
1200 panic("unable to set nid for", self.net_type, self.nid, self.cluster_id)
1201 self.generic_nid = 1
1202 debug("nid:", self.nid)
1204 self.generic_nid = 0
1206 self.nid_uuid = self.nid_to_uuid(self.nid)
1208 self.hostaddr = self.db.get_hostaddr()
1209 if len(self.hostaddr) == 0:
1210 self.hostaddr.append(self.nid)
1211 if '*' in self.hostaddr[0]:
1212 self.hostaddr[0] = sys_get_local_address(self.net_type, self.hostaddr[0], self.cluster_id)
1213 if not self.hostaddr[0]:
1214 panic("unable to set hostaddr for", self.net_type, self.hostaddr[0], self.cluster_id)
1215 debug("hostaddr:", self.hostaddr[0])
1217 self.add_portals_module("libcfs", 'libcfs')
1218 self.add_portals_module("portals", 'portals')
1219 if node_needs_router():
1220 self.add_portals_module("router", 'kptlrouter')
1221 if self.net_type == 'tcp':
1222 self.add_portals_module("knals/socknal", 'ksocknal')
1223 if self.net_type == 'elan':
1224 self.add_portals_module("knals/qswnal", 'kqswnal')
1225 if self.net_type == 'gm':
1226 self.add_portals_module("knals/gmnal", 'kgmnal')
1227 if self.net_type == 'openib':
1228 self.add_portals_module("knals/openibnal", 'kopenibnal')
1229 if self.net_type == 'iib':
1230 self.add_portals_module("knals/iibnal", 'kiibnal')
1231 if self.net_type == 'vib':
1232 self.add_portals_module("knals/vibnal", 'kvibnal')
1233 if self.net_type == 'lo':
1234 self.add_portals_module("knals/lonal", 'klonal')
1235 if self.net_type == 'ra':
1236 self.add_portals_module("knals/ranal", 'kranal')
1238 def nid_to_uuid(self, nid):
1239 return "NID_%s_UUID" %(nid,)
1242 if is_network_prepared():
1244 self.info(self.net_type, self.nid, self.port)
1245 if not (config.record and self.generic_nid):
1246 lctl.network(self.net_type, self.nid)
1247 if self.net_type == 'tcp':
1249 for hostaddr in self.db.get_hostaddr():
1250 ip = string.split(hostaddr, '/')[0]
1251 if len(string.split(hostaddr, '/')) == 2:
1252 netmask = string.split(hostaddr, '/')[1]
1255 lctl.add_interface(self.net_type, ip, netmask)
1256 if self.net_type == 'elan':
1258 if self.net_type == 'openib':
1260 panic("no port set for", self.net_type, self.hostaddr[0])
1261 sysctl('/proc/sys/openibnal/port', self.port)
1262 if self.net_type == 'ra':
1264 panic("no port set for", self.net_type, self.hostaddr[0])
1265 sysctl('/proc/sys/ranal/port', self.port)
1266 if self.port and node_is_router():
1267 run_one_acceptor(self.port)
1268 self.connect_peer_gateways()
1270 def connect_peer_gateways(self):
1271 for router in self.db.lookup_class('node'):
1272 if router.get_val_int('router', 0):
1273 for netuuid in router.get_networks():
1274 net = self.db.lookup(netuuid)
1276 if (gw.cluster_id == self.cluster_id and
1277 gw.net_type == self.net_type):
1278 if gw.nid != self.nid:
1281 def disconnect_peer_gateways(self):
1282 for router in self.db.lookup_class('node'):
1283 if router.get_val_int('router', 0):
1284 for netuuid in router.get_networks():
1285 net = self.db.lookup(netuuid)
1287 if (gw.cluster_id == self.cluster_id and
1288 gw.net_type == self.net_type):
1289 if gw.nid != self.nid:
1292 except CommandError, e:
1293 print "disconnect failed: ", self.name
1297 def safe_to_clean(self):
1298 return not is_network_prepared()
1301 self.info(self.net_type, self.nid, self.port)
1303 stop_acceptor(self.port)
1304 if node_is_router():
1305 self.disconnect_peer_gateways()
1306 if self.net_type == 'tcp':
1307 for hostaddr in self.db.get_hostaddr():
1308 ip = string.split(hostaddr, '/')[0]
1309 lctl.del_interface(self.net_type, ip)
1311 class RouteTable(Module):
1312 def __init__(self,db):
1313 Module.__init__(self, 'ROUTES', db)
1315 def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
1317 # only setup connections for tcp, ib, and ra NALs
1319 if not net_type in ('tcp','openib','iib','vib','ra'):
1322 # connect to target if route is to single node and this node is the gw
1323 if lo == hi and local_interface(net_type, gw_cluster_id, gw):
1324 if not local_cluster(net_type, tgt_cluster_id):
1325 panic("target", lo, " not on the local cluster")
1326 srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
1327 # connect to gateway if this node is not the gw
1328 elif (local_cluster(net_type, gw_cluster_id)
1329 and not local_interface(net_type, gw_cluster_id, gw)):
1330 srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
1335 panic("no server for nid", lo)
1338 return Network(srvdb)
1341 if is_network_prepared():
1344 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1345 lctl.add_route(net_type, gw, lo, hi)
1346 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1350 def safe_to_clean(self):
1351 return not is_network_prepared()
1354 if is_network_prepared():
1355 # the network is still being used, don't clean it up
1357 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1358 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1361 lctl.disconnect(srv)
1362 except CommandError, e:
1363 print "disconnect failed: ", self.name
1368 lctl.del_route(net_type, gw, lo, hi)
1369 except CommandError, e:
1370 print "del_route failed: ", self.name
1374 class Management(Module):
1375 def __init__(self, db):
1376 Module.__init__(self, 'MGMT', db)
1377 self.add_lustre_module('lvfs', 'lvfs')
1378 self.add_lustre_module('obdclass', 'obdclass')
1379 self.add_lustre_module('ptlrpc', 'ptlrpc')
1380 self.add_lustre_module('mgmt', 'mgmt_svc')
1383 if is_prepared(self.name):
1386 lctl.newdev("mgmt", self.name, self.uuid)
1388 def safe_to_clean(self):
1392 if is_prepared(self.name):
1393 Module.cleanup(self)
1395 # This is only needed to load the modules; the LDLM device
1396 # is now created automatically.
1398 def __init__(self,db):
1399 Module.__init__(self, 'LDLM', db)
1400 self.add_lustre_module('lvfs', 'lvfs')
1401 self.add_lustre_module('obdclass', 'obdclass')
1402 self.add_lustre_module('ptlrpc', 'ptlrpc')
1411 def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
1412 Module.__init__(self, 'LOV', db)
1413 if name_override != None:
1414 self.name = "lov_%s" % name_override
1415 self.add_lustre_module('lov', 'lov')
1416 self.mds_uuid = self.db.get_first_ref('mds')
1417 self.stripe_sz = self.db.get_val_int('stripesize', 1048576)
1418 self.stripe_off = self.db.get_val_int('stripeoffset', 0)
1419 self.pattern = self.db.get_val_int('stripepattern', 0)
1420 self.devlist = self.db.get_refs('obd')
1421 self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
1423 self.desc_uuid = self.uuid
1424 self.uuid = generate_client_uuid(self.name)
1425 self.fs_name = fs_name
1427 self.config_only = 1
1429 self.config_only = None
1430 mds= self.db.lookup(self.mds_uuid)
1431 self.mds_name = mds.getName()
1432 for obd_uuid in self.devlist:
1433 obd = self.db.lookup(obd_uuid)
1434 osc = get_osc(obd, self.uuid, fs_name)
1436 self.osclist.append(osc)
1438 panic('osc not found:', obd_uuid)
1441 if is_prepared(self.name):
1443 if self.config_only:
1444 panic("Can't prepare config_only LOV ", self.name)
1446 for osc in self.osclist:
1448 # Only ignore connect failures with --force, which
1449 # isn't implemented here yet.
1450 osc.prepare(ignore_connect_failure=0)
1451 except CommandError, e:
1452 print "Error preparing OSC %s\n" % osc.uuid
1454 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
1455 self.stripe_off, self.pattern, self.devlist, self.mds_name)
1456 lctl.lov_setup(self.name, self.uuid,
1457 self.desc_uuid, self.mds_name, self.stripe_cnt,
1458 self.stripe_sz, self.stripe_off, self.pattern,
1459 string.join(self.devlist))
1462 if is_prepared(self.name):
1463 Module.cleanup(self)
1464 if self.config_only:
1465 panic("Can't clean up config_only LOV ", self.name)
1466 for osc in self.osclist:
1469 def load_module(self):
1470 if self.config_only:
1471 panic("Can't load modules for config_only LOV ", self.name)
1472 for osc in self.osclist:
1475 Module.load_module(self)
1477 def cleanup_module(self):
1478 if self.config_only:
1479 panic("Can't cleanup modules for config_only LOV ", self.name)
1480 Module.cleanup_module(self)
1481 for osc in self.osclist:
1482 osc.cleanup_module()
1485 class MDSDEV(Module):
1486 def __init__(self,db):
1487 Module.__init__(self, 'MDSDEV', db)
1488 self.devpath = self.db.get_val('devpath','')
1489 self.size = self.db.get_val_int('devsize', 0)
1490 self.journal_size = self.db.get_val_int('journalsize', 0)
1491 self.fstype = self.db.get_val('fstype', '')
1492 self.nspath = self.db.get_val('nspath', '')
1493 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1494 self.mountfsoptions = self.db.get_val('mountfsoptions', '')
# overwrite the original MDSDEV name and uuid with the MDS name and uuid
1496 target_uuid = self.db.get_first_ref('target')
1497 mds = self.db.lookup(target_uuid)
1498 self.name = mds.getName()
1499 self.filesystem_uuids = mds.get_refs('filesystem')
1500 # FIXME: if fstype not set, then determine based on kernel version
1501 self.format = self.db.get_val('autoformat', "no")
1502 if mds.get_val('failover', 0):
1503 self.failover_mds = 'f'
1505 self.failover_mds = 'n'
1506 active_uuid = get_active_target(mds)
1508 panic("No target device found:", target_uuid)
1509 if active_uuid == self.uuid:
1513 if self.active and config.group and config.group != mds.get_val('group'):
1516 self.inode_size = self.db.get_val_int('inodesize', 0)
1517 if self.inode_size == 0:
1518 # find the LOV for this MDS
1519 lovconfig_uuid = mds.get_first_ref('lovconfig')
1520 if not lovconfig_uuid:
1521 panic("No LOV config found for MDS ", mds.name)
1522 lovconfig = mds.lookup(lovconfig_uuid)
1523 lov_uuid = lovconfig.get_first_ref('lov')
1525 panic("No LOV found for lovconfig ", lovconfig.name)
1526 lov = LOV(self.db.lookup(lov_uuid), lov_uuid, 'FS_name', config_only = 1)
1528 # default stripe count controls default inode_size
1529 if (lov.stripe_cnt > 0):
1530 stripe_count = lov.stripe_cnt
1532 stripe_count = len(lov.devlist)
1533 if stripe_count > 77:
1534 self.inode_size = 4096
1535 elif stripe_count > 34:
1536 self.inode_size = 2048
1537 elif stripe_count > 13:
1538 self.inode_size = 1024
1539 elif stripe_count > 2:
1540 self.inode_size = 512
1542 self.inode_size = 256
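# (the wider the default stripe count chosen above, the more room the LOV EA
# needs in each MDS inode, hence the scaled-up default inode size)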
1544 self.target_dev_uuid = self.uuid
1545 self.uuid = target_uuid
1548 self.add_lustre_module('mdc', 'mdc')
1549 self.add_lustre_module('osc', 'osc')
1550 self.add_lustre_module('lov', 'lov')
1551 self.add_lustre_module('mds', 'mds')
1552 if self.fstype == 'ldiskfs':
1553 self.add_lustre_module('ldiskfs', 'ldiskfs')
1555 self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
1557 def load_module(self):
1559 Module.load_module(self)
1562 if is_prepared(self.name):
1565 debug(self.uuid, "not active")
1568 # run write_conf automatically, if --reformat used
1570 self.info(self.devpath, self.fstype, self.size, self.format)
1572 # never reformat here
1573 blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
1574 self.format, self.journal_size, self.inode_size,
1576 if not is_prepared('MDT'):
1577 lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
1579 mountfsoptions = def_mount_options(self.fstype, 'mds')
1581 if config.mountfsoptions:
1583 mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
1585 mountfsoptions = config.mountfsoptions
1586 if self.mountfsoptions:
1587 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1589 if self.mountfsoptions:
1591 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1593 mountfsoptions = self.mountfsoptions
1595 print 'MDS mount options: ' + mountfsoptions
1597 lctl.newdev("mds", self.name, self.uuid,
1598 setup ="%s %s %s %s" %(blkdev, self.fstype, self.name, mountfsoptions))
1599 except CommandError, e:
1601 panic("MDS is missing the config log. Need to run " +
1602 "lconf --write_conf.")
1606 def write_conf(self):
1607 if is_prepared(self.name):
1609 self.info(self.devpath, self.fstype, self.format)
1610 blkdev = block_dev(self.devpath, self.size, self.fstype,
1611 config.reformat, self.format, self.journal_size,
1612 self.inode_size, self.mkfsoptions)
1613 lctl.newdev("mds", self.name, self.uuid,
1614 setup ="%s %s" %(blkdev, self.fstype))
1616 # record logs for the MDS lov
1617 for uuid in self.filesystem_uuids:
1618 log("recording clients for filesystem:", uuid)
1619 fs = self.db.lookup(uuid)
1620 obd_uuid = fs.get_first_ref('obd')
1621 client_uuid = generate_client_uuid(self.name)
1622 client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
1625 lctl.clear_log(self.name, self.name)
1626 lctl.record(self.name, self.name)
1628 lctl.mount_option(self.name, client.get_name(), "")
1632 lctl.clear_log(self.name, self.name + '-clean')
1633 lctl.record(self.name, self.name + '-clean')
1635 lctl.del_mount_option(self.name)
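# Two llogs are recorded on the MDS device: "<name>" holds the setup
# (mount_option) commands and "<name>-clean" holds the matching cleanup.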
1640 # record logs for each client
1642 config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
1644 config_options = CONFIG_FILE
1646 for node_db in self.db.lookup_class('node'):
1647 client_name = node_db.getName()
1648 for prof_uuid in node_db.get_refs('profile'):
1649 prof_db = node_db.lookup(prof_uuid)
# refactor this into a function to test "clientness"
1652 for ref_class, ref_uuid in prof_db.get_all_refs():
1653 if ref_class in ('mountpoint','echoclient'):
1654 debug("recording", client_name)
1655 old_noexec = config.noexec
1657 noexec_opt = ('', '-n')
1658 ret, out = run (sys.argv[0],
1659 noexec_opt[old_noexec == 1],
1660 " -v --record --nomod",
1661 "--record_log", client_name,
1662 "--record_device", self.name,
1663 "--node", client_name,
1666 lctl.clear_log(self.name, client_name)
1669 panic("Record client log %s on %s failed" %(
1670 client_name, self.name))
1672 for s in out: log("record> ", string.strip(s))
1673 ret, out = run (sys.argv[0],
1674 noexec_opt[old_noexec == 1],
1675 "--cleanup -v --record --nomod",
1676 "--record_log", client_name + "-clean",
1677 "--record_device", self.name,
1678 "--node", client_name,
# In this case 0-conf mount works, but 0-conf umount does not, so the user
# would be forced to clean up the client service by hand every time. Instead,
# delete both llogs together and let the user run --write_conf again.
1685 lctl.clear_log(self.name, client_name)
1686 lctl.clear_log(self.name, client_name + '-clean')
1689 panic("Record client log %s on %s failed" %(
1690 client_name + '-clean', self.name))
1692 for s in out: log("record> ", string.strip(s))
1693 config.noexec = old_noexec
1695 lctl.cleanup(self.name, self.uuid, 0, 0)
1696 except CommandError, e:
1697 log(self.module_name, "cleanup failed: ", self.name)
1700 Module.cleanup(self)
1701 clean_loop(self.devpath)
1703 def msd_remaining(self):
1704 out = lctl.device_list()
1706 if string.split(s)[2] in ('mds',):
1709 def safe_to_clean(self):
1712 def safe_to_clean_modules(self):
1713 return not self.msd_remaining()
1717 debug(self.uuid, "not active")
1720 if is_prepared(self.name):
1722 lctl.cleanup(self.name, self.uuid, config.force,
1724 except CommandError, e:
1725 log(self.module_name, "cleanup failed: ", self.name)
1728 Module.cleanup(self)
1729 if not self.msd_remaining() and is_prepared('MDT'):
1731 lctl.cleanup("MDT", "MDT_UUID", config.force,
1733 except CommandError, e:
1734 print "cleanup failed: ", self.name
1737 clean_loop(self.devpath)
1740 def __init__(self, db):
1741 Module.__init__(self, 'OSD', db)
1742 self.osdtype = self.db.get_val('osdtype')
1743 self.devpath = self.db.get_val('devpath', '')
1744 self.size = self.db.get_val_int('devsize', 0)
1745 self.journal_size = self.db.get_val_int('journalsize', 0)
1746 self.inode_size = self.db.get_val_int('inodesize', 0)
1747 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1748 self.mountfsoptions = self.db.get_val('mountfsoptions', '')
1749 self.fstype = self.db.get_val('fstype', '')
1750 self.nspath = self.db.get_val('nspath', '')
1751 target_uuid = self.db.get_first_ref('target')
1752 ost = self.db.lookup(target_uuid)
1753 self.name = ost.getName()
1754 self.format = self.db.get_val('autoformat', 'yes')
1755 if ost.get_val('failover', 0):
1756 self.failover_ost = 'f'
1758 self.failover_ost = 'n'
1760 active_uuid = get_active_target(ost)
1762 panic("No target device found:", target_uuid)
1763 if active_uuid == self.uuid:
1767 if self.active and config.group and config.group != ost.get_val('group'):
1770 self.target_dev_uuid = self.uuid
1771 self.uuid = target_uuid
1773 self.add_lustre_module('ost', 'ost')
1774 # FIXME: should we default to ext3 here?
1775 if self.fstype == 'ldiskfs':
1776 self.add_lustre_module('ldiskfs', 'ldiskfs')
1778 self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
1779 self.add_lustre_module(self.osdtype, self.osdtype)
1781 def load_module(self):
1783 Module.load_module(self)
1785 # need to check /proc/mounts and /etc/mtab before
1786 # formatting anything.
1787 # FIXME: check if device is already formatted.
1789 if is_prepared(self.name):
1792 debug(self.uuid, "not active")
1794 self.info(self.osdtype, self.devpath, self.size, self.fstype,
1795 self.format, self.journal_size, self.inode_size)
1797 if self.osdtype == 'obdecho':
1800 blkdev = block_dev(self.devpath, self.size, self.fstype,
1801 config.reformat, self.format, self.journal_size,
1802 self.inode_size, self.mkfsoptions)
1804 mountfsoptions = def_mount_options(self.fstype, 'ost')
1806 if config.mountfsoptions:
1808 mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
1810 mountfsoptions = config.mountfsoptions
1811 if self.mountfsoptions:
1812 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1814 if self.mountfsoptions:
1816 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1818 mountfsoptions = self.mountfsoptions
1820 print 'OST mount options: ' + mountfsoptions
1822 lctl.newdev(self.osdtype, self.name, self.uuid,
1823 setup ="%s %s %s %s" %(blkdev, self.fstype,
1824 self.failover_ost, mountfsoptions))
1825 if not is_prepared('OSS'):
1826 lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
1828 def osd_remaining(self):
1829 out = lctl.device_list()
1831 if string.split(s)[2] in ('obdfilter', 'obdecho'):
1834 def safe_to_clean(self):
1837 def safe_to_clean_modules(self):
1838 return not self.osd_remaining()
1842 debug(self.uuid, "not active")
1844 if is_prepared(self.name):
1847 lctl.cleanup(self.name, self.uuid, config.force,
1849 except CommandError, e:
1850 log(self.module_name, "cleanup failed: ", self.name)
1853 if not self.osd_remaining() and is_prepared('OSS'):
1855 lctl.cleanup("OSS", "OSS_UUID", config.force,
1857 except CommandError, e:
1858 print "cleanup failed: ", self.name
1861 if not self.osdtype == 'obdecho':
1862 clean_loop(self.devpath)
1864 def mgmt_uuid_for_fs(mtpt_name):
1867 mtpt_db = toplustreDB.lookup_name(mtpt_name)
1868 fs_uuid = mtpt_db.get_first_ref('filesystem')
1869 fs = toplustreDB.lookup(fs_uuid)
1872 return fs.get_first_ref('mgmt')
1874 # Generic client module, used by OSC and MDC
1875 class Client(Module):
1876 def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
1878 self.target_name = tgtdb.getName()
1879 self.target_uuid = tgtdb.getUUID()
1882 self.tgt_dev_uuid = get_active_target(tgtdb)
1883 if not self.tgt_dev_uuid:
1884 panic("No target device found for target:", self.target_name)
1886 self.kmod = kmod(config.lustre, config.portals)
1890 self.module = module
1891 self.module_name = string.upper(module)
1893 self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
1894 self.target_name, fs_name)
1896 self.name = self_name
1898 self.lookup_server(self.tgt_dev_uuid)
1899 mgmt_uuid = mgmt_uuid_for_fs(fs_name)
1901 self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
1904 self.fs_name = fs_name
1907 self.add_lustre_module(module_dir, module)
1909 def lookup_server(self, srv_uuid):
1910 """ Lookup a server's network information """
1911 self._server_nets = get_ost_net(self.db, srv_uuid)
1912 if len(self._server_nets) == 0:
1913 panic("Unable to find a server for:", srv_uuid)
1915 def get_servers(self):
1916 return self._server_nets
1918 def prepare(self, ignore_connect_failure = 0):
1919 self.info(self.target_uuid)
1920 if is_prepared(self.name):
1923 srv = choose_local_server(self.get_servers())
1927 routes = find_route(self.get_servers())
1928 if len(routes) == 0:
1929 panic("no route to", self.target_uuid)
1930 for (srv, r) in routes:
1931 lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
1932 except CommandError, e:
1933 if not ignore_connect_failure:
1937 if self.target_uuid in config.inactive and self.permits_inactive():
1938 debug("%s inactive" % self.target_uuid)
1939 inactive_p = "inactive"
1941 debug("%s active" % self.target_uuid)
1943 lctl.newdev(self.module, self.name, self.uuid,
1944 setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
1945 inactive_p, self.mgmt_name))
1948 if is_prepared(self.name):
1949 Module.cleanup(self)
1951 srv = choose_local_server(self.get_servers())
1953 lctl.disconnect(srv)
1955 for (srv, r) in find_route(self.get_servers()):
1956 lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
1957 except CommandError, e:
1958 log(self.module_name, "cleanup failed: ", self.name)
1964 def __init__(self, db, uuid, fs_name):
1965 Client.__init__(self, db, uuid, 'mdc', fs_name)
1967 def permits_inactive(self):
1971 def __init__(self, db, uuid, fs_name):
1972 Client.__init__(self, db, uuid, 'osc', fs_name)
1974 def permits_inactive(self):
1977 def mgmtcli_name_for_uuid(uuid):
1978 return 'MGMTCLI_%s' % uuid
1980 class ManagementClient(Client):
1981 def __init__(self, db, uuid):
1982 Client.__init__(self, db, uuid, 'mgmt_cli', '',
1983 self_name = mgmtcli_name_for_uuid(db.getUUID()),
1984 module_dir = 'mgmt')
1987 def __init__(self, db):
1988 Module.__init__(self, 'COBD', db)
1989 self.real_uuid = self.db.get_first_ref('realobd')
1990 self.cache_uuid = self.db.get_first_ref('cacheobd')
1991 self.add_lustre_module('cobd' , 'cobd')
1993 # need to check /proc/mounts and /etc/mtab before
1994 # formatting anything.
1995 # FIXME: check if device is already formatted.
1997 if is_prepared(self.name):
1999 self.info(self.real_uuid, self.cache_uuid)
2000 lctl.newdev("cobd", self.name, self.uuid,
2001 setup ="%s %s" %(self.real_uuid, self.cache_uuid))
2004 # virtual interface for OSC and LOV
2006 def __init__(self, db, uuid, fs_name, name_override = None):
2007 Module.__init__(self, 'VOSC', db)
2008 if db.get_class() == 'lov':
2009 self.osc = LOV(db, uuid, fs_name, name_override)
2011 self.osc = get_osc(db, uuid, fs_name)
2013 return self.osc.uuid
2015 return self.osc.name
2020 def load_module(self):
2021 self.osc.load_module()
2022 def cleanup_module(self):
2023 self.osc.cleanup_module()
2026 class ECHO_CLIENT(Module):
2027 def __init__(self,db):
2028 Module.__init__(self, 'ECHO_CLIENT', db)
2029 self.add_lustre_module('obdecho', 'obdecho')
2030 self.obd_uuid = self.db.get_first_ref('obd')
2031 obd = self.db.lookup(self.obd_uuid)
2032 self.uuid = generate_client_uuid(self.name)
2033 self.osc = VOSC(obd, self.uuid, self.name)
2036 if is_prepared(self.name):
2039 self.osc.prepare() # XXX This is so cheating. -p
2040 self.info(self.obd_uuid)
2042 lctl.newdev("echo_client", self.name, self.uuid,
2043 setup = self.osc.get_name())
2046 if is_prepared(self.name):
2047 Module.cleanup(self)
2050 def load_module(self):
2051 self.osc.load_module()
2052 Module.load_module(self)
2054 def cleanup_module(self):
2055 Module.cleanup_module(self)
2056 self.osc.cleanup_module()
2059 def generate_client_uuid(name):
2060 client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
2062 int(random.random() * 1048576),
2063 int(random.random() * 1048576))
2064 return client_uuid[:36]
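# The generated uuid is "<rand>_<first 19 chars of name>_<rand><rand>", built
# from three 20-bit random hex fields and truncated to the 36-character limit.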
2067 def my_rstrip(s, chars):
2068 """my_rstrip(s, chars) -> strips any instances of the characters
2069 found in chars from the right side of string s"""
2070 # XXX required because python versions pre 2.2.3 don't allow
2071 #string.rstrip() to take alternate char lists
2075 ns = string.rstrip(s, '/')
2076 except TypeError, e:
2077 for i in range(len(s) - 1, 0, -1):
2086 class Mountpoint(Module):
2087 def __init__(self,db):
2088 Module.__init__(self, 'MTPT', db)
2089 self.path = my_rstrip(self.db.get_val('path'), '/')
2090 self.clientoptions = self.db.get_val('clientoptions', '')
2091 self.fs_uuid = self.db.get_first_ref('filesystem')
2092 fs = self.db.lookup(self.fs_uuid)
2093 self.mds_uuid = fs.get_first_ref('mds')
2094 self.obd_uuid = fs.get_first_ref('obd')
2095 self.mgmt_uuid = fs.get_first_ref('mgmt')
2096 obd = self.db.lookup(self.obd_uuid)
2097 client_uuid = generate_client_uuid(self.name)
2098 self.vosc = VOSC(obd, client_uuid, self.name)
2099 self.mdc = get_mdc(db, client_uuid, self.name, self.mds_uuid)
2101 self.add_lustre_module('mdc', 'mdc')
2102 self.add_lustre_module('llite', 'llite')
2104 self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
2110 if fs_is_mounted(self.path):
2111 log(self.path, "already mounted.")
2115 self.mgmtcli.prepare()
2118 mdc_name = self.mdc.name
2120 self.info(self.path, self.mds_uuid, self.obd_uuid)
2121 if config.record or config.lctl_dump:
2122 lctl.mount_option(local_node_name, self.vosc.get_name(), mdc_name)
2125 if config.clientoptions:
2126 if self.clientoptions:
2127 self.clientoptions = self.clientoptions + ',' + config.clientoptions
2129 self.clientoptions = config.clientoptions
2130 if self.clientoptions:
2131 self.clientoptions = ',' + self.clientoptions
# the Linux kernel handles "async" itself and does not pass it down to
# ll_fill_super, so replace it with the Lustre-specific "lasync"
2134 self.clientoptions = string.replace(self.clientoptions, "async", "lasync")
2136 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s%s %s %s" % \
2137 (self.vosc.get_name(), mdc_name, self.clientoptions, config.config, self.path)
2138 run("mkdir", self.path)
2143 panic("mount failed:", self.path, ":", string.join(val))
2146 self.info(self.path, self.mds_uuid,self.obd_uuid)
2148 if config.record or config.lctl_dump:
2149 lctl.del_mount_option(local_node_name)
2151 if fs_is_mounted(self.path):
2153 (rc, out) = run("umount", "-f", self.path)
2155 (rc, out) = run("umount", self.path)
2157 raise CommandError('umount', out, rc)
2159 if fs_is_mounted(self.path):
2160 panic("fs is still mounted:", self.path)
2165 self.mgmtcli.cleanup()
2167 def load_module(self):
2169 self.mgmtcli.load_module()
2170 self.vosc.load_module()
2171 Module.load_module(self)
2173 def cleanup_module(self):
2174 Module.cleanup_module(self)
2175 self.vosc.cleanup_module()
2177 self.mgmtcli.cleanup_module()
2180 # ============================================================
2181 # misc query functions
2183 def get_ost_net(self, osd_uuid):
2187 osd = self.lookup(osd_uuid)
2188 node_uuid = osd.get_first_ref('node')
2189 node = self.lookup(node_uuid)
2191 panic("unable to find node for osd_uuid:", osd_uuid,
2192 " node_ref:", node_uuid)
2193 for net_uuid in node.get_networks():
2194 db = node.lookup(net_uuid)
2195 srv_list.append(Network(db))
# the order of initialization is based on level.
2200 def getServiceLevel(self):
2201 type = self.get_class()
2203 if type in ('network',):
2205 elif type in ('routetbl',):
2207 elif type in ('ldlm',):
2209 elif type in ('mgmt',):
2211 elif type in ('osd', 'cobd'):
2213 elif type in ('mdsdev',):
2215 elif type in ('mountpoint', 'echoclient'):
2218 panic("Unknown type: ", type)
2220 if ret < config.minlevel or ret > config.maxlevel:
2225 # return list of services in a profile. list is a list of tuples
2226 # [(level, db_object),]
2227 def getServices(self):
2229 for ref_class, ref_uuid in self.get_all_refs():
2230 servdb = self.lookup(ref_uuid)
2232 level = getServiceLevel(servdb)
2234 list.append((level, servdb))
2236 panic('service not found: ' + ref_uuid)
2242 ############################################################
2244 # FIXME: clean this mess up!
2246 # OSC is no longer in the xml, so we have to fake it.
2247 # this is getting ugly and begging for another refactoring
2248 def get_osc(ost_db, uuid, fs_name):
2249 osc = OSC(ost_db, uuid, fs_name)
2252 def get_mdc(db, uuid, fs_name, mds_uuid):
2253 mds_db = db.lookup(mds_uuid);
2255 panic("no mds:", mds_uuid)
2256 mdc = MDC(mds_db, uuid, fs_name)
2259 ############################################################
2260 # routing ("rooting")
2262 # list of (nettype, cluster_id, nid)
2265 def find_local_clusters(node_db):
2266 global local_clusters
2267 for netuuid in node_db.get_networks():
2268 net = node_db.lookup(netuuid)
2270 debug("add_local", netuuid)
2271 local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
2273 if not acceptors.has_key(srv.port):
2274 acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type)
2276 # This node is a gateway.
2278 def node_is_router():
2281 # If there are any routers found in the config, then this will be true
2282 # and all nodes will load kptlrouter.
2284 def node_needs_router():
2285 return needs_router or is_router
2287 # list of (nettype, gw, tgt_cluster_id, lo, hi)
2288 # Currently, these local routes are only added to kptlrouter route
2289 # table if they are needed to connect to a specific server. This
2290 # should be changed so all available routes are loaded, and the
2291 # ptlrouter can make all the decisions.
2294 def find_local_routes(lustre):
2295 """ Scan the lustre config looking for routers . Build list of
2297 global local_routes, needs_router
2299 list = lustre.lookup_class('node')
2301 if router.get_val_int('router', 0):
2303 for (local_type, local_cluster_id, local_nid) in local_clusters:
2305 for netuuid in router.get_networks():
2306 db = router.lookup(netuuid)
2307 if (local_type == db.get_val('nettype') and
2308 local_cluster_id == db.get_val('clusterid')):
2309 gw = db.get_val('nid')
2312 debug("find_local_routes: gw is", gw)
2313 for route in router.get_local_routes(local_type, gw):
2314 local_routes.append(route)
2315 debug("find_local_routes:", local_routes)
2318 def choose_local_server(srv_list):
2319 for srv in srv_list:
2320 if local_cluster(srv.net_type, srv.cluster_id):
2323 def local_cluster(net_type, cluster_id):
2324 for cluster in local_clusters:
2325 if net_type == cluster[0] and cluster_id == cluster[1]:
2329 def local_interface(net_type, cluster_id, nid):
2330 for cluster in local_clusters:
2331 if (net_type == cluster[0] and cluster_id == cluster[1]
2332 and nid == cluster[2]):
2336 def find_route(srv_list):
2338 frm_type = local_clusters[0][0]
2339 for srv in srv_list:
2340 debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
2341 to_type = srv.net_type
2343 cluster_id = srv.cluster_id
2344 debug ('looking for route to', to_type, to)
2345 for r in local_routes:
2346 debug("find_route: ", r)
2347 if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
2348 result.append((srv, r))
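# a route entry (nettype, gw, tgt_cluster_id, lo, hi) matches a server whose
# nid falls within [lo, hi] and whose cluster id equals tgt_cluster_id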
2351 def get_active_target(db):
2352 target_uuid = db.getUUID()
2353 target_name = db.getName()
2354 node_name = get_select(target_name)
2356 tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
2358 tgt_dev_uuid = db.get_first_ref('active')
2361 def get_server_by_nid_uuid(db, nid_uuid):
2362 for n in db.lookup_class("network"):
2364 if net.nid_uuid == nid_uuid:
2368 ############################################################
2372 type = db.get_class()
2373 debug('Service:', type, db.getName(), db.getUUID())
2378 n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
2379 elif type == 'network':
2381 elif type == 'routetbl':
2385 elif type == 'cobd':
2387 elif type == 'mdsdev':
2389 elif type == 'mountpoint':
2391 elif type == 'echoclient':
2393 elif type == 'mgmt':
2396 panic("unknown service type:", type)
# Prepare the system to run lustre using a particular profile
# in the configuration.
#  * load the modules
#  * setup networking for the current node
#  * make sure partitions are in place and prepared
#  * initialize devices with lctl
# Levels are important, and need to be enforced.
2407 def for_each_profile(db, prof_list, operation):
2408 for prof_uuid in prof_list:
2409 prof_db = db.lookup(prof_uuid)
2411 panic("profile:", prof_uuid, "not found.")
2412 services = getServices(prof_db)
2415 def doWriteconf(services):
2419 if s[1].get_class() == 'mdsdev':
2420 n = newService(s[1])
2423 def doSetup(services):
2427 n = newService(s[1])
2430 def doModules(services):
2434 n = newService(s[1])
2437 def doCleanup(services):
2442 n = newService(s[1])
2443 if n.safe_to_clean():
2446 def doUnloadModules(services):
2451 n = newService(s[1])
2452 if n.safe_to_clean_modules():
2457 def doHost(lustreDB, hosts):
2458 global is_router, local_node_name
2461 node_db = lustreDB.lookup_name(h, 'node')
2465 panic('No host entry found.')
2467 local_node_name = node_db.get_val('name', 0)
2468 is_router = node_db.get_val_int('router', 0)
2469 lustre_upcall = node_db.get_val('lustreUpcall', '')
2470 portals_upcall = node_db.get_val('portalsUpcall', '')
2471 timeout = node_db.get_val_int('timeout', 0)
2472 ptldebug = node_db.get_val('ptldebug', '')
2473 subsystem = node_db.get_val('subsystem', '')
2475 find_local_clusters(node_db)
2477 find_local_routes(lustreDB)
2479 # Two-step process: (1) load modules, (2) set up lustre
2480 # if not cleaning up, load the modules first.
2481 prof_list = node_db.get_refs('profile')
2483 if config.write_conf:
2484 for_each_profile(node_db, prof_list, doModules)
2486 for_each_profile(node_db, prof_list, doWriteconf)
2487 for_each_profile(node_db, prof_list, doUnloadModules)
2490 elif config.recover:
2491 if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
2492 raise Lustre.LconfError("--recover requires --tgt_uuid <UUID> " +
2493 "--client_uuid <UUID> --conn_uuid <UUID>")
2494 doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
2496 elif config.cleanup:
2497 if not mod_loaded('portals'):
2501 # the command line can override this value
2503 # ugly hack, only need to run lctl commands for --dump
2504 if config.lctl_dump or config.record:
2505 for_each_profile(node_db, prof_list, doCleanup)
2508 sys_set_timeout(timeout)
2509 sys_set_ptldebug(ptldebug)
2510 sys_set_subsystem(subsystem)
2511 sys_set_lustre_upcall(lustre_upcall)
2512 sys_set_portals_upcall(portals_upcall)
2514 for_each_profile(node_db, prof_list, doCleanup)
2515 for_each_profile(node_db, prof_list, doUnloadModules)
2519 # ugly hack, only need to run lctl commands for --dump
2520 if config.lctl_dump or config.record:
2521 sys_set_timeout(timeout)
2522 sys_set_lustre_upcall(lustre_upcall)
2523 for_each_profile(node_db, prof_list, doSetup)
2527 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
2528 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
2530 for_each_profile(node_db, prof_list, doModules)
2532 sys_set_debug_path()
2533 sys_set_ptldebug(ptldebug)
2534 sys_set_subsystem(subsystem)
2535 script = config.gdb_script
2536 run(lctl.lctl, ' modules >', script)
2538 log ("The GDB module script is in", script)
2539 # pause, so the user has time to break in and run the gdb script
2542 sys_set_timeout(timeout)
2543 sys_set_lustre_upcall(lustre_upcall)
2544 sys_set_portals_upcall(portals_upcall)
2546 for_each_profile(node_db, prof_list, doSetup)
2549 def doRecovery(lustreDB, lctl, tgt_uuid, client_uuid, nid_uuid):
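# Recovery outline: look up the failed target, find its currently active
# replacement, choose a server connection reachable from a local cluster,
# drop the old connection if one is still known, then ask lctl to recover
# the client onto the new nid.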
2550 tgt = lustreDB.lookup(tgt_uuid)
2552 raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
2553 new_uuid = get_active_target(tgt)
2555 raise Lustre.LconfError("doRecovery: no active target found for: " +
2557 net = choose_local_server(get_ost_net(lustreDB, new_uuid))
2559 raise Lustre.LconfError("Unable to find a connection to: " + new_uuid)
2561 log("Reconnecting", tgt_uuid, " to ", net.nid_uuid)
2563 oldnet = get_server_by_nid_uuid(lustreDB, nid_uuid)
2566 lctl.disconnect(oldnet)
2567 except CommandError, e:
2568 log("recover: disconnect", nid_uuid, "failed: ")
2573 except CommandError, e:
2574 log("recover: connect failed")
2577 lctl.recover(client_uuid, net.nid_uuid)
2580 def setupModulePath(cmd, portals_dir = PORTALS_DIR):
2581 base = os.path.dirname(cmd)
2582 if development_mode():
2583 if not config.lustre:
2584 debug('using objdir module paths')
2585 config.lustre = (os.path.join(base, ".."))
2586 # normalize the portals dir, using command line arg if set
2588 portals_dir = config.portals
2589 dir = os.path.join(config.lustre, portals_dir)
2590 config.portals = dir
2591 debug('config.portals', config.portals)
2592 elif config.lustre and config.portals:
2594 # if --lustre and --portals are both given, normalize portals
2595 # we can ignore PORTALS_DIR here, since it is probably not useful here
2596 config.portals = os.path.join(config.lustre, config.portals)
2597 debug('config.portals B', config.portals)
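# Hypothetical example: an in-tree ./lconf run from /src/lustre/utils would
# set config.lustre to /src/lustre/utils/.. and derive config.portals by
# joining that with the portals directory.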
2599 def sysctl(path, val):
2600 debug("+ sysctl", path, val)
2604 fp = open(os.path.join('/proc/sys', path), 'w')
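# e.g., sysctl('portals/debug', '0x400') would write that value into
# /proc/sys/portals/debug (illustrative value; the helpers below pass in
# masks built from the name tables near the top of this file)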
2611 def sys_set_debug_path():
2612 sysctl('portals/debug_path', config.debug_path)
2614 def sys_set_lustre_upcall(upcall):
2615 # the command line value overrides the value in the node config
2616 if config.lustre_upcall:
2617 upcall = config.lustre_upcall
2619 upcall = config.upcall
2621 lctl.set_lustre_upcall(upcall)
2623 def sys_set_portals_upcall(upcall):
2624 # the command line value overrides the value in the node config
2625 if config.portals_upcall:
2626 upcall = config.portals_upcall
2628 upcall = config.upcall
2630 sysctl('portals/upcall', upcall)
2632 def sys_set_timeout(timeout):
2633 # the command line value overrides the value in the node config
2634 if config.timeout and config.timeout > 0:
2635 timeout = config.timeout
2636 if timeout != None and timeout > 0:
2637 lctl.set_timeout(timeout)
2639 def sys_tweak_socknal ():
2640 if config.single_socket:
2641 sysctl("socknal/typed", 0)
2643 def sys_optimize_elan ():
2644 procfiles = ["/proc/elan/config/eventint_punt_loops",
2645 "/proc/qsnet/elan3/config/eventint_punt_loops",
2646 "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]
2648 if os.access(p, os.W_OK):
2649 run ("echo 1 > " + p)
2651 def sys_set_ptldebug(ptldebug):
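# the debug level may be given symbolically, e.g. 'dlmtrace|vfstrace'
# (illustrative); eval() against ptldebug_names ORs the named bits together,
# and the result is written to portals/debug as a hex mask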
2653 ptldebug = config.ptldebug
2656 val = eval(ptldebug, ptldebug_names)
2657 val = "0x%x" % (val)
2658 sysctl('portals/debug', val)
2659 except NameError, e:
2662 def sys_set_subsystem(subsystem):
2663 if config.subsystem:
2664 subsystem = config.subsystem
2667 val = eval(subsystem, subsystem_names)
2668 val = "0x%x" % (val)
2669 sysctl('portals/subsystem_debug', val)
2670 except NameError, e:
2673 def sys_set_netmem_max(path, max):
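# raise the given /proc/sys/net/core limit to at least `max`; used earlier
# in doHost() to bump rmem_max/wmem_max to MAXTCPBUF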
2674 debug("setting", path, "to at least", max)
2682 fp = open(path, 'w')
2683 fp.write('%d\n' %(max))
2687 def sys_make_devices():
2688 if not os.access('/dev/portals', os.R_OK):
2689 run('mknod /dev/portals c 10 240')
2690 if not os.access('/dev/obd', os.R_OK):
2691 run('mknod /dev/obd c 10 241')
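# /dev/portals (char 10,240) and /dev/obd (char 10,241) are the control
# device nodes used by the portals and obd ioctl interfaces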
2694 # Add dir to the global PATH, if not already there.
2695 def add_to_path(new_dir):
2696 syspath = string.split(os.environ['PATH'], ':')
2697 if new_dir in syspath:
2699 os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
2701 def default_debug_path():
2702 path = '/tmp/lustre-log'
2703 if os.path.isdir('/r'):
2708 def default_gdb_script():
2709 script = '/tmp/ogdb'
2710 if os.path.isdir('/r'):
2711 return '/r' + script
2716 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
2717 # ensure basic elements are in the system path
2718 def sanitise_path():
2719 for dir in DEFAULT_PATH:
2722 # global hack for the --select handling
2724 def init_select(args):
2725 # args = [service=nodeA,service2=nodeB service3=nodeC]
2728 list = string.split(arg, ',')
2730 srv, node = string.split(entry, '=')
2731 tgt_select[srv] = node
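# Illustrative: init_select(['mds1=nodeA,ost1=nodeB']) (hypothetical service
# and node names) leaves tgt_select = {'mds1': 'nodeA', 'ost1': 'nodeB'}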
2733 def get_select(srv):
2734 if tgt_select.has_key(srv):
2735 return tgt_select[srv]
2739 FLAG = Lustre.Options.FLAG
2740 PARAM = Lustre.Options.PARAM
2741 INTPARAM = Lustre.Options.INTPARAM
2742 PARAMLIST = Lustre.Options.PARAMLIST
2744 ('verbose,v', "Print system commands as they are run"),
2745 ('ldapurl',"LDAP server URL, e.g. ldap://localhost", PARAM),
2746 ('config', "Cluster config name used for LDAP query", PARAM),
2747 ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
2748 ('node', "Load config for <nodename>", PARAM),
2749 ('cleanup,d', "Cleans up config. (Shutdown)"),
2750 ('force,f', "Forced unmounting and/or obd detach during cleanup",
2752 ('single_socket', "socknal option: only use one socket instead of a bundle",
2754 ('failover',"""Used to shut down without saving state.
2755 This will allow this node to "give up" a service to
2756 another node for failover purposes. This will not
2757 be a clean shutdown.""",
2759 ('gdb', """Prints message after creating gdb module script
2760 and sleeps for 5 seconds."""),
2761 ('noexec,n', """Prints the commands and steps that will be run for a
2762 config without executing them. This can be used to check whether a
2763 config file is doing what it should be doing."""),
2764 ('nomod', "Skip load/unload module step."),
2765 ('nosetup', "Skip device setup/cleanup step."),
2766 ('reformat', "Reformat all devices (without question)"),
2767 ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
2768 ('mountfsoptions', "Additional options for mount fs command line", PARAM),
2769 ('clientoptions', "Additional options for Lustre clients", PARAM),
2770 ('dump', "Dump the kernel debug log to file before portals is unloaded",
2772 ('write_conf', "Save all the client config information on mds."),
2773 ('record', "Write config information on mds."),
2774 ('record_log', "Name of config record log.", PARAM),
2775 ('record_device', "MDS device name that will record the config commands",
2777 ('minlevel', "Minimum level of services to configure/cleanup",
2779 ('maxlevel', """Maximum level of services to configure/cleanup
2780 Levels are approximately like:
2785 70 - mountpoint, echo_client, osc, mdc, lov""",
2787 ('lustre', """Base directory of lustre sources. This parameter will
2788 cause lconf to load modules from a source tree.""", PARAM),
2789 ('portals', """Portals source directory. If this is a relative path,
2790 then it is assumed to be relative to lustre. """, PARAM),
2791 ('timeout', "Set recovery timeout", INTPARAM),
2792 ('upcall', "Set both portals and lustre upcall script", PARAM),
2793 ('lustre_upcall', "Set lustre upcall script", PARAM),
2794 ('portals_upcall', "Set portals upcall script", PARAM),
2795 ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
2796 ('ptldebug', "Set the portals debug level", PARAM),
2797 ('subsystem', "Set the portals debug subsystem", PARAM),
2798 ('gdb_script', "Full name of the gdb debug script", PARAM, default_gdb_script()),
2799 ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
2800 # Client recovery options
2801 ('recover', "Recover a device"),
2802 ('group', "The group of devices to configure or cleanup", PARAM),
2803 ('tgt_uuid', "The failed target (required for recovery)", PARAM),
2804 ('client_uuid', "The failed client (required for recovery)", PARAM),
2805 ('conn_uuid', "The failed connection (required for recovery)", PARAM),
2807 ('inactive', """The name of an inactive service, to be ignored during
2808 mounting (currently OST-only). Can be repeated.""",
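# Typical invocations (illustrative, not exhaustive):
#   lconf --reformat --node nodeA config.xml    # format devices and start services
#   lconf --cleanup --node nodeA config.xml     # shut the same services down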
def main():
2813 global lctl, config, toplustreDB, CONFIG_FILE
2815 # in the upcall this is set to SIG_IGN
2816 signal.signal(signal.SIGCHLD, signal.SIG_DFL)
2818 cl = Lustre.Options("lconf", "config.xml", lconf_options)
2820 config, args = cl.parse(sys.argv[1:])
2821 except Lustre.OptionError, e:
2825 setupModulePath(sys.argv[0])
2827 host = socket.gethostname()
2829 # the PRNG is normally seeded with time(), which is not so good for starting
2830 # time-synchronized clusters
2831 input = open('/dev/urandom', 'r')
2833 print 'Unable to open /dev/urandom!'
2835 seed = input.read(32)
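# the bytes read above are presumably fed to Python's PRNG; a minimal
# sketch of that (assumed) step:
#   random.seed(seed)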
2841 init_select(config.select)
2844 # allow config to be fetched via HTTP, but only with python2
2845 if sys.version[0] != '1' and args[0].startswith('http://'):
import urllib2
try:
2848 config_file = urllib2.urlopen(args[0])
2849 except (urllib2.URLError, socket.error), err:
2850 if hasattr(err, 'args'):
2852 print "Could not access '%s': %s" %(args[0], err)
2854 elif not os.access(args[0], os.R_OK):
2855 print 'Config file not found or not readable:', args[0]
2859 config_file = open(args[0], 'r')
2861 dom = xml.dom.minidom.parse(config_file)
2863 panic("%s does not appear to be a config file." % (args[0]))
2864 sys.exit(1) # make sure to die here, even in debug mode.
2866 CONFIG_FILE = args[0]
2867 lustreDB = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
2868 if not config.config:
2869 config.config = os.path.basename(args[0])  # use full path?
2870 if config.config[-4:] == '.xml':
2871 config.config = config.config[:-4]
2872 elif config.ldapurl:
2873 if not config.config:
2874 panic("--ldapurl requires --config name")
2875 dn = "config=%s,fs=lustre" % (config.config)
2876 lustreDB = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
2877 elif config.ptldebug or config.subsystem:
2878 sys_set_ptldebug(None)
2879 sys_set_subsystem(None)
2882 print 'Missing config file or ldap URL.'
2883 print 'see lconf --help for command summary'
2886 toplustreDB = lustreDB
2888 ver = lustreDB.get_version()
2890 panic("No version found in config data, please recreate.")
2891 if ver != Lustre.CONFIG_VERSION:
2892 panic("Config version", ver, "does not match lconf version",
2893 Lustre.CONFIG_VERSION)
2897 node_list.append(config.node)
2900 node_list.append(host)
2901 node_list.append('localhost')
2903 debug("configuring for host: ", node_list)
2906 config.debug_path = config.debug_path + '-' + host
2907 config.gdb_script = config.gdb_script + '-' + host
2909 lctl = LCTLInterface('lctl')
2911 if config.lctl_dump:
2912 lctl.use_save_file(config.lctl_dump)
2915 if not (config.record_device and config.record_log):
2916 panic("When recording, both --record_log and --record_device must be specified.")
2917 lctl.clear_log(config.record_device, config.record_log)
2918 lctl.record(config.record_device, config.record_log)
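# with --record, the named config log on the record device is first cleared
# and then opened for recording, so the lctl commands issued below are
# captured in that log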
2920 doHost(lustreDB, node_list)
2925 if __name__ == "__main__":
try:
main()
2928 except Lustre.LconfError, e:
2930 # traceback.print_exc(file=sys.stdout)
2932 except CommandError, e:
2936 if first_cleanup_error:
2937 sys.exit(first_cleanup_error)