3 # Copyright (C) 2002-2003 Cluster File Systems, Inc.
4 # Authors: Robert Read <rread@clusterfs.com>
5 # Mike Shaver <shaver@clusterfs.com>
6 # This file is part of Lustre, http://www.lustre.org.
8 # Lustre is free software; you can redistribute it and/or
9 # modify it under the terms of version 2 of the GNU General Public
10 # License as published by the Free Software Foundation.
12 # Lustre is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Lustre; if not, write to the Free Software
19 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 # lconf - lustre configuration tool
23 # lconf is the main driver script for starting and stopping
24 # lustre filesystem services.
26 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import sys, getopt, types
29 import string, os, stat, popen2, socket, time, random, fcntl, select
30 import re, exceptions, signal, traceback
31 import xml.dom.minidom
33 if sys.version[0] == '1':
34 from FCNTL import F_GETFL, F_SETFL
36 from fcntl import F_GETFL, F_SETFL
38 PYMOD_DIR = "/usr/lib/lustre/python"
# Heuristic: a Makefile next to this script means we are running out of a
# source checkout, so helper binaries live in the build tree, not /usr.
# NOTE(review): this listing is sampled -- the original lines 43-45 (the
# function's return statements) are missing here; do not assume the bodies.
40 def development_mode():
41 base = os.path.dirname(sys.argv[0])
42 if os.access(base+"/Makefile", os.R_OK):
46 if development_mode():
47 sys.path.append('../utils')
49 sys.path.append(PYMOD_DIR)
56 # Maximum number of devices to search for.
57 # (the /dev/loop* nodes need to be created beforehand)
58 MAX_LOOP_DEVICES = 256
59 PORTALS_DIR = '../portals'
61 # Needed to call lconf --record
64 # Please keep these in sync with the values in portals/kp30.h
76 "warning" : (1 << 10),
80 "portals" : (1 << 14),
82 "dlmtrace" : (1 << 16),
86 "rpctrace" : (1 << 20),
87 "vfstrace" : (1 << 21),
91 "console" : (1 << 25),
95 "undefined" : (1 << 0),
105 "portals" : (1 << 10),
107 "pinger" : (1 << 12),
108 "filter" : (1 << 13),
113 "ptlrouter" : (1 << 18),
117 "confobd" : (1 << 22),
# Exit status of the earliest cleanup failure; 0 while everything succeeds.
first_cleanup_error = 0

def cleanup_error(rc):
    """Latch the return code of the first cleanup step that fails.

    Subsequent failures are ignored so the overall exit status reports
    the earliest problem seen while tearing services down.
    """
    global first_cleanup_error
    if first_cleanup_error == 0:
        first_cleanup_error = rc
129 # ============================================================
130 # debugging and error funcs
132 def fixme(msg = "this feature"):
133 raise Lustre.LconfError, msg + ' not implemented yet.'
136 msg = string.join(map(str,args))
137 if not config.noexec:
138 raise Lustre.LconfError(msg)
143 msg = string.join(map(str,args))
148 print string.strip(s)
152 msg = string.join(map(str,args))
155 # ack, python's builtin int() does not support '0x123' syntax.
156 # eval can do it, although what a hack!
160 return eval(s, {}, {})
163 except SyntaxError, e:
164 raise ValueError("not a number")
166 raise ValueError("not a number")
168 # ============================================================
169 # locally defined exceptions
# Raised when an external command (lctl, mkfs, losetup, ...) fails.
# Carries the command name, its error output (a string or a list of
# output lines) and optionally the return code.
# NOTE(review): this listing is sampled -- orig. lines 174-177 (rc
# assignment and the dump() method header) and several branch lines
# (179, 181, 184, 186) are missing; the prints below are the surviving
# halves of if/else pairs.
170 class CommandError (exceptions.Exception):
171 def __init__(self, cmd_name, cmd_err, rc=None):
172 self.cmd_name = cmd_name
173 self.cmd_err = cmd_err
178 if type(self.cmd_err) == types.StringType:
180 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
182 print "! %s: %s" % (self.cmd_name, self.cmd_err)
183 elif type(self.cmd_err) == types.ListType:
185 print "! %s (error %d):" % (self.cmd_name, self.rc)
187 print "! %s:" % (self.cmd_name)
188 for s in self.cmd_err:
189 print "> %s" %(string.strip(s))
194 # ============================================================
195 # handle daemons, like the acceptor
197 """ Manage starting and stopping a daemon. Assumes daemon manages
198 it's own pid file. """
200 def __init__(self, cmd):
206 log(self.command, "already running.")
208 self.path = find_prog(self.command)
210 panic(self.command, "not found.")
211 ret, out = runcmd(self.path +' '+ self.command_line())
213 # FIXME: add this check can only narrow the race but can not avoid it
214 # completely, so I don't apply this method on inserting module.
215 if ret and not self.running():
216 raise CommandError(self.path, out, ret)
220 pid = self.read_pidfile()
222 log ("killing process", pid)
224 #time.sleep(1) # let daemon die
226 log("unable to kill", self.command, e)
228 log("unable to kill", self.command)
231 pid = self.read_pidfile()
241 def read_pidfile(self):
243 fp = open(self.pidfile(), 'r')
248 print "WARNING: invalid pid in %s, removed" % self.pidfile()
249 print "WARNING: You may need to stop acceptor by yourself"
250 os.unlink(self.pidfile())
255 def clean_pidfile(self):
256 """ Remove a stale pidfile """
257 log("removing stale pidfile:", self.pidfile())
259 os.unlink(self.pidfile())
261 log(self.pidfile(), e)
# Daemon wrapper for the portals TCP "acceptor" process; one instance
# per listening port.
# NOTE(review): sampled listing -- orig. line 266 (presumably the
# self.port assignment; pidfile() below reads self.port) and the
# pidfile() def line (orig. 270) are missing.
263 class AcceptorHandler(DaemonHandler):
264 def __init__(self, port, net_type):
265 DaemonHandler.__init__(self, "acceptor")
267 self.net_type = net_type
271 return "/var/run/%s-%d.pid" % (self.command, self.port)
def command_line(self):
    """Render the acceptor's argument string: "<flags> <port>"."""
    args = map(str, (self.flags, self.port))
    return string.join(args)
278 # start the acceptors
# Start every registered TCP acceptor that is not already running.
# In dump/record mode no daemons are actually started.
# NOTE(review): sampled listing -- the enclosing def line for this first
# body, the early returns and the daemon.start()/stop() calls are missing.
280 if config.lctl_dump or config.record:
282 for port in acceptors.keys():
283 daemon = acceptors[port]
284 if daemon.net_type == 'tcp' and not daemon.running():
# Start the acceptor for a single port; panic if none is registered.
287 def run_one_acceptor(port):
288 if config.lctl_dump or config.record:
290 if acceptors.has_key(port):
291 daemon = acceptors[port]
292 if daemon.net_type == 'tcp' and not daemon.running():
295 panic("run_one_acceptor: No acceptor defined for port:", port)
# Stop the acceptor for a port, if one is registered and running.
297 def stop_acceptor(port):
298 if acceptors.has_key(port):
299 daemon = acceptors[port]
300 if daemon.net_type == 'tcp' and daemon.running():
304 # ============================================================
305 # handle lctl interface
308 Manage communication with lctl
311 def __init__(self, cmd):
313 Initialize close by finding the lctl binary.
315 self.lctl = find_prog(cmd)
317 self.record_device = ''
320 debug('! lctl not found')
323 raise CommandError('lctl', "unable to find lctl binary.")
def use_save_file(self, file):
    """Remember a dump file; run() prepends a 'dump <file>' command
    to each batch when this is set."""
    self.save_file = file
def record(self, dev_name, logname):
    """Start recording: subsequent lctl batches are written into the
    config log *logname* on device *dev_name* (see run())."""
    log("Recording log", logname, "on", dev_name)
    self.record_device = dev_name
    self.record_log = logname
def end_record(self):
    """Stop recording: log the fact and clear the recording state."""
    log("End recording log", self.record_log, "on", self.record_device)
    self.record_device = None
    self.record_log = None
def set_nonblock(self, fd):
    """Switch *fd* to non-blocking mode so reads return immediately
    instead of stalling the select() loop in run()."""
    flags = fcntl.fcntl(fd, F_GETFL)
    fcntl.fcntl(fd, F_SETFL, flags | os.O_NDELAY)
345 the cmds are written to stdin of lctl
346 lctl doesn't return errors when run in script mode, so
348 should modify command line to accept multiple commands, or
349 create complex command line options
353 cmds = '\n dump ' + self.save_file + '\n' + cmds
354 elif self.record_device:
358 %s""" % (self.record_device, self.record_log, cmds)
360 debug("+", cmd_line, cmds)
361 if config.noexec: return (0, [])
363 child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
364 child.tochild.write(cmds + "\n")
365 child.tochild.close()
367 # From "Python Cookbook" from O'Reilly
368 outfile = child.fromchild
369 outfd = outfile.fileno()
370 self.set_nonblock(outfd)
371 errfile = child.childerr
372 errfd = errfile.fileno()
373 self.set_nonblock(errfd)
375 outdata = errdata = ''
378 ready = select.select([outfd,errfd],[],[]) # Wait for input
379 if outfd in ready[0]:
380 outchunk = outfile.read()
381 if outchunk == '': outeof = 1
382 outdata = outdata + outchunk
383 if errfd in ready[0]:
384 errchunk = errfile.read()
385 if errchunk == '': erreof = 1
386 errdata = errdata + errchunk
387 if outeof and erreof: break
388 # end of "borrowed" code
391 if os.WIFEXITED(ret):
392 rc = os.WEXITSTATUS(ret)
395 if rc or len(errdata):
396 raise CommandError(self.lctl, errdata, rc)
399 def runcmd(self, *args):
401 run lctl using the command line
403 cmd = string.join(map(str,args))
404 debug("+", self.lctl, cmd)
405 rc, out = run(self.lctl, cmd)
407 raise CommandError(self.lctl, out, rc)
411 def clear_log(self, dev, log):
412 """ clear an existing log """
417 quit """ % (dev, log)
420 def network(self, net, nid):
425 quit """ % (net, nid)
429 def add_interface(self, net, ip, netmask = ""):
430 """ add an interface """
434 quit """ % (net, ip, netmask)
437 # delete an interface
438 def del_interface(self, net, ip):
439 """ delete an interface """
446 # create a new connection
447 def add_uuid(self, net_type, uuid, nid):
448 cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type)
451 def add_peer(self, net_type, nid, hostaddr, port):
452 if net_type in ('tcp','openib','ra') and not config.lctl_dump:
457 nid, hostaddr, port )
459 elif net_type in ('iib',) and not config.lctl_dump:
466 elif net_type in ('vib',) and not config.lctl_dump:
474 def connect(self, srv):
475 self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
476 if srv.net_type in ('tcp','openib','iib','vib','ra') and not config.lctl_dump:
478 hostaddr = string.split(srv.hostaddr[0], '/')[0]
479 self.add_peer(srv.net_type, srv.nid, hostaddr, srv.port)
482 def recover(self, dev_name, new_conn):
485 recover %s""" %(dev_name, new_conn)
488 # add a route to a range
489 def add_route(self, net, gw, lo, hi):
497 except CommandError, e:
501 def del_route(self, net, gw, lo, hi):
506 quit """ % (net, gw, lo, hi)
509 # add a route to a host
510 def add_route_host(self, net, uuid, gw, tgt):
511 self.add_uuid(net, uuid, tgt)
519 except CommandError, e:
523 # add a route to a range
524 def del_route_host(self, net, uuid, gw, tgt):
530 quit """ % (net, gw, tgt)
534 def del_peer(self, net_type, nid, hostaddr):
535 if net_type in ('tcp',) and not config.lctl_dump:
539 del_peer %s %s single_share
543 elif net_type in ('openib','iib','vib','ra') and not config.lctl_dump:
547 del_peer %s single_share
552 # disconnect one connection
553 def disconnect(self, srv):
554 self.del_uuid(srv.nid_uuid)
555 if srv.net_type in ('tcp','openib','iib','vib','ra') and not config.lctl_dump:
557 hostaddr = string.split(srv.hostaddr[0], '/')[0]
558 self.del_peer(srv.net_type, srv.nid, hostaddr)
560 def del_uuid(self, uuid):
568 def disconnectAll(self, net):
576 def attach(self, type, name, uuid):
579 quit""" % (type, name, uuid)
582 def setup(self, name, setup = ""):
586 quit""" % (name, setup)
590 # create a new device with lctl
591 def newdev(self, type, name, uuid, setup = ""):
592 self.attach(type, name, uuid);
594 self.setup(name, setup)
595 except CommandError, e:
596 self.cleanup(name, uuid, 0)
601 def cleanup(self, name, uuid, force, failover = 0):
602 if failover: force = 1
608 quit""" % (name, ('', 'force')[force],
609 ('', 'failover')[failover])
613 def lov_setup(self, name, uuid, desc_uuid, mdsuuid, stripe_cnt,
614 stripe_sz, stripe_off,
618 lov_setup %s %d %d %d %s %s
619 quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off,
624 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off,
628 lov_setconfig %s %d %d %d %s %s
629 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
633 def dump(self, dump_file):
636 quit""" % (dump_file)
639 # get list of devices
640 def device_list(self):
641 devices = '/proc/fs/lustre/devices'
643 if os.access(devices, os.R_OK):
645 fp = open(devices, 'r')
653 def lustre_version(self):
654 rc, out = self.runcmd('version')
658 def mount_option(self, profile, osc, mdc):
660 mount_option %s %s %s
661 quit""" % (profile, osc, mdc)
664 # delete mount options
665 def del_mount_option(self, profile):
671 def set_timeout(self, timeout):
677 # delete mount options
678 def set_lustre_upcall(self, upcall):
683 # ============================================================
684 # Various system-level functions
685 # (ideally moved to their own module)
687 # Run a command and return the output and status.
688 # stderr is sent to /dev/null, could use popen3 to
689 # save it if necessary
692 if config.noexec: return (0, [])
693 f = os.popen(cmd + ' 2>&1')
703 cmd = string.join(map(str,args))
706 # Run a command in the background.
707 def run_daemon(*args):
708 cmd = string.join(map(str,args))
710 if config.noexec: return 0
711 f = os.popen(cmd + ' 2>&1')
719 # Determine full path to use for an external command
720 # searches dirname(argv[0]) first, then PATH
722 syspath = string.split(os.environ['PATH'], ':')
723 cmdpath = os.path.dirname(sys.argv[0])
724 syspath.insert(0, cmdpath);
726 syspath.insert(0, os.path.join(config.portals, 'utils/'))
728 prog = os.path.join(d,cmd)
729 if os.access(prog, os.X_OK):
733 # Recursively look for file starting at base dir
734 def do_find_file(base, mod):
735 fullname = os.path.join(base, mod)
736 if os.access(fullname, os.R_OK):
738 for d in os.listdir(base):
739 dir = os.path.join(base,d)
740 if os.path.isdir(dir):
741 module = do_find_file(dir, mod)
745 def find_module(src_dir, dev_dir, modname):
746 modbase = src_dir +'/'+ dev_dir +'/'+ modname
747 for modext in '.ko', '.o':
748 module = modbase + modext
750 if os.access(module, os.R_OK):
756 # is the path a block device?
763 return stat.S_ISBLK(s[stat.ST_MODE])
765 # build fs according to type
767 def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
773 panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
775 # devsize is in 1k, and fs block count is in 4k
776 block_cnt = devsize/4
778 if fstype in ('ext3', 'extN', 'ldiskfs'):
779 # ext3 journal size is in megabytes
782 if not is_block(dev):
783 ret, out = runcmd("ls -l %s" %dev)
784 devsize = int(string.split(out[0])[4]) / 1024
786 # sfdisk works for symlink, hardlink, and realdev
787 ret, out = runcmd("sfdisk -s %s" %dev)
789 devsize = int(out[0])
791 # sfdisk -s will fail for too large block device,
792 # then, read the size of partition from /proc/partitions
794 # get the realpath of the device
795 # it may be the real device, such as /dev/hda7
796 # or the hardlink created via mknod for a device
797 if 'realpath' in dir(os.path):
798 real_dev = os.path.realpath(dev)
802 while os.path.islink(real_dev) and (link_count < 20):
803 link_count = link_count + 1
804 dev_link = os.readlink(real_dev)
805 if os.path.isabs(dev_link):
808 real_dev = os.path.join(os.path.dirname(real_dev), dev_link)
810 panic("Encountered too many symbolic links resolving block device:", dev)
812 # get the major and minor number of the realpath via ls
813 # it seems python(os.stat) does not return
814 # the st_rdev member of the stat structure
815 ret, out = runcmd("ls -l %s" %real_dev)
816 major = string.split(string.split(out[0])[4], ",")[0]
817 minor = string.split(out[0])[5]
819 # get the devsize from /proc/partitions with the major and minor number
820 ret, out = runcmd("cat /proc/partitions")
823 if string.split(line)[0] == major and string.split(line)[1] == minor:
824 devsize = int(string.split(line)[2])
827 if devsize > 1024 * 1024:
828 jsize = ((devsize / 102400) * 4)
831 if jsize: jopt = "-J size=%d" %(jsize,)
832 if isize: iopt = "-I %d" %(isize,)
833 mkfs = 'mkfs.ext2 -j -b 4096 '
834 if not isblock or config.force:
836 elif fstype == 'reiserfs':
837 # reiserfs journal size is in blocks
838 if jsize: jopt = "--journal_size %d" %(jsize,)
839 mkfs = 'mkreiserfs -ff'
841 panic('unsupported fs type: ', fstype)
843 if config.mkfsoptions != None:
844 mkfs = mkfs + ' ' + config.mkfsoptions
845 if mkfsoptions != None:
846 mkfs = mkfs + ' ' + mkfsoptions
847 (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt)
849 panic("Unable to build fs:", dev, string.join(out))
850 # enable hash tree indexing on fsswe
851 if fstype in ('ext3', 'extN', 'ldiskfs'):
852 htree = 'echo "feature FEATURE_C5" | debugfs -w'
853 (ret, out) = run (htree, dev)
855 panic("Unable to enable htree:", dev)
857 # some systems use /dev/loopN, some /dev/loop/N
861 if not os.access(loop + str(0), os.R_OK):
863 if not os.access(loop + str(0), os.R_OK):
864 panic("can't access loop devices")
867 # find loop device assigned to thefile
870 for n in xrange(0, MAX_LOOP_DEVICES):
872 if os.access(dev, os.R_OK):
873 (stat, out) = run('losetup', dev)
874 if out and stat == 0:
875 m = re.search(r'\((.*)\)', out[0])
876 if m and file == m.group(1):
882 # create file if necessary and assign the first free loop device
883 def init_loop(file, size, fstype, journal_size, inode_size, mkfsoptions, reformat):
884 dev = find_loop(file)
886 print 'WARNING file:', file, 'already mapped to', dev
888 if reformat or not os.access(file, os.R_OK | os.W_OK):
890 panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (file,size))
891 (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size,
894 panic("Unable to create backing store:", file)
895 mkfs(file, size, fstype, journal_size, inode_size, mkfsoptions, isblock=0)
898 # find next free loop
899 for n in xrange(0, MAX_LOOP_DEVICES):
901 if os.access(dev, os.R_OK):
902 (stat, out) = run('losetup', dev)
904 run('losetup', dev, file)
907 print "out of loop devices"
909 print "out of loop devices"
912 # undo loop assignment
913 def clean_loop(file):
914 dev = find_loop(file)
916 ret, out = run('losetup -d', dev)
918 log('unable to clean loop device:', dev, 'for file:', file)
921 # determine if dev is formatted as a <fstype> filesystem
# Would report whether *dev* already holds an <fstype> filesystem.
# NOTE(review): sampled listing -- the return statement (orig. 924) is
# missing here; callers (block_dev) treat this as a boolean.
922 def need_format(fstype, dev):
923 # FIXME don't know how to implement this
926 # initialize a block device if needed
927 def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
928 inode_size, mkfsoptions):
929 if config.noexec: return dev
930 if not is_block(dev):
931 dev = init_loop(dev, size, fstype, journal_size, inode_size,
932 mkfsoptions, reformat)
933 elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
934 mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
937 # panic("device:", dev,
938 # "not prepared, and autoformat is not set.\n",
939 # "Rerun with --reformat option to format ALL filesystems")
944 """lookup IP address for an interface"""
945 rc, out = run("/sbin/ifconfig", iface)
948 addr = string.split(out[1])[1]
949 ip = string.split(addr, ':')[1]
def def_mount_options(fstype, target):
    """Return the default mount options for *fstype* on the given
    target type ('mds' or 'ost').

    ext3/ldiskfs targets get "errors=remount-ro"; OSTs on 2.4-series
    kernels additionally get "asyncdel".
    """
    if fstype in ('ext3', 'ldiskfs'):
        options = "errors=remount-ro"
        if target == 'ost' and sys_get_branch() == '2.4':
            options = options + ",asyncdel"
        return options
961 def sys_get_elan_position_file():
962 procfiles = ["/proc/elan/device0/position",
963 "/proc/qsnet/elan4/device0/position",
964 "/proc/qsnet/elan3/device0/position"]
966 if os.access(p, os.R_OK):
970 def sys_get_local_nid(net_type, wildcard, cluster_id):
971 """Return the local nid."""
973 if sys_get_elan_position_file() and net_type == 'elan':
974 local = sys_get_local_address('elan', '*', cluster_id)
976 local = sys_get_local_address(net_type, wildcard, cluster_id)
979 def sys_get_local_address(net_type, wildcard, cluster_id):
980 """Return the local address for the network type."""
982 if net_type in ('tcp','openib','iib','vib','ra'):
984 iface, star = string.split(wildcard, ':')
985 local = if2addr(iface)
987 panic("unable to determine ip for:", wildcard)
989 host = socket.gethostname()
990 local = socket.gethostbyname(host)
991 elif net_type == 'elan':
992 # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()'
993 f = sys_get_elan_position_file()
995 panic("unable to determine local Elan ID")
998 lines = fp.readlines()
1002 if a[0] == 'NodeId':
1006 nid = my_int(cluster_id) + my_int(elan_id)
1007 local = "%d" % (nid)
1008 except ValueError, e:
1012 elif net_type == 'lo':
1013 fixme("automatic local address for loopback")
1014 elif net_type == 'gm':
1015 fixme("automatic local address for GM")
1019 def sys_get_branch():
1020 """Returns kernel release"""
1022 fp = open('/proc/sys/kernel/osrelease')
1023 lines = fp.readlines()
1027 version = string.split(l)
1028 a = string.split(version[0], '.')
1029 return a[0] + '.' + a[1]
1034 def mod_loaded(modname):
1035 """Check if a module is already loaded. Look in /proc/modules for it."""
1037 fp = open('/proc/modules')
1038 lines = fp.readlines()
1040 # please forgive my tired fingers for this one
1041 ret = filter(lambda word, mod=modname: word == mod,
1042 map(lambda line: string.split(line)[0], lines))
1044 except Exception, e:
1047 # XXX: instead of device_list, ask for $name and see what we get
1048 def is_prepared(name):
1049 """Return true if a device exists for the name"""
1050 if config.lctl_dump:
1052 if (config.noexec or config.record) and config.cleanup:
1055 # expect this format:
1056 # 1 UP ldlm ldlm ldlm_UUID 2
1057 out = lctl.device_list()
1059 if name == string.split(s)[3]:
1061 except CommandError, e:
1065 def is_network_prepared():
1066 """If the any device exists, then assume that all networking
1067 has been configured"""
1068 out = lctl.device_list()
1071 def fs_is_mounted(path):
1072 """Return true if path is a mounted lustre filesystem"""
1074 fp = open('/proc/mounts')
1075 lines = fp.readlines()
1079 if a[1] == path and a[2] == 'lustre_lite':
1087 """Manage kernel modules"""
def __init__(self, lustre_dir, portals_dir):
    """Remember the lustre and portals source roots and start with an
    empty list of modules to manage."""
    self.kmodule_list = []
    self.portals_dir = portals_dir
    self.lustre_dir = lustre_dir
def add_portals_module(self, dev_dir, modname):
    """Queue *modname*, found under *dev_dir* in the portals tree,
    for loading."""
    entry = (self.portals_dir, dev_dir, modname)
    self.kmodule_list.append(entry)
def add_lustre_module(self, dev_dir, modname):
    """Queue *modname*, found under *dev_dir* in the lustre tree,
    for loading."""
    entry = (self.lustre_dir, dev_dir, modname)
    self.kmodule_list.append(entry)
1101 def load_module(self):
1102 """Load all the modules in the list in the order they appear."""
1103 for src_dir, dev_dir, mod in self.kmodule_list:
1104 if mod_loaded(mod) and not config.noexec:
1106 log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
1108 module = find_module(src_dir, dev_dir, mod)
1110 panic('module not found:', mod)
1111 (rc, out) = run('/sbin/insmod', module)
1112 if rc and not mod_loaded(mod):
1113 raise CommandError('insmod', out, rc)
1115 (rc, out) = run('/sbin/modprobe', mod)
1116 if rc and not mod_loaded(mod):
1117 raise CommandError('modprobe', out, rc)
1119 def cleanup_module(self):
1120 """Unload the modules in the list in reverse order."""
1122 rev = self.kmodule_list[:] # make *copy* of list
1124 for src_dir, dev_dir, mod in rev:
1125 if not mod_loaded(mod) and not config.noexec:
1128 if mod == 'portals' and config.dump:
1129 lctl.dump(config.dump)
1130 log('unloading module:', mod)
1131 (rc, out) = run('/sbin/rmmod', mod)
1133 log('! unable to unload module:', mod)
1136 # ============================================================
1137 # Classes to prepare and cleanup the various objects
1140 """ Base class for the rest of the modules. The default cleanup method is
1141 defined here, as well as some utilitiy funcs.
1143 def __init__(self, module_name, db):
1145 self.module_name = module_name
1146 self.name = self.db.getName()
1147 self.uuid = self.db.getUUID()
1150 self.kmod = kmod(config.lustre, config.portals)
1152 def info(self, *args):
1153 msg = string.join(map(str,args))
1154 print self.module_name + ":", self.name, self.uuid, msg
1157 """ default cleanup, used for most modules """
1160 lctl.cleanup(self.name, self.uuid, config.force)
1161 except CommandError, e:
1162 log(self.module_name, "cleanup failed: ", self.name)
def add_portals_module(self, dev_dir, modname):
    """Queue a portals-tree module; delegates to this module's kmod."""
    self.kmod.add_portals_module(dev_dir, modname)
def add_lustre_module(self, dev_dir, modname):
    """Queue a lustre-tree module; delegates to this module's kmod."""
    self.kmod.add_lustre_module(dev_dir, modname)
def load_module(self):
    """Load this module's queued kernel modules, in queue order."""
    self.kmod.load_module()
def cleanup_module(self):
    """Unload this module's kernel modules (reverse order), unless the
    module reports it is unsafe to clean."""
    if not self.safe_to_clean():
        return
    self.kmod.cleanup_module()
1183 def safe_to_clean(self):
def safe_to_clean_modules(self):
    """Module-unload safety defaults to device-cleanup safety."""
    return self.safe_to_clean()
1189 class Network(Module):
1190 def __init__(self,db):
1191 Module.__init__(self, 'NETWORK', db)
1192 self.net_type = self.db.get_val('nettype')
1193 self.nid = self.db.get_val('nid', '*')
1194 self.cluster_id = self.db.get_val('clusterid', "0")
1195 self.port = self.db.get_val_int('port', 0)
1198 self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
1200 panic("unable to set nid for", self.net_type, self.nid, self.cluster_id)
1201 self.generic_nid = 1
1202 debug("nid:", self.nid)
1204 self.generic_nid = 0
1206 self.nid_uuid = self.nid_to_uuid(self.nid)
1208 self.hostaddr = self.db.get_hostaddr()
1209 if len(self.hostaddr) == 0:
1210 self.hostaddr.append(self.nid)
1211 if '*' in self.hostaddr[0]:
1212 self.hostaddr[0] = sys_get_local_address(self.net_type, self.hostaddr[0], self.cluster_id)
1213 if not self.hostaddr[0]:
1214 panic("unable to set hostaddr for", self.net_type, self.hostaddr[0], self.cluster_id)
1215 debug("hostaddr:", self.hostaddr[0])
1217 self.add_portals_module("libcfs", 'libcfs')
1218 self.add_portals_module("portals", 'portals')
1219 if node_needs_router():
1220 self.add_portals_module("router", 'kptlrouter')
1221 if self.net_type == 'tcp':
1222 self.add_portals_module("knals/socknal", 'ksocknal')
1223 if self.net_type == 'elan':
1224 self.add_portals_module("knals/qswnal", 'kqswnal')
1225 if self.net_type == 'gm':
1226 self.add_portals_module("knals/gmnal", 'kgmnal')
1227 if self.net_type == 'openib':
1228 self.add_portals_module("knals/openibnal", 'kopenibnal')
1229 if self.net_type == 'iib':
1230 self.add_portals_module("knals/iibnal", 'kiibnal')
1231 if self.net_type == 'vib':
1232 self.add_portals_module("knals/vibnal", 'kvibnal')
1233 if self.net_type == 'lo':
1234 self.add_portals_module("knals/lonal", 'klonal')
1235 if self.net_type == 'ra':
1236 self.add_portals_module("knals/ranal", 'kranal')
def nid_to_uuid(self, nid):
    """Derive the canonical UUID string used to register a network ID."""
    return "NID_" + str(nid) + "_UUID"
1242 if is_network_prepared():
1244 self.info(self.net_type, self.nid, self.port)
1245 if not (config.record and self.generic_nid):
1246 lctl.network(self.net_type, self.nid)
1247 if self.net_type == 'tcp':
1249 for hostaddr in self.db.get_hostaddr():
1250 ip = string.split(hostaddr, '/')[0]
1251 if len(string.split(hostaddr, '/')) == 2:
1252 netmask = string.split(hostaddr, '/')[1]
1255 lctl.add_interface(self.net_type, ip, netmask)
1256 if self.net_type == 'elan':
1258 if self.net_type == 'openib':
1260 panic("no port set for", self.net_type, self.hostaddr[0])
1261 sysctl('/proc/sys/openibnal/port', self.port)
1262 if self.net_type == 'ra':
1264 panic("no port set for", self.net_type, self.hostaddr[0])
1265 sysctl('/proc/sys/ranal/port', self.port)
1266 if self.port and node_is_router():
1267 run_one_acceptor(self.port)
1268 self.connect_peer_gateways()
1270 def connect_peer_gateways(self):
1271 for router in self.db.lookup_class('node'):
1272 if router.get_val_int('router', 0):
1273 for netuuid in router.get_networks():
1274 net = self.db.lookup(netuuid)
1276 if (gw.cluster_id == self.cluster_id and
1277 gw.net_type == self.net_type):
1278 if gw.nid != self.nid:
1281 def disconnect_peer_gateways(self):
1282 for router in self.db.lookup_class('node'):
1283 if router.get_val_int('router', 0):
1284 for netuuid in router.get_networks():
1285 net = self.db.lookup(netuuid)
1287 if (gw.cluster_id == self.cluster_id and
1288 gw.net_type == self.net_type):
1289 if gw.nid != self.nid:
1292 except CommandError, e:
1293 print "disconnect failed: ", self.name
1297 def safe_to_clean(self):
1298 return not is_network_prepared()
1301 self.info(self.net_type, self.nid, self.port)
1303 stop_acceptor(self.port)
1304 if node_is_router():
1305 self.disconnect_peer_gateways()
1306 if self.net_type == 'tcp':
1307 for hostaddr in self.db.get_hostaddr():
1308 ip = string.split(hostaddr, '/')[0]
1309 lctl.del_interface(self.net_type, ip)
1311 class RouteTable(Module):
1312 def __init__(self,db):
1313 Module.__init__(self, 'ROUTES', db)
1315 def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
1317 # only setup connections for tcp, ib, and ra NALs
1319 if not net_type in ('tcp','openib','iib','vib','ra'):
1322 # connect to target if route is to single node and this node is the gw
1323 if lo == hi and local_interface(net_type, gw_cluster_id, gw):
1324 if not local_cluster(net_type, tgt_cluster_id):
1325 panic("target", lo, " not on the local cluster")
1326 srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
1327 # connect to gateway if this node is not the gw
1328 elif (local_cluster(net_type, gw_cluster_id)
1329 and not local_interface(net_type, gw_cluster_id, gw)):
1330 srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
1335 panic("no server for nid", lo)
1338 return Network(srvdb)
1341 if is_network_prepared():
1344 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1345 lctl.add_route(net_type, gw, lo, hi)
1346 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1350 def safe_to_clean(self):
1351 return not is_network_prepared()
1354 if is_network_prepared():
1355 # the network is still being used, don't clean it up
1357 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1358 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1361 lctl.disconnect(srv)
1362 except CommandError, e:
1363 print "disconnect failed: ", self.name
1368 lctl.del_route(net_type, gw, lo, hi)
1369 except CommandError, e:
1370 print "del_route failed: ", self.name
1374 class Management(Module):
1375 def __init__(self, db):
1376 Module.__init__(self, 'MGMT', db)
1377 self.add_lustre_module('lvfs', 'lvfs')
1378 self.add_lustre_module('obdclass', 'obdclass')
1379 self.add_lustre_module('ptlrpc', 'ptlrpc')
1380 self.add_lustre_module('mgmt', 'mgmt_svc')
1383 if is_prepared(self.name):
1386 lctl.newdev("mgmt", self.name, self.uuid)
1388 def safe_to_clean(self):
1392 if is_prepared(self.name):
1393 Module.cleanup(self)
1395 # This is only needed to load the modules; the LDLM device
1396 # is now created automatically.
1398 def __init__(self,db):
1399 Module.__init__(self, 'LDLM', db)
1400 self.add_lustre_module('lvfs', 'lvfs')
1401 self.add_lustre_module('obdclass', 'obdclass')
1402 self.add_lustre_module('ptlrpc', 'ptlrpc')
1411 def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
1412 Module.__init__(self, 'LOV', db)
1413 if name_override != None:
1414 self.name = "lov_%s" % name_override
1415 self.add_lustre_module('lov', 'lov')
1416 self.mds_uuid = self.db.get_first_ref('mds')
1417 self.stripe_sz = self.db.get_val_int('stripesize', 1048576)
1418 self.stripe_off = self.db.get_val_int('stripeoffset', 0)
1419 self.pattern = self.db.get_val_int('stripepattern', 0)
1420 self.devlist = self.db.get_refs('obd')
1421 self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
1423 self.desc_uuid = self.uuid
1424 self.uuid = generate_client_uuid(self.name)
1425 self.fs_name = fs_name
1427 self.config_only = 1
1429 self.config_only = None
1430 mds= self.db.lookup(self.mds_uuid)
1431 self.mds_name = mds.getName()
1432 for obd_uuid in self.devlist:
1433 obd = self.db.lookup(obd_uuid)
1434 osc = get_osc(obd, self.uuid, fs_name)
1436 self.osclist.append(osc)
1438 panic('osc not found:', obd_uuid)
1441 if is_prepared(self.name):
1443 if self.config_only:
1444 panic("Can't prepare config_only LOV ", self.name)
1446 for osc in self.osclist:
1448 # Only ignore connect failures with --force, which
1449 # isn't implemented here yet.
1450 osc.prepare(ignore_connect_failure=0)
1451 except CommandError, e:
1452 print "Error preparing OSC %s\n" % osc.uuid
1454 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
1455 self.stripe_off, self.pattern, self.devlist, self.mds_name)
1456 lctl.lov_setup(self.name, self.uuid,
1457 self.desc_uuid, self.mds_name, self.stripe_cnt,
1458 self.stripe_sz, self.stripe_off, self.pattern,
1459 string.join(self.devlist))
1462 if is_prepared(self.name):
1463 Module.cleanup(self)
1464 if self.config_only:
1465 panic("Can't clean up config_only LOV ", self.name)
1466 for osc in self.osclist:
1469 def load_module(self):
1470 if self.config_only:
1471 panic("Can't load modules for config_only LOV ", self.name)
1472 for osc in self.osclist:
1475 Module.load_module(self)
1477 def cleanup_module(self):
1478 if self.config_only:
1479 panic("Can't cleanup modules for config_only LOV ", self.name)
1480 Module.cleanup_module(self)
1481 for osc in self.osclist:
1482 osc.cleanup_module()
1485 class MDSDEV(Module):
1486 def __init__(self,db):
1487 Module.__init__(self, 'MDSDEV', db)
1488 self.devpath = self.db.get_val('devpath','')
1489 self.size = self.db.get_val_int('devsize', 0)
1490 self.journal_size = self.db.get_val_int('journalsize', 0)
1491 self.fstype = self.db.get_val('fstype', '')
1492 self.nspath = self.db.get_val('nspath', '')
1493 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1494 self.mountfsoptions = self.db.get_val('mountfsoptions', '')
1495 # overwrite the orignal MDSDEV name and uuid with the MDS name and uuid
1496 target_uuid = self.db.get_first_ref('target')
1497 mds = self.db.lookup(target_uuid)
1498 self.name = mds.getName()
1499 self.filesystem_uuids = mds.get_refs('filesystem')
1500 # FIXME: if fstype not set, then determine based on kernel version
1501 self.format = self.db.get_val('autoformat', "no")
1502 if mds.get_val('failover', 0):
1503 self.failover_mds = 'f'
1505 self.failover_mds = 'n'
1506 active_uuid = get_active_target(mds)
1508 panic("No target device found:", target_uuid)
1509 if active_uuid == self.uuid:
1514 # make sure the filesystem is valid
1515 verify_devpath(self.devpath, self.name)
1517 if self.active and config.group and config.group != mds.get_val('group'):
1520 self.inode_size = self.db.get_val_int('inodesize', 0)
1521 if self.inode_size == 0:
1522 # find the LOV for this MDS
1523 lovconfig_uuid = mds.get_first_ref('lovconfig')
1524 if not lovconfig_uuid:
1525 panic("No LOV config found for MDS ", mds.name)
1526 lovconfig = mds.lookup(lovconfig_uuid)
1527 lov_uuid = lovconfig.get_first_ref('lov')
1529 panic("No LOV found for lovconfig ", lovconfig.name)
1530 lov = LOV(self.db.lookup(lov_uuid), lov_uuid, 'FS_name', config_only = 1)
1532 # default stripe count controls default inode_size
1533 if (lov.stripe_cnt > 0):
1534 stripe_count = lov.stripe_cnt
1536 stripe_count = len(lov.devlist)
1537 if stripe_count > 77:
1538 self.inode_size = 4096
1539 elif stripe_count > 34:
1540 self.inode_size = 2048
1541 elif stripe_count > 13:
1542 self.inode_size = 1024
1543 elif stripe_count > 2:
1544 self.inode_size = 512
1546 self.inode_size = 256
1548 self.target_dev_uuid = self.uuid
1549 self.uuid = target_uuid
1552 self.add_lustre_module('mdc', 'mdc')
1553 self.add_lustre_module('osc', 'osc')
1554 self.add_lustre_module('lov', 'lov')
1555 self.add_lustre_module('mds', 'mds')
1556 if self.fstype == 'ldiskfs':
1557 self.add_lustre_module('ldiskfs', 'ldiskfs')
1559 self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
1561 def load_module(self):
1563 Module.load_module(self)
1566 if is_prepared(self.name):
1569 debug(self.uuid, "not active")
1572 # run write_conf automatically, if --reformat used
1574 self.info(self.devpath, self.fstype, self.size, self.format)
1576 # never reformat here
1577 blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
1578 self.format, self.journal_size, self.inode_size,
1580 if not is_prepared('MDT'):
1581 lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
1583 mountfsoptions = def_mount_options(self.fstype, 'mds')
1585 if config.mountfsoptions:
1587 mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
1589 mountfsoptions = config.mountfsoptions
1590 if self.mountfsoptions:
1591 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1593 if self.mountfsoptions:
1595 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1597 mountfsoptions = self.mountfsoptions
1599 print 'MDS mount options: ' + mountfsoptions
1601 lctl.newdev("mds", self.name, self.uuid,
1602 setup ="%s %s %s %s" %(blkdev, self.fstype, self.name, mountfsoptions))
1603 except CommandError, e:
1605 panic("MDS is missing the config log. Need to run " +
1606 "lconf --write_conf.")
1610 def write_conf(self):
1611 if is_prepared(self.name):
1613 self.info(self.devpath, self.fstype, self.format)
1614 blkdev = block_dev(self.devpath, self.size, self.fstype,
1615 config.reformat, self.format, self.journal_size,
1616 self.inode_size, self.mkfsoptions)
1617 lctl.newdev("mds", self.name, self.uuid,
1618 setup ="%s %s" %(blkdev, self.fstype))
1620 # record logs for the MDS lov
1621 for uuid in self.filesystem_uuids:
1622 log("recording clients for filesystem:", uuid)
1623 fs = self.db.lookup(uuid)
1624 obd_uuid = fs.get_first_ref('obd')
1625 client_uuid = generate_client_uuid(self.name)
1626 client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
1629 lctl.clear_log(self.name, self.name)
1630 lctl.record(self.name, self.name)
1632 lctl.mount_option(self.name, client.get_name(), "")
1636 lctl.clear_log(self.name, self.name + '-clean')
1637 lctl.record(self.name, self.name + '-clean')
1639 lctl.del_mount_option(self.name)
1644 # record logs for each client
1646 config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
1648 config_options = CONFIG_FILE
1650 for node_db in self.db.lookup_class('node'):
1651 client_name = node_db.getName()
1652 for prof_uuid in node_db.get_refs('profile'):
1653 prof_db = node_db.lookup(prof_uuid)
# refactor this into a function to test "clientness"
1656 for ref_class, ref_uuid in prof_db.get_all_refs():
1657 if ref_class in ('mountpoint','echoclient'):
1658 debug("recording", client_name)
1659 old_noexec = config.noexec
1661 noexec_opt = ('', '-n')
1662 ret, out = run (sys.argv[0],
1663 noexec_opt[old_noexec == 1],
1664 " -v --record --nomod",
1665 "--record_log", client_name,
1666 "--record_device", self.name,
1667 "--node", client_name,
1670 lctl.clear_log(self.name, client_name)
1673 panic("Record client log %s on %s failed" %(
1674 client_name, self.name))
1676 for s in out: log("record> ", string.strip(s))
1677 ret, out = run (sys.argv[0],
1678 noexec_opt[old_noexec == 1],
1679 "--cleanup -v --record --nomod",
1680 "--record_log", client_name + "-clean",
1681 "--record_device", self.name,
1682 "--node", client_name,
1685 # In this case, although 0-conf mount works but 0-conf umount
1686 # doesn't work. As a boring result, the user is forced to
1687 # cleanup client service manually again and again. So I prefer
1688 # deleting these two llogs together and let the user write_conf.
1689 lctl.clear_log(self.name, client_name)
1690 lctl.clear_log(self.name, client_name + '-clean')
1693 panic("Record client log %s on %s failed" %(
1694 client_name + '-clean', self.name))
1696 for s in out: log("record> ", string.strip(s))
1697 config.noexec = old_noexec
1699 lctl.cleanup(self.name, self.uuid, 0, 0)
1700 except CommandError, e:
1701 log(self.module_name, "cleanup failed: ", self.name)
1704 Module.cleanup(self)
1705 clean_loop(self.devpath)
1707 def msd_remaining(self):
1708 out = lctl.device_list()
1710 if string.split(s)[2] in ('mds',):
1713 def safe_to_clean(self):
1716 def safe_to_clean_modules(self):
1717 return not self.msd_remaining()
1721 debug(self.uuid, "not active")
1724 if is_prepared(self.name):
1726 lctl.cleanup(self.name, self.uuid, config.force,
1728 except CommandError, e:
1729 log(self.module_name, "cleanup failed: ", self.name)
1732 Module.cleanup(self)
1733 if not self.msd_remaining() and is_prepared('MDT'):
1735 lctl.cleanup("MDT", "MDT_UUID", config.force,
1737 except CommandError, e:
1738 print "cleanup failed: ", self.name
1741 clean_loop(self.devpath)
1744 def __init__(self, db):
1745 Module.__init__(self, 'OSD', db)
1746 self.osdtype = self.db.get_val('osdtype')
1747 self.devpath = self.db.get_val('devpath', '')
1748 self.size = self.db.get_val_int('devsize', 0)
1749 self.journal_size = self.db.get_val_int('journalsize', 0)
1750 self.inode_size = self.db.get_val_int('inodesize', 0)
1751 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1752 self.mountfsoptions = self.db.get_val('mountfsoptions', '')
1753 self.fstype = self.db.get_val('fstype', '')
1754 self.nspath = self.db.get_val('nspath', '')
1755 target_uuid = self.db.get_first_ref('target')
1756 ost = self.db.lookup(target_uuid)
1757 self.name = ost.getName()
1758 self.format = self.db.get_val('autoformat', 'yes')
1759 if ost.get_val('failover', 0):
1760 self.failover_ost = 'f'
1762 self.failover_ost = 'n'
1764 active_uuid = get_active_target(ost)
1766 panic("No target device found:", target_uuid)
1767 if active_uuid == self.uuid:
1772 # make sure the filesystem is valid
1773 verify_devpath(self.devpath, self.name)
1776 if self.active and config.group and config.group != ost.get_val('group'):
1779 self.target_dev_uuid = self.uuid
1780 self.uuid = target_uuid
1782 self.add_lustre_module('ost', 'ost')
1783 # FIXME: should we default to ext3 here?
1784 if self.fstype == 'ldiskfs':
1785 self.add_lustre_module('ldiskfs', 'ldiskfs')
1787 self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
1788 self.add_lustre_module(self.osdtype, self.osdtype)
1790 def load_module(self):
1792 Module.load_module(self)
1794 # need to check /proc/mounts and /etc/mtab before
1795 # formatting anything.
1796 # FIXME: check if device is already formatted.
1798 if is_prepared(self.name):
1801 debug(self.uuid, "not active")
1803 self.info(self.osdtype, self.devpath, self.size, self.fstype,
1804 self.format, self.journal_size, self.inode_size)
1806 if self.osdtype == 'obdecho':
1809 blkdev = block_dev(self.devpath, self.size, self.fstype,
1810 config.reformat, self.format, self.journal_size,
1811 self.inode_size, self.mkfsoptions)
1813 mountfsoptions = def_mount_options(self.fstype, 'ost')
1815 if config.mountfsoptions:
1817 mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
1819 mountfsoptions = config.mountfsoptions
1820 if self.mountfsoptions:
1821 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1823 if self.mountfsoptions:
1825 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1827 mountfsoptions = self.mountfsoptions
1829 print 'OST mount options: ' + mountfsoptions
1831 lctl.newdev(self.osdtype, self.name, self.uuid,
1832 setup ="%s %s %s %s" %(blkdev, self.fstype,
1833 self.failover_ost, mountfsoptions))
1834 if not is_prepared('OSS'):
1835 lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
1837 def osd_remaining(self):
1838 out = lctl.device_list()
1840 if string.split(s)[2] in ('obdfilter', 'obdecho'):
1843 def safe_to_clean(self):
1846 def safe_to_clean_modules(self):
1847 return not self.osd_remaining()
1851 debug(self.uuid, "not active")
1853 if is_prepared(self.name):
1856 lctl.cleanup(self.name, self.uuid, config.force,
1858 except CommandError, e:
1859 log(self.module_name, "cleanup failed: ", self.name)
1862 if not self.osd_remaining() and is_prepared('OSS'):
1864 lctl.cleanup("OSS", "OSS_UUID", config.force,
1866 except CommandError, e:
1867 print "cleanup failed: ", self.name
1870 if not self.osdtype == 'obdecho':
1871 clean_loop(self.devpath)
1873 def mgmt_uuid_for_fs(mtpt_name):
1876 mtpt_db = toplustreDB.lookup_name(mtpt_name)
1877 fs_uuid = mtpt_db.get_first_ref('filesystem')
1878 fs = toplustreDB.lookup(fs_uuid)
1881 return fs.get_first_ref('mgmt')
1883 # Generic client module, used by OSC and MDC
1884 class Client(Module):
1885 def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
1887 self.target_name = tgtdb.getName()
1888 self.target_uuid = tgtdb.getUUID()
1891 self.tgt_dev_uuid = get_active_target(tgtdb)
1892 if not self.tgt_dev_uuid:
1893 panic("No target device found for target:", self.target_name)
1895 self.kmod = kmod(config.lustre, config.portals)
1899 self.module = module
1900 self.module_name = string.upper(module)
1902 self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
1903 self.target_name, fs_name)
1905 self.name = self_name
1907 self.lookup_server(self.tgt_dev_uuid)
1908 mgmt_uuid = mgmt_uuid_for_fs(fs_name)
1910 self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
1913 self.fs_name = fs_name
1916 self.add_lustre_module(module_dir, module)
1918 def lookup_server(self, srv_uuid):
1919 """ Lookup a server's network information """
1920 self._server_nets = get_ost_net(self.db, srv_uuid)
1921 if len(self._server_nets) == 0:
1922 panic("Unable to find a server for:", srv_uuid)
1924 def get_servers(self):
1925 return self._server_nets
1927 def prepare(self, ignore_connect_failure = 0):
1928 self.info(self.target_uuid)
1929 if is_prepared(self.name):
1932 srv = choose_local_server(self.get_servers())
1936 routes = find_route(self.get_servers())
1937 if len(routes) == 0:
1938 panic("no route to", self.target_uuid)
1939 for (srv, r) in routes:
1940 lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
1941 except CommandError, e:
1942 if not ignore_connect_failure:
1946 if self.target_uuid in config.inactive and self.permits_inactive():
1947 debug("%s inactive" % self.target_uuid)
1948 inactive_p = "inactive"
1950 debug("%s active" % self.target_uuid)
1952 lctl.newdev(self.module, self.name, self.uuid,
1953 setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
1954 inactive_p, self.mgmt_name))
1957 if is_prepared(self.name):
1958 Module.cleanup(self)
1960 srv = choose_local_server(self.get_servers())
1962 lctl.disconnect(srv)
1964 for (srv, r) in find_route(self.get_servers()):
1965 lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
1966 except CommandError, e:
1967 log(self.module_name, "cleanup failed: ", self.name)
    def __init__(self, db, uuid, fs_name):
        """Metadata client: a Client speaking the 'mdc' protocol."""
        Client.__init__(self, db, uuid, 'mdc', fs_name)
1976 def permits_inactive(self):
    def __init__(self, db, uuid, fs_name):
        """Object storage client: a Client speaking the 'osc' protocol."""
        Client.__init__(self, db, uuid, 'osc', fs_name)
1983 def permits_inactive(self):
def mgmtcli_name_for_uuid(uuid):
    """Return the canonical management-client device name for *uuid*."""
    return 'MGMTCLI_' + str(uuid)
class ManagementClient(Client):
    """Client for the management service; its device name is derived
    from the management service's uuid rather than generated."""
    def __init__(self, db, uuid):
        name = mgmtcli_name_for_uuid(db.getUUID())
        Client.__init__(self, db, uuid, 'mgmt_cli', '',
                        self_name = name,
                        module_dir = 'mgmt')
1996 def __init__(self, db):
1997 Module.__init__(self, 'COBD', db)
1998 self.real_uuid = self.db.get_first_ref('realobd')
1999 self.cache_uuid = self.db.get_first_ref('cacheobd')
2000 self.add_lustre_module('cobd' , 'cobd')
2002 # need to check /proc/mounts and /etc/mtab before
2003 # formatting anything.
2004 # FIXME: check if device is already formatted.
2006 if is_prepared(self.name):
2008 self.info(self.real_uuid, self.cache_uuid)
2009 lctl.newdev("cobd", self.name, self.uuid,
2010 setup ="%s %s" %(self.real_uuid, self.cache_uuid))
2013 # virtual interface for OSC and LOV
2015 def __init__(self, db, uuid, fs_name, name_override = None):
2016 Module.__init__(self, 'VOSC', db)
2017 if db.get_class() == 'lov':
2018 self.osc = LOV(db, uuid, fs_name, name_override)
2020 self.osc = get_osc(db, uuid, fs_name)
2022 return self.osc.uuid
2024 return self.osc.name
    def load_module(self):
        """Delegate module loading to the wrapped OSC/LOV."""
        self.osc.load_module()
    def cleanup_module(self):
        """Delegate module unloading to the wrapped OSC/LOV."""
        self.osc.cleanup_module()
2035 class ECHO_CLIENT(Module):
2036 def __init__(self,db):
2037 Module.__init__(self, 'ECHO_CLIENT', db)
2038 self.add_lustre_module('obdecho', 'obdecho')
2039 self.obd_uuid = self.db.get_first_ref('obd')
2040 obd = self.db.lookup(self.obd_uuid)
2041 self.uuid = generate_client_uuid(self.name)
2042 self.osc = VOSC(obd, self.uuid, self.name)
2045 if is_prepared(self.name):
2048 self.osc.prepare() # XXX This is so cheating. -p
2049 self.info(self.obd_uuid)
2051 lctl.newdev("echo_client", self.name, self.uuid,
2052 setup = self.osc.get_name())
2055 if is_prepared(self.name):
2056 Module.cleanup(self)
    def load_module(self):
        """Load the underlying client stack's modules first, then this
        device's own (obdecho, registered in __init__)."""
        self.osc.load_module()
        Module.load_module(self)
    def cleanup_module(self):
        """Unload in the reverse order of load_module."""
        Module.cleanup_module(self)
        self.osc.cleanup_module()
def generate_client_uuid(name):
    """Build a pseudo-random client uuid embedding *name*.

    Up to 19 characters of *name* are placed between random hex fields
    so the uuid is both unique-ish and recognizable; the result is
    clamped to the 36-character uuid limit.
    """
    # The format string has four conversions; the second is the name
    # (the extracted copy of this file had lost that argument, which
    # would raise TypeError at run time).
    client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
                                           name,
                                           int(random.random() * 1048576),
                                           int(random.random() * 1048576))
    return client_uuid[:36]
2076 def my_rstrip(s, chars):
2077 """my_rstrip(s, chars) -> strips any instances of the characters
2078 found in chars from the right side of string s"""
2079 # XXX required because python versions pre 2.2.3 don't allow
2080 #string.rstrip() to take alternate char lists
2084 ns = string.rstrip(s, '/')
2085 except TypeError, e:
2086 for i in range(len(s) - 1, 0, -1):
2095 class Mountpoint(Module):
2096 def __init__(self,db):
2097 Module.__init__(self, 'MTPT', db)
2098 self.path = my_rstrip(self.db.get_val('path'), '/')
2099 self.clientoptions = self.db.get_val('clientoptions', '')
2100 self.fs_uuid = self.db.get_first_ref('filesystem')
2101 fs = self.db.lookup(self.fs_uuid)
2102 self.mds_uuid = fs.get_first_ref('mds')
2103 self.obd_uuid = fs.get_first_ref('obd')
2104 self.mgmt_uuid = fs.get_first_ref('mgmt')
2105 obd = self.db.lookup(self.obd_uuid)
2106 client_uuid = generate_client_uuid(self.name)
2107 self.vosc = VOSC(obd, client_uuid, self.name)
2108 self.mdc = get_mdc(db, client_uuid, self.name, self.mds_uuid)
2110 self.add_lustre_module('mdc', 'mdc')
2111 self.add_lustre_module('llite', 'llite')
2113 self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
2119 if fs_is_mounted(self.path):
2120 log(self.path, "already mounted.")
2124 self.mgmtcli.prepare()
2127 mdc_name = self.mdc.name
2129 self.info(self.path, self.mds_uuid, self.obd_uuid)
2130 if config.record or config.lctl_dump:
2131 lctl.mount_option(local_node_name, self.vosc.get_name(), mdc_name)
2134 if config.clientoptions:
2135 if self.clientoptions:
2136 self.clientoptions = self.clientoptions + ',' + config.clientoptions
2138 self.clientoptions = config.clientoptions
2139 if self.clientoptions:
2140 self.clientoptions = ',' + self.clientoptions
2141 # Linux kernel will deal with async and not pass it to ll_fill_super,
2142 # so replace it with Lustre async
2143 self.clientoptions = string.replace(self.clientoptions, "async", "lasync")
2145 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s%s %s %s" % \
2146 (self.vosc.get_name(), mdc_name, self.clientoptions, config.config, self.path)
2147 run("mkdir", self.path)
2152 panic("mount failed:", self.path, ":", string.join(val))
2155 self.info(self.path, self.mds_uuid,self.obd_uuid)
2157 if config.record or config.lctl_dump:
2158 lctl.del_mount_option(local_node_name)
2160 if fs_is_mounted(self.path):
2162 (rc, out) = run("umount", "-f", self.path)
2164 (rc, out) = run("umount", self.path)
2166 raise CommandError('umount', out, rc)
2168 if fs_is_mounted(self.path):
2169 panic("fs is still mounted:", self.path)
2174 self.mgmtcli.cleanup()
2176 def load_module(self):
2178 self.mgmtcli.load_module()
2179 self.vosc.load_module()
2180 Module.load_module(self)
2182 def cleanup_module(self):
2183 Module.cleanup_module(self)
2184 self.vosc.cleanup_module()
2186 self.mgmtcli.cleanup_module()
2189 # ============================================================
2190 # misc query functions
2192 def get_ost_net(self, osd_uuid):
2196 osd = self.lookup(osd_uuid)
2197 node_uuid = osd.get_first_ref('node')
2198 node = self.lookup(node_uuid)
2200 panic("unable to find node for osd_uuid:", osd_uuid,
2201 " node_ref:", node_uuid)
2202 for net_uuid in node.get_networks():
2203 db = node.lookup(net_uuid)
2204 srv_list.append(Network(db))
# the order of initialization is based on level.
2209 def getServiceLevel(self):
2210 type = self.get_class()
2212 if type in ('network',):
2214 elif type in ('routetbl',):
2216 elif type in ('ldlm',):
2218 elif type in ('mgmt',):
2220 elif type in ('osd', 'cobd'):
2222 elif type in ('mdsdev',):
2224 elif type in ('mountpoint', 'echoclient'):
2227 panic("Unknown type: ", type)
2229 if ret < config.minlevel or ret > config.maxlevel:
2234 # return list of services in a profile. list is a list of tuples
2235 # [(level, db_object),]
2236 def getServices(self):
2238 for ref_class, ref_uuid in self.get_all_refs():
2239 servdb = self.lookup(ref_uuid)
2241 level = getServiceLevel(servdb)
2243 list.append((level, servdb))
2245 panic('service not found: ' + ref_uuid)
2251 ############################################################
2253 # FIXME: clean this mess up!
2255 # OSC is no longer in the xml, so we have to fake it.
2256 # this is getting ugly and begging for another refactoring
2257 def get_osc(ost_db, uuid, fs_name):
2258 osc = OSC(ost_db, uuid, fs_name)
2261 def get_mdc(db, uuid, fs_name, mds_uuid):
2262 mds_db = db.lookup(mds_uuid);
2264 panic("no mds:", mds_uuid)
2265 mdc = MDC(mds_db, uuid, fs_name)
2268 ############################################################
2269 # routing ("rooting")
2271 # list of (nettype, cluster_id, nid)
2274 def find_local_clusters(node_db):
2275 global local_clusters
2276 for netuuid in node_db.get_networks():
2277 net = node_db.lookup(netuuid)
2279 debug("add_local", netuuid)
2280 local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
2282 if not acceptors.has_key(srv.port):
2283 acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type)
2285 # This node is a gateway.
2287 def node_is_router():
2290 # If there are any routers found in the config, then this will be true
2291 # and all nodes will load kptlrouter.
def node_needs_router():
    """True when this node must load kptlrouter: either some router
    exists in the config (needs_router) or this node is one (is_router)."""
    if needs_router:
        return needs_router
    return is_router
2296 # list of (nettype, gw, tgt_cluster_id, lo, hi)
2297 # Currently, these local routes are only added to kptlrouter route
2298 # table if they are needed to connect to a specific server. This
2299 # should be changed so all available routes are loaded, and the
2300 # ptlrouter can make all the decisions.
2303 def find_local_routes(lustre):
2304 """ Scan the lustre config looking for routers . Build list of
2306 global local_routes, needs_router
2308 list = lustre.lookup_class('node')
2310 if router.get_val_int('router', 0):
2312 for (local_type, local_cluster_id, local_nid) in local_clusters:
2314 for netuuid in router.get_networks():
2315 db = router.lookup(netuuid)
2316 if (local_type == db.get_val('nettype') and
2317 local_cluster_id == db.get_val('clusterid')):
2318 gw = db.get_val('nid')
2321 debug("find_local_routes: gw is", gw)
2322 for route in router.get_local_routes(local_type, gw):
2323 local_routes.append(route)
2324 debug("find_local_routes:", local_routes)
2327 def choose_local_server(srv_list):
2328 for srv in srv_list:
2329 if local_cluster(srv.net_type, srv.cluster_id):
2332 def local_cluster(net_type, cluster_id):
2333 for cluster in local_clusters:
2334 if net_type == cluster[0] and cluster_id == cluster[1]:
2338 def local_interface(net_type, cluster_id, nid):
2339 for cluster in local_clusters:
2340 if (net_type == cluster[0] and cluster_id == cluster[1]
2341 and nid == cluster[2]):
2345 def find_route(srv_list):
2347 frm_type = local_clusters[0][0]
2348 for srv in srv_list:
2349 debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
2350 to_type = srv.net_type
2352 cluster_id = srv.cluster_id
2353 debug ('looking for route to', to_type, to)
2354 for r in local_routes:
2355 debug("find_route: ", r)
2356 if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
2357 result.append((srv, r))
2360 def get_active_target(db):
2361 target_uuid = db.getUUID()
2362 target_name = db.getName()
2363 node_name = get_select(target_name)
2365 tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
2367 tgt_dev_uuid = db.get_first_ref('active')
2370 def get_server_by_nid_uuid(db, nid_uuid):
2371 for n in db.lookup_class("network"):
2373 if net.nid_uuid == nid_uuid:
2377 ############################################################
2381 type = db.get_class()
2382 debug('Service:', type, db.getName(), db.getUUID())
2387 n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
2388 elif type == 'network':
2390 elif type == 'routetbl':
2394 elif type == 'cobd':
2396 elif type == 'mdsdev':
2398 elif type == 'mountpoint':
2400 elif type == 'echoclient':
2402 elif type == 'mgmt':
2405 panic("unknown service type:", type)
2409 # Prepare the system to run lustre using a particular profile
2410 # in a the configuration.
2411 # * load & the modules
2412 # * setup networking for the current node
2413 # * make sure partitions are in place and prepared
2414 # * initialize devices with lctl
2415 # Levels is important, and needs to be enforced.
2416 def for_each_profile(db, prof_list, operation):
2417 for prof_uuid in prof_list:
2418 prof_db = db.lookup(prof_uuid)
2420 panic("profile:", prof_uuid, "not found.")
2421 services = getServices(prof_db)
2424 def doWriteconf(services):
2428 if s[1].get_class() == 'mdsdev':
2429 n = newService(s[1])
2432 def doSetup(services):
2436 n = newService(s[1])
2439 def doModules(services):
2443 n = newService(s[1])
2446 def doCleanup(services):
2451 n = newService(s[1])
2452 if n.safe_to_clean():
2455 def doUnloadModules(services):
2460 n = newService(s[1])
2461 if n.safe_to_clean_modules():
2466 def doHost(lustreDB, hosts):
2467 global is_router, local_node_name
2470 node_db = lustreDB.lookup_name(h, 'node')
2474 panic('No host entry found.')
2476 local_node_name = node_db.get_val('name', 0)
2477 is_router = node_db.get_val_int('router', 0)
2478 lustre_upcall = node_db.get_val('lustreUpcall', '')
2479 portals_upcall = node_db.get_val('portalsUpcall', '')
2480 timeout = node_db.get_val_int('timeout', 0)
2481 ptldebug = node_db.get_val('ptldebug', '')
2482 subsystem = node_db.get_val('subsystem', '')
2484 find_local_clusters(node_db)
2486 find_local_routes(lustreDB)
2488 # Two step process: (1) load modules, (2) setup lustre
2489 # if not cleaning, load modules first.
2490 prof_list = node_db.get_refs('profile')
2492 if config.write_conf:
2493 for_each_profile(node_db, prof_list, doModules)
2495 for_each_profile(node_db, prof_list, doWriteconf)
2496 for_each_profile(node_db, prof_list, doUnloadModules)
2499 elif config.recover:
2500 if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
2501 raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
2502 "--client_uuid <UUID> --conn_uuid <UUID>")
2503 doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
2505 elif config.cleanup:
2506 if not mod_loaded('portals'):
2510 # the command line can override this value
2512 # ugly hack, only need to run lctl commands for --dump
2513 if config.lctl_dump or config.record:
2514 for_each_profile(node_db, prof_list, doCleanup)
2517 sys_set_timeout(timeout)
2518 sys_set_ptldebug(ptldebug)
2519 sys_set_subsystem(subsystem)
2520 sys_set_lustre_upcall(lustre_upcall)
2521 sys_set_portals_upcall(portals_upcall)
2523 for_each_profile(node_db, prof_list, doCleanup)
2524 for_each_profile(node_db, prof_list, doUnloadModules)
2528 # ugly hack, only need to run lctl commands for --dump
2529 if config.lctl_dump or config.record:
2530 sys_set_timeout(timeout)
2531 sys_set_lustre_upcall(lustre_upcall)
2532 for_each_profile(node_db, prof_list, doSetup)
2536 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
2537 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
2539 for_each_profile(node_db, prof_list, doModules)
2541 sys_set_debug_path()
2542 sys_set_ptldebug(ptldebug)
2543 sys_set_subsystem(subsystem)
2544 script = config.gdb_script
2545 run(lctl.lctl, ' modules >', script)
2547 log ("The GDB module script is in", script)
2548 # pause, so user has time to break and
2551 sys_set_timeout(timeout)
2552 sys_set_lustre_upcall(lustre_upcall)
2553 sys_set_portals_upcall(portals_upcall)
2555 for_each_profile(node_db, prof_list, doSetup)
2558 def doRecovery(lustreDB, lctl, tgt_uuid, client_uuid, nid_uuid):
2559 tgt = lustreDB.lookup(tgt_uuid)
2561 raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
2562 new_uuid = get_active_target(tgt)
2564 raise Lustre.LconfError("doRecovery: no active target found for: " +
2566 net = choose_local_server(get_ost_net(lustreDB, new_uuid))
2568 raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
2570 log("Reconnecting", tgt_uuid, " to ", net.nid_uuid);
2572 oldnet = get_server_by_nid_uuid(lustreDB, nid_uuid)
2575 lctl.disconnect(oldnet)
2576 except CommandError, e:
2577 log("recover: disconnect", nid_uuid, "failed: ")
2582 except CommandError, e:
2583 log("recover: connect failed")
2586 lctl.recover(client_uuid, net.nid_uuid)
2589 def setupModulePath(cmd, portals_dir = PORTALS_DIR):
2590 base = os.path.dirname(cmd)
2591 if development_mode():
2592 if not config.lustre:
2593 debug('using objdir module paths')
2594 config.lustre = (os.path.join(base, ".."))
2595 # normalize the portals dir, using command line arg if set
2597 portals_dir = config.portals
2598 dir = os.path.join(config.lustre, portals_dir)
2599 config.portals = dir
2600 debug('config.portals', config.portals)
2601 elif config.lustre and config.portals:
2603 # if --lustre and --portals, normalize portals
# can ignore PORTALS_DIR here, since it is probably useless here
2605 config.portals = os.path.join(config.lustre, config.portals)
2606 debug('config.portals B', config.portals)
2608 def sysctl(path, val):
2609 debug("+ sysctl", path, val)
2613 fp = open(os.path.join('/proc/sys', path), 'w')
def sys_set_debug_path():
    """Tell portals where to dump the kernel debug log."""
    sysctl('portals/debug_path', config.debug_path)
2623 def sys_set_lustre_upcall(upcall):
2624 # the command overrides the value in the node config
2625 if config.lustre_upcall:
2626 upcall = config.lustre_upcall
2628 upcall = config.upcall
2630 lctl.set_lustre_upcall(upcall)
2632 def sys_set_portals_upcall(upcall):
2633 # the command overrides the value in the node config
2634 if config.portals_upcall:
2635 upcall = config.portals_upcall
2637 upcall = config.upcall
2639 sysctl('portals/upcall', upcall)
def sys_set_timeout(timeout):
    """Set the lustre recovery timeout via lctl.

    A positive --timeout on the command line overrides the node-config
    value passed in; nothing is done unless a positive timeout results.
    """
    # the command overrides the value in the node config
    if config.timeout and config.timeout > 0:
        timeout = config.timeout
    # idiom fix: identity comparison with None (was `timeout != None`)
    if timeout is not None and timeout > 0:
        lctl.set_timeout(timeout)
def sys_tweak_socknal():
    """With --single_socket, tell socknal to use a single socket per
    peer instead of a typed bundle."""
    if not config.single_socket:
        return
    sysctl("socknal/typed", 0)
2652 def sys_optimize_elan ():
2653 procfiles = ["/proc/elan/config/eventint_punt_loops",
2654 "/proc/qsnet/elan3/config/eventint_punt_loops",
2655 "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]
2657 if os.access(p, os.W_OK):
2658 run ("echo 1 > " + p)
2660 def sys_set_ptldebug(ptldebug):
2662 ptldebug = config.ptldebug
2665 val = eval(ptldebug, ptldebug_names)
2666 val = "0x%x" % (val)
2667 sysctl('portals/debug', val)
2668 except NameError, e:
2671 def sys_set_subsystem(subsystem):
2672 if config.subsystem:
2673 subsystem = config.subsystem
2676 val = eval(subsystem, subsystem_names)
2677 val = "0x%x" % (val)
2678 sysctl('portals/subsystem_debug', val)
2679 except NameError, e:
2682 def sys_set_netmem_max(path, max):
2683 debug("setting", path, "to at least", max)
2691 fp = open(path, 'w')
2692 fp.write('%d\n' %(max))
def sys_make_devices():
    """Create the portals and obd control device nodes if absent."""
    for node, mknod_cmd in (('/dev/portals', 'mknod /dev/portals c 10 240'),
                            ('/dev/obd', 'mknod /dev/obd c 10 241')):
        if not os.access(node, os.R_OK):
            run(mknod_cmd)
2703 # Add dir to the global PATH, if not already there.
2704 def add_to_path(new_dir):
2705 syspath = string.split(os.environ['PATH'], ':')
2706 if new_dir in syspath:
2708 os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
2710 def default_debug_path():
2711 path = '/tmp/lustre-log'
2712 if os.path.isdir('/r'):
2717 def default_gdb_script():
2718 script = '/tmp/ogdb'
2719 if os.path.isdir('/r'):
2720 return '/r' + script
2724 # make sure the filesystem is valid
def verify_devpath(devpath, name):
    """Sanity-check the backing store for Lustre service *name*.

    Runs dumpe2fs on *devpath* and panics with user guidance when the
    device is not a readable ext3 filesystem, or when it holds an ext3
    external journal instead of a real filesystem.

    NOTE(review): the extracted copy of this function had a statement
    fused with its `if` (`... .search(line) if res:`) and several
    control-flow lines missing; the structure below is the minimal
    repair -- confirm against upstream lconf.
    """
    ret, out = runcmd("/sbin/dumpe2fs -h " + devpath)
    if ret:
        panic("The back-end storage device \""+devpath+"\", specified for Lustre service \""+name+"\", does not appear to contain a valid ext3 file system.\n\nThis could have several causes:\n- there is a mistake in the configuration file\n- the drivers for that device are not loaded\n- you forgot to reformat the device before starting Lustre for the first time\n\nIf you want to reformat, you can re-run this lconf command with --reformat, which will REFORMAT ALL CONFIGURED LUSTRE DEVICES ON THIS NODE, AND DESTROY ALL DATA ON THOSE DEVICES. Please use extreme caution with the --reformat command.\n")
    for line in out:
        res = re.compile("^Filesystem.features:.*journal_dev.").search(line)
        if res:
            panic("The back-end storage device \""+devpath+"\", specified for Lustre service \""+name+"\", contains an ext3 external journal, instead of a proper ext3 file system.\n\nIt's possible that you meant to specify a different device, or that you meant to reformat this device before using it. If you are intending to use a file system with an external journal, then the Lustre configuration file should point to the device that contains the main ext3 file system, not the journal device.\n")
DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
# ensure basic elements are in the system path
def sanitise_path():
    """Make sure the standard sbin/bin directories are on $PATH."""
    for dir in DEFAULT_PATH:
        add_to_path(dir)
# global hack for the --select handling
tgt_select = {}
def init_select(args):
    """Populate the global tgt_select map from --select arguments.

    args -- list of strings of the form "service=node[,service2=node2]"
            (i.e. [service=nodeA,service2=nodeB service3=nodeC])
    Each service name is mapped to its preferred node in tgt_select.
    """
    global tgt_select
    for arg in args:
        for entry in arg.split(','):
            srv, node = entry.split('=')
            tgt_select[srv] = node
def get_select(srv):
    """Return the node selected for service srv, or None if unselected."""
    if tgt_select.has_key(srv):
        return tgt_select[srv]
    return None
# Shorthand for the option-type constants of the option parser.
FLAG = Lustre.Options.FLAG
PARAM = Lustre.Options.PARAM
INTPARAM = Lustre.Options.INTPARAM
PARAMLIST = Lustre.Options.PARAMLIST

# Command-line option table: (name[,short], help[, type[, default]]).
# NOTE(review): type/default fields and the interior of the maxlevel help
# text were lost in this region; restored from the upstream lconf option
# table -- verify against revision history.
lconf_options = [
    ('verbose,v', "Print system commands as they are run"),
    ('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM),
    ('config', "Cluster config name used for LDAP query", PARAM),
    ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
    ('node', "Load config for <nodename>", PARAM),
    ('cleanup,d', "Cleans up config. (Shutdown)"),
    ('force,f', "Forced unmounting and/or obd detach during cleanup",
     FLAG, 0),
    ('single_socket', "socknal option: only use one socket instead of bundle",
     FLAG, 0),
    ('failover',"""Used to shut down without saving state.
                   This will allow this node to "give up" a service to a
                   another node for failover purposes. This will not
                   be a clean shutdown.""",
     FLAG, 0),
    ('gdb', """Prints message after creating gdb module script
                    and sleeps for 5 seconds."""),
    ('noexec,n', """Prints the commands and steps that will be run for a
                    config without executing them. This can used to check if a
                    config file is doing what it should be doing"""),
    ('nomod', "Skip load/unload module step."),
    ('nosetup', "Skip device setup/cleanup step."),
    ('reformat', "Reformat all devices (without question)"),
    ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
    ('mountfsoptions', "Additional options for mount fs command line", PARAM),
    ('clientoptions', "Additional options for Lustre", PARAM),
    ('dump', "Dump the kernel debug log to file before portals is unloaded",
     PARAM),
    ('write_conf', "Save all the client config information on mds."),
    ('record', "Write config information on mds."),
    ('record_log', "Name of config record log.", PARAM),
    ('record_device', "MDS device name that will record the config commands",
     PARAM),
    ('minlevel', "Minimum level of services to configure/cleanup",
     INTPARAM, 0),
    ('maxlevel', """Maximum level of services to configure/cleanup
                    Levels are aproximatly like:
                            10 - network
                            20 - device, ldlm
                            30 - osd, mdd
                            40 - mds, ost
                            70 - mountpoint, echo_client, osc, mdc, lov""",
     INTPARAM, 100),
    ('lustre', """Base directory of lustre sources. This parameter will
                  cause lconf to load modules from a source tree.""", PARAM),
    ('portals', """Portals source directory. If this is a relative path,
                   then it is assumed to be relative to lustre. """, PARAM),
    ('timeout', "Set recovery timeout", INTPARAM),
    ('upcall', "Set both portals and lustre upcall script", PARAM),
    ('lustre_upcall', "Set lustre upcall script", PARAM),
    ('portals_upcall', "Set portals upcall script", PARAM),
    ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
    ('ptldebug', "Set the portals debug level", PARAM),
    ('subsystem', "Set the portals debug subsystem", PARAM),
    ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
    ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
    # Client recovery options
    ('recover', "Recover a device"),
    ('group', "The group of devices to configure or cleanup", PARAM),
    ('tgt_uuid', "The failed target (required for recovery)", PARAM),
    ('client_uuid', "The failed client (required for recovery)", PARAM),
    ('conn_uuid', "The failed connection (required for recovery)", PARAM),

    ('inactive', """The name of an inactive service, to be ignored during
                    mounting (currently OST-only). Can be repeated.""",
     PARAMLIST),
    ]
2834 global lctl, config, toplustreDB, CONFIG_FILE
2836 # in the upcall this is set to SIG_IGN
2837 signal.signal(signal.SIGCHLD, signal.SIG_DFL)
2839 cl = Lustre.Options("lconf", "config.xml", lconf_options)
2841 config, args = cl.parse(sys.argv[1:])
2842 except Lustre.OptionError, e:
2846 setupModulePath(sys.argv[0])
2848 host = socket.gethostname()
2850 # the PRNG is normally seeded with time(), which is not so good for starting
2851 # time-synchronized clusters
2852 input = open('/dev/urandom', 'r')
2854 print 'Unable to open /dev/urandom!'
2856 seed = input.read(32)
2862 init_select(config.select)
2865 # allow config to be fetched via HTTP, but only with python2
2866 if sys.version[0] != '1' and args[0].startswith('http://'):
2869 config_file = urllib2.urlopen(args[0])
2870 except (urllib2.URLError, socket.error), err:
2871 if hasattr(err, 'args'):
2873 print "Could not access '%s': %s" %(args[0], err)
2875 elif not os.access(args[0], os.R_OK):
2876 print 'File not found or readable:', args[0]
2880 config_file = open(args[0], 'r')
2882 dom = xml.dom.minidom.parse(config_file)
2884 panic("%s does not appear to be a config file." % (args[0]))
2885 sys.exit(1) # make sure to die here, even in debug mode.
2887 CONFIG_FILE = args[0]
2888 lustreDB = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
2889 if not config.config:
2890 config.config = os.path.basename(args[0])# use full path?
2891 if config.config[-4:] == '.xml':
2892 config.config = config.config[:-4]
2893 elif config.ldapurl:
2894 if not config.config:
2895 panic("--ldapurl requires --config name")
2896 dn = "config=%s,fs=lustre" % (config.config)
2897 lustreDB = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
2898 elif config.ptldebug or config.subsystem:
2899 sys_set_ptldebug(None)
2900 sys_set_subsystem(None)
2903 print 'Missing config file or ldap URL.'
2904 print 'see lconf --help for command summary'
2907 toplustreDB = lustreDB
2909 ver = lustreDB.get_version()
2911 panic("No version found in config data, please recreate.")
2912 if ver != Lustre.CONFIG_VERSION:
2913 panic("Config version", ver, "does not match lconf version",
2914 Lustre.CONFIG_VERSION)
2918 node_list.append(config.node)
2921 node_list.append(host)
2922 node_list.append('localhost')
2924 debug("configuring for host: ", node_list)
2927 config.debug_path = config.debug_path + '-' + host
2928 config.gdb_script = config.gdb_script + '-' + host
2930 lctl = LCTLInterface('lctl')
2932 if config.lctl_dump:
2933 lctl.use_save_file(config.lctl_dump)
2936 if not (config.record_device and config.record_log):
2937 panic("When recording, both --record_log and --record_device must be specified.")
2938 lctl.clear_log(config.record_device, config.record_log)
2939 lctl.record(config.record_device, config.record_log)
2941 doHost(lustreDB, node_list)
2946 if __name__ == "__main__":
2949 except Lustre.LconfError, e:
2951 # traceback.print_exc(file=sys.stdout)
2953 except CommandError, e:
2957 if first_cleanup_error:
2958 sys.exit(first_cleanup_error)