3 # Copyright (C) 2002-2003 Cluster File Systems, Inc.
4 # Authors: Robert Read <rread@clusterfs.com>
5 # Mike Shaver <shaver@clusterfs.com>
6 # This file is part of Lustre, http://www.lustre.org.
8 # Lustre is free software; you can redistribute it and/or
9 # modify it under the terms of version 2 of the GNU General Public
10 # License as published by the Free Software Foundation.
12 # Lustre is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Lustre; if not, write to the Free Software
19 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 # lconf - lustre configuration tool
23 # lconf is the main driver script for starting and stopping
24 # lustre filesystem services.
26 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import sys, getopt, types
29 import string, os, stat, popen2, socket, time, random, fcntl, select
30 import re, exceptions, signal, traceback
31 import xml.dom.minidom
33 if sys.version[0] == '1':
34 from FCNTL import F_GETFL, F_SETFL
36 from fcntl import F_GETFL, F_SETFL
38 PYMOD_DIR = "/usr/lib/lustre/python"
40 def development_mode():
41 base = os.path.dirname(sys.argv[0])
42 if os.access(base+"/Makefile", os.R_OK):
46 if development_mode():
47 sys.path.append('../utils')
49 sys.path.append(PYMOD_DIR)
55 DEFAULT_TCPBUF = 8388608
58 # Maximum number of devices to search for.
59 # (the /dev/loop* nodes need to be created beforehand)
60 MAX_LOOP_DEVICES = 256
61 PORTALS_DIR = 'portals'
63 # Needed to call lconf --record
66 # Please keep these in sync with the values in portals/kp30.h
78 "warning" : (1 << 10),
82 "portals" : (1 << 14),
84 "dlmtrace" : (1 << 16),
88 "rpctrace" : (1 << 20),
89 "vfstrace" : (1 << 21),
96 "undefined" : (1 << 0),
106 "portals" : (1 << 10),
107 "socknal" : (1 << 11),
108 "qswnal" : (1 << 12),
109 "pinger" : (1 << 13),
110 "filter" : (1 << 14),
116 "ptlrouter" : (1 << 20),
121 "confobd" : (1 << 25),
# Remember the first failure seen during cleanup, so it can later be
# reported as the overall exit status.
first_cleanup_error = 0

def cleanup_error(rc):
    """Record rc as the cleanup exit status, unless one is already set."""
    global first_cleanup_error
    if first_cleanup_error:
        return
    first_cleanup_error = rc
134 # ============================================================
135 # debugging and error funcs
def fixme(msg = "this feature"):
    """Raise a LconfError reporting that a feature is not implemented.

    msg names the missing feature; defaults to a generic placeholder.
    """
    # Use the call-style raise (as done elsewhere in this file) instead
    # of the deprecated two-argument "raise Cls, arg" statement form.
    raise Lustre.LconfError(msg + ' not implemented yet.')
141 msg = string.join(map(str,args))
142 if not config.noexec:
143 raise Lustre.LconfError(msg)
148 msg = string.join(map(str,args))
153 print string.strip(s)
157 msg = string.join(map(str,args))
160 # ack, python's builtin int() does not support '0x123' syntax.
161 # eval can do it, although what a hack!
165 return eval(s, {}, {})
168 except SyntaxError, e:
169 raise ValueError("not a number")
171 raise ValueError("not a number")
173 # ============================================================
174 # locally defined exceptions
class CommandError (exceptions.Exception):
    """Raised when an external command (lctl, mkfs, losetup, ...) fails.

    Carries the failing command's name, its error output (either a
    single string or a list of output lines), and its exit status.
    """
    def __init__(self, cmd_name, cmd_err, rc=None):
        # Name of the failing command, used as a prefix when reporting.
        self.cmd_name = cmd_name
        # Error output; the reporting code below handles both the
        # string and the list-of-lines form.
        self.cmd_err = cmd_err
        # NOTE(review): this listing is truncated here -- the original
        # presumably also stores rc (self.rc is read when reporting).
183 if type(self.cmd_err) == types.StringType:
185 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
187 print "! %s: %s" % (self.cmd_name, self.cmd_err)
188 elif type(self.cmd_err) == types.ListType:
190 print "! %s (error %d):" % (self.cmd_name, self.rc)
192 print "! %s:" % (self.cmd_name)
193 for s in self.cmd_err:
194 print "> %s" %(string.strip(s))
199 # ============================================================
200 # handle daemons, like the acceptor
202 """ Manage starting and stopping a daemon. Assumes daemon manages
203 it's own pid file. """
205 def __init__(self, cmd):
211 log(self.command, "already running.")
213 self.path = find_prog(self.command)
215 panic(self.command, "not found.")
216 ret, out = runcmd(self.path +' '+ self.command_line())
218 # FIXME: add this check can only narrow the race but can not avoid it
219 # completely, so I don't apply this method on inserting module.
220 if ret and not self.running():
221 raise CommandError(self.path, out, ret)
225 pid = self.read_pidfile()
227 log ("killing process", pid)
229 #time.sleep(1) # let daemon die
231 log("unable to kill", self.command, e)
233 log("unable to kill", self.command)
236 pid = self.read_pidfile()
246 def read_pidfile(self):
248 fp = open(self.pidfile(), 'r')
255 print "WARNING: invalid pid in %s, removed" % self.pidfile()
256 print "WARNING: You may need to stop acceptor by yourself and then unload the module libcfs"
257 os.unlink(self.pidfile())
260 def clean_pidfile(self):
261 """ Remove a stale pidfile """
262 log("removing stale pidfile:", self.pidfile())
264 os.unlink(self.pidfile())
266 log(self.pidfile(), e)
268 class AcceptorHandler(DaemonHandler):
269 def __init__(self, port, net_type):
270 DaemonHandler.__init__(self, "acceptor")
275 return "/var/run/%s-%d.pid" % (self.command, self.port)
def command_line(self):
    """Build the acceptor's command-line arguments: flags, then port."""
    args = (self.flags, self.port)
    return string.join(map(str, args))
282 # start the acceptors
284 if config.lctl_dump or config.record:
286 for port in acceptors.keys():
287 daemon = acceptors[port]
288 if not daemon.running():
291 def run_one_acceptor(port):
292 if config.lctl_dump or config.record:
294 if acceptors.has_key(port):
295 daemon = acceptors[port]
296 if not daemon.running():
299 panic("run_one_acceptor: No acceptor defined for port:", port)
301 def stop_acceptor(port):
302 if acceptors.has_key(port):
303 daemon = acceptors[port]
308 # ============================================================
309 # handle lctl interface
312 Manage communication with lctl
315 def __init__(self, cmd):
317 Initialize close by finding the lctl binary.
319 self.lctl = find_prog(cmd)
321 self.record_device = ''
324 debug('! lctl not found')
327 raise CommandError('lctl', "unable to find lctl binary.")
def use_save_file(self, file):
    """Divert subsequent lctl command batches into the given dump file."""
    self.save_file = file
def record(self, dev_name, logname):
    """Begin recording lctl commands into config log logname on dev_name."""
    log("Recording log", logname, "on", dev_name)
    self.record_log = logname
    self.record_device = dev_name
def end_record(self):
    """Stop recording: announce it and clear the recording target state."""
    log("End recording log", self.record_log, "on", self.record_device)
    self.record_log = None
    self.record_device = None
def set_nonblock(self, fd):
    """Switch file descriptor fd into non-blocking (O_NDELAY) mode."""
    flags = fcntl.fcntl(fd, F_GETFL)
    fcntl.fcntl(fd, F_SETFL, flags | os.O_NDELAY)
349 the cmds are written to stdin of lctl
350 lctl doesn't return errors when run in script mode, so
352 should modify command line to accept multiple commands, or
353 create complex command line options
357 cmds = '\n dump ' + self.save_file + '\n' + cmds
358 elif self.record_device:
362 %s""" % (self.record_device, self.record_log, cmds)
364 debug("+", cmd_line, cmds)
365 if config.noexec: return (0, [])
367 child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
368 child.tochild.write(cmds + "\n")
369 child.tochild.close()
371 # From "Python Cookbook" from O'Reilly
372 outfile = child.fromchild
373 outfd = outfile.fileno()
374 self.set_nonblock(outfd)
375 errfile = child.childerr
376 errfd = errfile.fileno()
377 self.set_nonblock(errfd)
379 outdata = errdata = ''
382 ready = select.select([outfd,errfd],[],[]) # Wait for input
383 if outfd in ready[0]:
384 outchunk = outfile.read()
385 if outchunk == '': outeof = 1
386 outdata = outdata + outchunk
387 if errfd in ready[0]:
388 errchunk = errfile.read()
389 if errchunk == '': erreof = 1
390 errdata = errdata + errchunk
391 if outeof and erreof: break
392 # end of "borrowed" code
395 if os.WIFEXITED(ret):
396 rc = os.WEXITSTATUS(ret)
399 if rc or len(errdata):
400 raise CommandError(self.lctl, errdata, rc)
403 def runcmd(self, *args):
405 run lctl using the command line
407 cmd = string.join(map(str,args))
408 debug("+", self.lctl, cmd)
409 rc, out = run(self.lctl, cmd)
411 raise CommandError(self.lctl, out, rc)
415 def clear_log(self, dev, log):
416 """ clear an existing log """
421 quit """ % (dev, log)
424 def network(self, net, nid):
429 quit """ % (net, nid)
433 def add_interface(self, net, ip, netmask = ""):
434 """ add an interface """
438 quit """ % (net, ip, netmask)
441 # delete an interface
442 def del_interface(self, net, ip):
443 """ delete an interface """
450 # create a new connection
451 def add_uuid(self, net_type, uuid, nid):
452 cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type)
455 def add_peer(self, net_type, nid, hostaddr, port):
456 if net_type in ('tcp',) and not config.lctl_dump:
461 nid, hostaddr, port )
463 elif net_type in ('openib','iib',) and not config.lctl_dump:
471 def connect(self, srv):
472 self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
473 if srv.net_type in ('tcp','openib','iib',) and not config.lctl_dump:
475 hostaddr = string.split(srv.hostaddr[0], '/')[0]
476 self.add_peer(srv.net_type, srv.nid, hostaddr, srv.port)
479 def recover(self, dev_name, new_conn):
482 recover %s""" %(dev_name, new_conn)
485 # add a route to a range
486 def add_route(self, net, gw, lo, hi):
494 except CommandError, e:
498 def del_route(self, net, gw, lo, hi):
503 quit """ % (net, gw, lo, hi)
506 # add a route to a host
507 def add_route_host(self, net, uuid, gw, tgt):
508 self.add_uuid(net, uuid, tgt)
516 except CommandError, e:
520 # add a route to a range
521 def del_route_host(self, net, uuid, gw, tgt):
527 quit """ % (net, gw, tgt)
531 def del_peer(self, net_type, nid, hostaddr):
532 if net_type in ('tcp',) and not config.lctl_dump:
536 del_peer %s %s single_share
540 elif net_type in ('openib','iib',) and not config.lctl_dump:
544 del_peer %s single_share
549 # disconnect one connection
550 def disconnect(self, srv):
551 self.del_uuid(srv.nid_uuid)
552 if srv.net_type in ('tcp','openib','iib',) and not config.lctl_dump:
554 hostaddr = string.split(srv.hostaddr[0], '/')[0]
555 self.del_peer(srv.net_type, srv.nid, hostaddr)
557 def del_uuid(self, uuid):
565 def disconnectAll(self, net):
573 def attach(self, type, name, uuid):
576 quit""" % (type, name, uuid)
579 def setup(self, name, setup = ""):
583 quit""" % (name, setup)
587 # create a new device with lctl
588 def newdev(self, type, name, uuid, setup = ""):
589 self.attach(type, name, uuid);
591 self.setup(name, setup)
592 except CommandError, e:
593 self.cleanup(name, uuid, 0)
598 def cleanup(self, name, uuid, force, failover = 0):
599 if failover: force = 1
605 quit""" % (name, ('', 'force')[force],
606 ('', 'failover')[failover])
610 def lov_setup(self, name, uuid, desc_uuid, mdsuuid, stripe_cnt,
611 stripe_sz, stripe_off,
615 lov_setup %s %d %d %d %s %s
616 quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off,
621 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off,
625 lov_setconfig %s %d %d %d %s %s
626 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
630 def dump(self, dump_file):
633 quit""" % (dump_file)
636 # get list of devices
637 def device_list(self):
638 devices = '/proc/fs/lustre/devices'
640 if os.access(devices, os.R_OK):
642 fp = open(devices, 'r')
650 def lustre_version(self):
651 rc, out = self.runcmd('version')
655 def mount_option(self, profile, osc, mdc):
657 mount_option %s %s %s
658 quit""" % (profile, osc, mdc)
661 # delete mount options
662 def del_mount_option(self, profile):
668 def set_timeout(self, timeout):
674 # delete mount options
675 def set_lustre_upcall(self, upcall):
680 # ============================================================
681 # Various system-level functions
682 # (ideally moved to their own module)
684 # Run a command and return the output and status.
685 # stderr is sent to /dev/null, could use popen3 to
686 # save it if necessary
689 if config.noexec: return (0, [])
690 f = os.popen(cmd + ' 2>&1')
700 cmd = string.join(map(str,args))
703 # Run a command in the background.
704 def run_daemon(*args):
705 cmd = string.join(map(str,args))
707 if config.noexec: return 0
708 f = os.popen(cmd + ' 2>&1')
716 # Determine full path to use for an external command
717 # searches dirname(argv[0]) first, then PATH
719 syspath = string.split(os.environ['PATH'], ':')
720 cmdpath = os.path.dirname(sys.argv[0])
721 syspath.insert(0, cmdpath);
723 syspath.insert(0, os.path.join(config.portals, 'utils/'))
725 prog = os.path.join(d,cmd)
726 if os.access(prog, os.X_OK):
730 # Recursively look for file starting at base dir
731 def do_find_file(base, mod):
732 fullname = os.path.join(base, mod)
733 if os.access(fullname, os.R_OK):
735 for d in os.listdir(base):
736 dir = os.path.join(base,d)
737 if os.path.isdir(dir):
738 module = do_find_file(dir, mod)
742 def find_module(src_dir, dev_dir, modname):
743 modbase = src_dir +'/'+ dev_dir +'/'+ modname
744 for modext in '.ko', '.o':
745 module = modbase + modext
747 if os.access(module, os.R_OK):
753 # is the path a block device?
760 return stat.S_ISBLK(s[stat.ST_MODE])
762 # build fs according to type
764 def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
770 panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
772 # devsize is in 1k, and fs block count is in 4k
773 block_cnt = devsize/4
775 if fstype in ('ext3', 'extN', 'ldiskfs'):
776 # ext3 journal size is in megabytes
779 if not is_block(dev):
780 ret, out = runcmd("ls -l %s" %dev)
781 devsize = int(string.split(out[0])[4]) / 1024
783 # sfdisk works for symlink, hardlink, and realdev
784 ret, out = runcmd("sfdisk -s %s" %dev)
786 devsize = int(out[0])
788 # sfdisk -s will fail for too large block device,
789 # then, read the size of partition from /proc/partitions
791 # get the realpath of the device
792 # it may be the real device, such as /dev/hda7
793 # or the hardlink created via mknod for a device
794 if 'realpath' in dir(os.path):
795 real_dev = os.path.realpath(dev)
799 while os.path.islink(real_dev) and (link_count < 20):
800 link_count = link_count + 1
801 dev_link = os.readlink(real_dev)
802 if os.path.isabs(dev_link):
805 real_dev = os.path.join(os.path.dirname(real_dev), dev_link)
807 panic("Entountered too many symbolic links resolving block device:", dev)
809 # get the major and minor number of the realpath via ls
810 # it seems python(os.stat) does not return
811 # the st_rdev member of the stat structure
812 ret, out = runcmd("ls -l %s" %real_dev)
813 major = string.split(string.split(out[0])[4], ",")[0]
814 minor = string.split(out[0])[5]
816 # get the devsize from /proc/partitions with the major and minor number
817 ret, out = runcmd("cat /proc/partitions")
820 if string.split(line)[0] == major and string.split(line)[1] == minor:
821 devsize = int(string.split(line)[2])
824 if devsize > 1024 * 1024:
825 jsize = ((devsize / 102400) * 4)
828 if jsize: jopt = "-J size=%d" %(jsize,)
829 if isize: iopt = "-I %d" %(isize,)
830 mkfs = 'mkfs.ext2 -j -b 4096 '
831 if not isblock or config.force:
833 elif fstype == 'reiserfs':
834 # reiserfs journal size is in blocks
835 if jsize: jopt = "--journal_size %d" %(jsize,)
836 mkfs = 'mkreiserfs -ff'
838 panic('unsupported fs type: ', fstype)
840 if config.mkfsoptions != None:
841 mkfs = mkfs + ' ' + config.mkfsoptions
842 if mkfsoptions != None:
843 mkfs = mkfs + ' ' + mkfsoptions
844 (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt)
846 panic("Unable to build fs:", dev, string.join(out))
# enable hash tree indexing on the filesystem
848 if fstype in ('ext3', 'extN', 'ldiskfs'):
849 htree = 'echo "feature FEATURE_C5" | debugfs -w'
850 (ret, out) = run (htree, dev)
852 panic("Unable to enable htree:", dev)
854 # some systems use /dev/loopN, some /dev/loop/N
858 if not os.access(loop + str(0), os.R_OK):
860 if not os.access(loop + str(0), os.R_OK):
861 panic ("can't access loop devices")
# find the loop device assigned to the file
867 for n in xrange(0, MAX_LOOP_DEVICES):
869 if os.access(dev, os.R_OK):
870 (stat, out) = run('losetup', dev)
871 if out and stat == 0:
872 m = re.search(r'\((.*)\)', out[0])
873 if m and file == m.group(1):
879 # create file if necessary and assign the first free loop device
880 def init_loop(file, size, fstype, journal_size, inode_size, mkfsoptions, reformat):
881 dev = find_loop(file)
883 print 'WARNING file:', file, 'already mapped to', dev
885 if reformat or not os.access(file, os.R_OK | os.W_OK):
887 panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (file,size))
888 (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size,
891 panic("Unable to create backing store:", file)
892 mkfs(file, size, fstype, journal_size, inode_size, mkfsoptions, isblock=0)
895 # find next free loop
896 for n in xrange(0, MAX_LOOP_DEVICES):
898 if os.access(dev, os.R_OK):
899 (stat, out) = run('losetup', dev)
901 run('losetup', dev, file)
904 print "out of loop devices"
906 print "out of loop devices"
909 # undo loop assignment
910 def clean_loop(file):
911 dev = find_loop(file)
913 ret, out = run('losetup -d', dev)
915 log('unable to clean loop device:', dev, 'for file:', file)
918 # determine if dev is formatted as a <fstype> filesystem
919 def need_format(fstype, dev):
920 # FIXME don't know how to implement this
923 # initialize a block device if needed
924 def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
925 inode_size, mkfsoptions):
926 if config.noexec: return dev
927 if not is_block(dev):
928 dev = init_loop(dev, size, fstype, journal_size, inode_size,
929 mkfsoptions, reformat)
930 elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
931 mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
934 # panic("device:", dev,
935 # "not prepared, and autoformat is not set.\n",
936 # "Rerun with --reformat option to format ALL filesystems")
941 """lookup IP address for an interface"""
942 rc, out = run("/sbin/ifconfig", iface)
945 addr = string.split(out[1])[1]
946 ip = string.split(addr, ':')[1]
def def_mount_options(fstype, target):
    """Return the default mount options for fstype on the given target.

    target is 'mds' or 'ost'.
    """
    if fstype == 'ext3' or fstype == 'ldiskfs':
        mountfsoptions = "errors=remount-ro"
        # 2.4 kernels support the asyncdel option for OST filesystems.
        if target == 'ost' and sys_get_branch() == '2.4':
            mountfsoptions = "%s,asyncdel" % (mountfsoptions)
        return mountfsoptions
    # NOTE(review): listing truncated here; the original presumably
    # returns a default value for other fstypes -- confirm upstream.
958 def sys_get_elan_position_file():
959 procfiles = ["/proc/elan/device0/position",
960 "/proc/qsnet/elan4/device0/position",
961 "/proc/qsnet/elan3/device0/position"]
963 if os.access(p, os.R_OK):
967 def sys_get_local_nid(net_type, wildcard, cluster_id):
968 """Return the local nid."""
970 if sys_get_elan_position_file():
971 local = sys_get_local_address('elan', '*', cluster_id)
973 local = sys_get_local_address(net_type, wildcard, cluster_id)
976 def sys_get_local_address(net_type, wildcard, cluster_id):
977 """Return the local address for the network type."""
979 if net_type in ('tcp','openib','iib',):
981 iface, star = string.split(wildcard, ':')
982 local = if2addr(iface)
984 panic("unable to determine ip for:", wildcard)
986 host = socket.gethostname()
987 local = socket.gethostbyname(host)
988 elif net_type == 'elan':
989 # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()'
990 f = sys_get_elan_position_file()
992 panic ("unable to determine local Elan ID")
995 lines = fp.readlines()
1003 nid = my_int(cluster_id) + my_int(elan_id)
1004 local = "%d" % (nid)
1005 except ValueError, e:
1009 elif net_type == 'lo':
1010 fixme("automatic local address for loopback")
1011 elif net_type == 'gm':
1012 fixme("automatic local address for GM")
1016 def sys_get_branch():
1017 """Returns kernel release"""
1019 fp = open('/proc/sys/kernel/osrelease')
1020 lines = fp.readlines()
1024 version = string.split(l)
1025 a = string.split(version[0], '.')
1026 return a[0] + '.' + a[1]
1031 def mod_loaded(modname):
1032 """Check if a module is already loaded. Look in /proc/modules for it."""
1034 fp = open('/proc/modules')
1035 lines = fp.readlines()
1037 # please forgive my tired fingers for this one
1038 ret = filter(lambda word, mod=modname: word == mod,
1039 map(lambda line: string.split(line)[0], lines))
1041 except Exception, e:
1044 # XXX: instead of device_list, ask for $name and see what we get
1045 def is_prepared(name):
1046 """Return true if a device exists for the name"""
1047 if config.lctl_dump:
1049 if (config.noexec or config.record) and config.cleanup:
1052 # expect this format:
1053 # 1 UP ldlm ldlm ldlm_UUID 2
1054 out = lctl.device_list()
1056 if name == string.split(s)[3]:
1058 except CommandError, e:
1062 def is_network_prepared():
1063 """If the any device exists, then assume that all networking
1064 has been configured"""
1065 out = lctl.device_list()
1068 def fs_is_mounted(path):
1069 """Return true if path is a mounted lustre filesystem"""
1071 fp = open('/proc/mounts')
1072 lines = fp.readlines()
1076 if a[1] == path and a[2] == 'lustre_lite':
1084 """Manage kernel modules"""
def __init__(self, lustre_dir, portals_dir):
    """Remember the two source trees and start with no modules queued."""
    self.portals_dir = portals_dir
    self.lustre_dir = lustre_dir
    # Ordered (src_dir, dev_dir, modname) tuples, loaded first-to-last.
    self.kmodule_list = []
def add_portals_module(self, dev_dir, modname):
    """Queue a module from the portals source tree for loading."""
    entry = (self.portals_dir, dev_dir, modname)
    self.kmodule_list.append(entry)
def add_lustre_module(self, dev_dir, modname):
    """Queue a module from the lustre source tree for loading."""
    entry = (self.lustre_dir, dev_dir, modname)
    self.kmodule_list.append(entry)
1098 def load_module(self):
1099 """Load all the modules in the list in the order they appear."""
1100 for src_dir, dev_dir, mod in self.kmodule_list:
1101 if mod_loaded(mod) and not config.noexec:
1103 log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
1105 module = find_module(src_dir, dev_dir, mod)
1107 panic('module not found:', mod)
1108 (rc, out) = run('/sbin/insmod', module)
1109 if rc and not mod_loaded(mod):
1110 raise CommandError('insmod', out, rc)
1112 (rc, out) = run('/sbin/modprobe', mod)
1113 if rc and not mod_loaded(mod):
1114 raise CommandError('modprobe', out, rc)
1117 def cleanup_module(self):
1118 """Unload the modules in the list in reverse order."""
1119 rev = self.kmodule_list
1121 for src_dir, dev_dir, mod in rev:
1122 if not mod_loaded(mod) and not config.noexec:
1125 if mod == 'portals' and config.dump:
1126 lctl.dump(config.dump)
1127 log('unloading module:', mod)
1128 (rc, out) = run('/sbin/rmmod', mod)
1130 log('! unable to unload module:', mod)
1133 # ============================================================
1134 # Classes to prepare and cleanup the various objects
1137 """ Base class for the rest of the modules. The default cleanup method is
1138 defined here, as well as some utilitiy funcs.
1140 def __init__(self, module_name, db):
1142 self.module_name = module_name
1143 self.name = self.db.getName()
1144 self.uuid = self.db.getUUID()
1147 self.kmod = kmod(config.lustre, config.portals)
def info(self, *args):
    """Print a status line identifying this module instance plus args."""
    msg = string.join(map(str,args))
    print self.module_name + ":", self.name, self.uuid, msg
1154 """ default cleanup, used for most modules """
1157 lctl.cleanup(self.name, self.uuid, config.force)
1158 except CommandError, e:
1159 log(self.module_name, "cleanup failed: ", self.name)
def add_portals_module(self, dev_dir, modname):
    """Queue a portals-tree module on this module's kmod list."""
    loader = self.kmod
    loader.add_portals_module(dev_dir, modname)
def add_lustre_module(self, dev_dir, modname):
    """Queue a lustre-tree module on this module's kmod list."""
    loader = self.kmod
    loader.add_lustre_module(dev_dir, modname)
def load_module(self):
    """Load all queued kernel modules, in insertion order."""
    loader = self.kmod
    loader.load_module()
def cleanup_module(self):
    """Unload this module's kernel modules, when that is safe to do."""
    if not self.safe_to_clean():
        return
    self.kmod.cleanup_module()
1180 def safe_to_clean(self):
def safe_to_clean_modules(self):
    """By default, unloading modules is safe exactly when cleanup is."""
    answer = self.safe_to_clean()
    return answer
1186 class Network(Module):
1187 def __init__(self,db):
1188 Module.__init__(self, 'NETWORK', db)
1189 self.net_type = self.db.get_val('nettype')
1190 self.nid = self.db.get_val('nid', '*')
1191 self.cluster_id = self.db.get_val('clusterid', "0")
1192 self.port = self.db.get_val_int('port', 0)
1195 self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
1197 panic("unable to set nid for", self.net_type, self.nid, cluster_id)
1198 self.generic_nid = 1
1199 debug("nid:", self.nid)
1201 self.generic_nid = 0
1203 self.nid_uuid = self.nid_to_uuid(self.nid)
1205 self.hostaddr = self.db.get_hostaddr()
1206 if len(self.hostaddr) == 0:
1207 self.hostaddr.append(self.nid)
1208 if '*' in self.hostaddr[0]:
1209 self.hostaddr[0] = sys_get_local_address(self.net_type, self.hostaddr[0], self.cluster_id)
1210 if not self.hostaddr[0]:
1211 panic("unable to set hostaddr for", self.net_type, self.hostaddr[0], self.cluster_id)
1212 debug("hostaddr:", self.hostaddr[0])
1214 self.add_portals_module("libcfs", 'libcfs')
1215 self.add_portals_module("portals", 'portals')
1216 if node_needs_router():
1217 self.add_portals_module("router", 'kptlrouter')
1218 if self.net_type == 'tcp':
1219 self.add_portals_module("knals/socknal", 'ksocknal')
1220 if self.net_type == 'elan':
1221 self.add_portals_module("knals/qswnal", 'kqswnal')
1222 if self.net_type == 'gm':
1223 self.add_portals_module("knals/gmnal", 'kgmnal')
1224 if self.net_type == 'openib':
1225 self.add_portals_module("knals/openibnal", 'kopenibnal')
1226 if self.net_type == 'iib':
1227 self.add_portals_module("knals/iibnal", 'kiibnal')
1228 if self.net_type == 'lo':
1229 self.add_portals_module("knals/lonal", 'klonal')
def nid_to_uuid(self, nid):
    """Map a network id to the UUID string lconf registers for it."""
    return "NID_%s_UUID" % (nid,)
1235 if is_network_prepared():
1237 self.info(self.net_type, self.nid, self.port)
1238 if not (config.record and self.generic_nid):
1239 lctl.network(self.net_type, self.nid)
1240 if self.net_type == 'tcp':
1242 for hostaddr in self.db.get_hostaddr():
1243 ip = string.split(hostaddr, '/')[0]
1244 if len(string.split(hostaddr, '/')) == 2:
1245 netmask = string.split(hostaddr, '/')[1]
1248 lctl.add_interface(self.net_type, ip, netmask)
1249 if self.net_type == 'elan':
1251 if self.port and node_is_router():
1252 run_one_acceptor(self.port)
1253 self.connect_peer_gateways()
1255 def connect_peer_gateways(self):
1256 for router in self.db.lookup_class('node'):
1257 if router.get_val_int('router', 0):
1258 for netuuid in router.get_networks():
1259 net = self.db.lookup(netuuid)
1261 if (gw.cluster_id == self.cluster_id and
1262 gw.net_type == self.net_type):
1263 if gw.nid != self.nid:
1266 def disconnect_peer_gateways(self):
1267 for router in self.db.lookup_class('node'):
1268 if router.get_val_int('router', 0):
1269 for netuuid in router.get_networks():
1270 net = self.db.lookup(netuuid)
1272 if (gw.cluster_id == self.cluster_id and
1273 gw.net_type == self.net_type):
1274 if gw.nid != self.nid:
1277 except CommandError, e:
1278 print "disconnect failed: ", self.name
def safe_to_clean(self):
    # Tearing the network down is only safe once no lustre devices
    # remain configured.
    return not is_network_prepared()
1286 self.info(self.net_type, self.nid, self.port)
1288 stop_acceptor(self.port)
1289 if node_is_router():
1290 self.disconnect_peer_gateways()
1291 if self.net_type == 'tcp':
1292 for hostaddr in self.db.get_hostaddr():
1293 ip = string.split(hostaddr, '/')[0]
1294 lctl.del_interface(self.net_type, ip)
1296 class RouteTable(Module):
1297 def __init__(self,db):
1298 Module.__init__(self, 'ROUTES', db)
1300 def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
1302 # only setup connections for tcp, openib, and iib NALs
1304 if not net_type in ('tcp','openib','iib',):
1307 # connect to target if route is to single node and this node is the gw
1308 if lo == hi and local_interface(net_type, gw_cluster_id, gw):
1309 if not local_cluster(net_type, tgt_cluster_id):
1310 panic("target", lo, " not on the local cluster")
1311 srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
1312 # connect to gateway if this node is not the gw
1313 elif (local_cluster(net_type, gw_cluster_id)
1314 and not local_interface(net_type, gw_cluster_id, gw)):
1315 srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
1320 panic("no server for nid", lo)
1323 return Network(srvdb)
1326 if is_network_prepared():
1329 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1330 lctl.add_route(net_type, gw, lo, hi)
1331 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
def safe_to_clean(self):
    # Routes may be removed only once no lustre devices remain
    # configured (same rule as for Network).
    return not is_network_prepared()
1339 if is_network_prepared():
1340 # the network is still being used, don't clean it up
1342 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1343 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1346 lctl.disconnect(srv)
1347 except CommandError, e:
1348 print "disconnect failed: ", self.name
1353 lctl.del_route(net_type, gw, lo, hi)
1354 except CommandError, e:
1355 print "del_route failed: ", self.name
1359 class Management(Module):
def __init__(self, db):
    """Set up the MGMT service module and its kernel-module dependencies."""
    Module.__init__(self, 'MGMT', db)
    # Core lustre stack first, then the management service itself.
    for dev_dir, mod in (('lvfs', 'lvfs'),
                         ('obdclass', 'obdclass'),
                         ('ptlrpc', 'ptlrpc'),
                         ('mgmt', 'mgmt_svc')):
        self.add_lustre_module(dev_dir, mod)
1368 if is_prepared(self.name):
1371 lctl.newdev("mgmt", self.name, self.uuid)
1373 def safe_to_clean(self):
1377 if is_prepared(self.name):
1378 Module.cleanup(self)
1380 # This is only needed to load the modules; the LDLM device
1381 # is now created automatically.
def __init__(self,db):
    """Register the kernel modules needed for the lock manager (LDLM)."""
    Module.__init__(self, 'LDLM', db)
    for dev_dir, mod in (('lvfs', 'lvfs'),
                         ('obdclass', 'obdclass'),
                         ('ptlrpc', 'ptlrpc')):
        self.add_lustre_module(dev_dir, mod)
1396 def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
1397 Module.__init__(self, 'LOV', db)
1398 if name_override != None:
1399 self.name = "lov_%s" % name_override
1400 self.add_lustre_module('lov', 'lov')
1401 self.mds_uuid = self.db.get_first_ref('mds')
1402 self.stripe_sz = self.db.get_val_int('stripesize', 1048576)
1403 self.stripe_off = self.db.get_val_int('stripeoffset', 0)
1404 self.pattern = self.db.get_val_int('stripepattern', 0)
1405 self.devlist = self.db.get_refs('obd')
1406 self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
1408 self.desc_uuid = self.uuid
1409 self.uuid = generate_client_uuid(self.name)
1410 self.fs_name = fs_name
1412 self.config_only = 1
1414 self.config_only = None
1415 mds= self.db.lookup(self.mds_uuid)
1416 self.mds_name = mds.getName()
1417 for obd_uuid in self.devlist:
1418 obd = self.db.lookup(obd_uuid)
1419 osc = get_osc(obd, self.uuid, fs_name)
1421 self.osclist.append(osc)
1423 panic('osc not found:', obd_uuid)
1426 if is_prepared(self.name):
1428 if self.config_only:
1429 panic("Can't prepare config_only LOV ", self.name)
1431 for osc in self.osclist:
1433 # Only ignore connect failures with --force, which
1434 # isn't implemented here yet.
1435 osc.prepare(ignore_connect_failure=0)
1436 except CommandError, e:
1437 print "Error preparing OSC %s\n" % osc.uuid
1439 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
1440 self.stripe_off, self.pattern, self.devlist, self.mds_name)
1441 lctl.lov_setup(self.name, self.uuid,
1442 self.desc_uuid, self.mds_name, self.stripe_cnt,
1443 self.stripe_sz, self.stripe_off, self.pattern,
1444 string.join(self.devlist))
1447 if is_prepared(self.name):
1448 Module.cleanup(self)
1449 if self.config_only:
1450 panic("Can't clean up config_only LOV ", self.name)
1451 for osc in self.osclist:
1454 def load_module(self):
1455 if self.config_only:
1456 panic("Can't load modules for config_only LOV ", self.name)
1457 for osc in self.osclist:
1460 Module.load_module(self)
def cleanup_module(self):
    """Unload this LOV's own modules, then each member OSC's modules."""
    # A config-only LOV never loaded anything, so refuse to unload.
    if self.config_only:
        panic("Can't cleanup modules for config_only LOV ", self.name)
    Module.cleanup_module(self)
    for child in self.osclist:
        child.cleanup_module()
# MDSDEV: the metadata server device. Reads device/fs parameters from the
# config db, derives a default inode size from the LOV stripe count, and
# drives setup (newdev), write_conf (config-log recording) and cleanup.
# NOTE(review): listing has elided lines (embedded numbering skips) and
# stripped indentation; code kept byte-identical.
1470 class MDSDEV(Module):
1471 def __init__(self,db):
1472 Module.__init__(self, 'MDSDEV', db)
1473 self.devpath = self.db.get_val('devpath','')
1474 self.size = self.db.get_val_int('devsize', 0)
1475 self.journal_size = self.db.get_val_int('journalsize', 0)
1476 self.fstype = self.db.get_val('fstype', '')
1477 self.nspath = self.db.get_val('nspath', '')
1478 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1479 self.mountfsoptions = self.db.get_val('mountfsoptions', '')
1480 # overwrite the orignal MDSDEV name and uuid with the MDS name and uuid
1481 target_uuid = self.db.get_first_ref('target')
1482 mds = self.db.lookup(target_uuid)
1483 self.name = mds.getName()
1484 self.filesystem_uuids = mds.get_refs('filesystem')
1485 # FIXME: if fstype not set, then determine based on kernel version
1486 self.format = self.db.get_val('autoformat', "no")
1487 if mds.get_val('failover', 0):
1488 self.failover_mds = 'f'
1490 self.failover_mds = 'n'
1491 active_uuid = get_active_target(mds)
1493 panic("No target device found:", target_uuid)
1494 if active_uuid == self.uuid:
1498 if self.active and config.group and config.group != mds.get_val('group'):
1501 self.inode_size = self.db.get_val_int('inodesize', 0)
# If the config gives no inode size, derive one from the default stripe
# count of this MDS's LOV (wider stripes need larger inodes to hold the EA).
1502 if self.inode_size == 0:
1503 # find the LOV for this MDS
1504 lovconfig_uuid = mds.get_first_ref('lovconfig')
1505 if not lovconfig_uuid:
1506 panic("No LOV config found for MDS ", mds.name)
1507 lovconfig = mds.lookup(lovconfig_uuid)
1508 lov_uuid = lovconfig.get_first_ref('lov')
1510 panic("No LOV found for lovconfig ", lovconfig.name)
# config_only=1: instantiate the LOV object purely to read its geometry.
1511 lov = LOV(self.db.lookup(lov_uuid), lov_uuid, 'FS_name', config_only = 1)
1513 # default stripe count controls default inode_size
1514 if (lov.stripe_cnt > 0):
1515 stripe_count = lov.stripe_cnt
1517 stripe_count = len(lov.devlist)
1518 if stripe_count > 77:
1519 self.inode_size = 4096
1520 elif stripe_count > 35:
1521 self.inode_size = 2048
1522 elif stripe_count > 13:
1523 self.inode_size = 1024
1524 elif stripe_count > 3:
1525 self.inode_size = 512
1527 self.inode_size = 256
1529 self.target_dev_uuid = self.uuid
1530 self.uuid = target_uuid
1533 self.add_lustre_module('mdc', 'mdc')
1534 self.add_lustre_module('osc', 'osc')
1535 self.add_lustre_module('lov', 'lov')
1536 self.add_lustre_module('mds', 'mds')
1537 if self.fstype == 'ldiskfs':
1538 self.add_lustre_module('ldiskfs', 'ldiskfs')
1540 self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
1542 def load_module(self):
1544 Module.load_module(self)
# prepare: attach the block device (never reformatting here), ensure the
# global MDT device exists, assemble mount options, and create the mds dev.
1547 if is_prepared(self.name):
1550 debug(self.uuid, "not active")
1553 # run write_conf automatically, if --reformat used
1555 self.info(self.devpath, self.fstype, self.size, self.format)
1557 # never reformat here
1558 blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
1559 self.format, self.journal_size, self.inode_size,
1561 if not is_prepared('MDT'):
1562 lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
1564 mountfsoptions = def_mount_options(self.fstype, 'mds')
# Mount options are layered: fstype defaults, then --mountfsoptions from
# the command line, then the per-device options from the config.
1566 if config.mountfsoptions:
1568 mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
1570 mountfsoptions = config.mountfsoptions
1571 if self.mountfsoptions:
1572 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1574 if self.mountfsoptions:
1576 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1578 mountfsoptions = self.mountfsoptions
1580 print 'MDS mount options: ' + mountfsoptions
1582 lctl.newdev("mds", self.name, self.uuid,
1583 setup ="%s %s %s %s" %(blkdev, self.fstype, self.name, mountfsoptions))
1584 except CommandError, e:
1586 panic("MDS is missing the config log. Need to run " +
1587 "lconf --write_conf.")
# write_conf: format/attach the device, then record client configuration
# llogs on the MDS — one setup log and one '-clean' teardown log per client.
1591 def write_conf(self):
1592 if is_prepared(self.name):
1594 self.info(self.devpath, self.fstype, self.format)
1595 blkdev = block_dev(self.devpath, self.size, self.fstype,
1596 config.reformat, self.format, self.journal_size,
1597 self.inode_size, self.mkfsoptions)
1598 lctl.newdev("mds", self.name, self.uuid,
1599 setup ="%s %s" %(blkdev, self.fstype))
1601 # record logs for the MDS lov
1602 for uuid in self.filesystem_uuids:
1603 log("recording clients for filesystem:", uuid)
1604 fs = self.db.lookup(uuid)
1605 obd_uuid = fs.get_first_ref('obd')
1606 client_uuid = generate_client_uuid(self.name)
1607 client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
1610 lctl.clear_log(self.name, self.name)
1611 lctl.record(self.name, self.name)
1613 lctl.mount_option(self.name, client.get_name(), "")
1617 lctl.clear_log(self.name, self.name + '-clean')
1618 lctl.record(self.name, self.name + '-clean')
1620 lctl.del_mount_option(self.name)
1625 # record logs for each client
1627 config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
1629 config_options = CONFIG_FILE
1631 for node_db in self.db.lookup_class('node'):
1632 client_name = node_db.getName()
1633 for prof_uuid in node_db.get_refs('profile'):
1634 prof_db = node_db.lookup(prof_uuid)
1635 # refactor this into a funtion to test "clientness"
1637 for ref_class, ref_uuid in prof_db.get_all_refs():
1638 if ref_class in ('mountpoint','echoclient'):
1639 debug("recording", client_name)
1640 old_noexec = config.noexec
# Re-invoke lconf itself (sys.argv[0]) in --record mode to capture
# each client's setup commands into a config log on this MDS device.
1642 noexec_opt = ('', '-n')
1643 ret, out = run (sys.argv[0],
1644 noexec_opt[old_noexec == 1],
1645 " -v --record --nomod",
1646 "--record_log", client_name,
1647 "--record_device", self.name,
1648 "--node", client_name,
1651 lctl.clear_log(self.name, client_name)
1654 panic("Record client log %s on %s failed" %(
1655 client_name, self.name))
1657 for s in out: log("record> ", string.strip(s))
1658 ret, out = run (sys.argv[0],
1659 noexec_opt[old_noexec == 1],
1660 "--cleanup -v --record --nomod",
1661 "--record_log", client_name + "-clean",
1662 "--record_device", self.name,
1663 "--node", client_name,
1666 # In this case, although 0-conf mount works but 0-conf umount
1667 # doesn't work. As a boring result, the user is forced to
1668 # cleanup client service manually again and again. So I prefer
1669 # deleting these two llogs together and let the user write_conf.
1670 lctl.clear_log(self.name, client_name)
1671 lctl.clear_log(self.name, client_name + '-clean')
1674 panic("Record client log %s on %s failed" %(
1675 client_name + '-clean', self.name))
1677 for s in out: log("record> ", string.strip(s))
1678 config.noexec = old_noexec
1680 lctl.cleanup(self.name, self.uuid, 0, 0)
1681 except CommandError, e:
1682 log(self.module_name, "cleanup failed: ", self.name)
1685 Module.cleanup(self)
1686 clean_loop(self.devpath)
# msd_remaining: true while any 'mds' device is still listed by lctl.
# (Name looks like a typo for mds_remaining — kept as-is; callers use it.)
1688 def msd_remaining(self):
1689 out = lctl.device_list()
1691 if string.split(s)[2] in ('mds',):
1694 def safe_to_clean(self):
1697 def safe_to_clean_modules(self):
1698 return not self.msd_remaining()
# cleanup: tear down this MDS device, and the shared MDT device once no
# other mds devices remain; finally release any loopback backing device.
1702 debug(self.uuid, "not active")
1705 if is_prepared(self.name):
1707 lctl.cleanup(self.name, self.uuid, config.force,
1709 except CommandError, e:
1710 log(self.module_name, "cleanup failed: ", self.name)
1713 Module.cleanup(self)
1714 if not self.msd_remaining() and is_prepared('MDT'):
1716 lctl.cleanup("MDT", "MDT_UUID", config.force,
1718 except CommandError, e:
1719 print "cleanup failed: ", self.name
1722 clean_loop(self.devpath)
# OSD (object storage device) methods: mirror of MDSDEV for OSTs — read
# device parameters, set up the obdfilter/obdecho device and the shared
# OSS device, and tear both down on cleanup. Class header line is elided
# in this listing; code kept byte-identical.
1725 def __init__(self, db):
1726 Module.__init__(self, 'OSD', db)
1727 self.osdtype = self.db.get_val('osdtype')
1728 self.devpath = self.db.get_val('devpath', '')
1729 self.size = self.db.get_val_int('devsize', 0)
1730 self.journal_size = self.db.get_val_int('journalsize', 0)
1731 self.inode_size = self.db.get_val_int('inodesize', 0)
1732 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1733 self.mountfsoptions = self.db.get_val('mountfsoptions', '')
1734 self.fstype = self.db.get_val('fstype', '')
1735 self.nspath = self.db.get_val('nspath', '')
# As in MDSDEV.__init__: replace the device's name/uuid with the target's.
1736 target_uuid = self.db.get_first_ref('target')
1737 ost = self.db.lookup(target_uuid)
1738 self.name = ost.getName()
1739 self.format = self.db.get_val('autoformat', 'yes')
1740 if ost.get_val('failover', 0):
1741 self.failover_ost = 'f'
1743 self.failover_ost = 'n'
1745 active_uuid = get_active_target(ost)
1747 panic("No target device found:", target_uuid)
1748 if active_uuid == self.uuid:
1752 if self.active and config.group and config.group != ost.get_val('group'):
1755 self.target_dev_uuid = self.uuid
1756 self.uuid = target_uuid
1758 self.add_lustre_module('ost', 'ost')
1759 # FIXME: should we default to ext3 here?
1760 if self.fstype == 'ldiskfs':
1761 self.add_lustre_module('ldiskfs', 'ldiskfs')
1763 self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
1764 self.add_lustre_module(self.osdtype, self.osdtype)
1766 def load_module(self):
1768 Module.load_module(self)
# prepare: attach backing storage (skipped for obdecho, which needs no
# disk), layer the mount options, create the OSD and the shared OSS dev.
1770 # need to check /proc/mounts and /etc/mtab before
1771 # formatting anything.
1772 # FIXME: check if device is already formatted.
1774 if is_prepared(self.name):
1777 debug(self.uuid, "not active")
1779 self.info(self.osdtype, self.devpath, self.size, self.fstype,
1780 self.format, self.journal_size, self.inode_size)
1782 if self.osdtype == 'obdecho':
1785 blkdev = block_dev(self.devpath, self.size, self.fstype,
1786 config.reformat, self.format, self.journal_size,
1787 self.inode_size, self.mkfsoptions)
1789 mountfsoptions = def_mount_options(self.fstype, 'ost')
1791 if config.mountfsoptions:
1793 mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
1795 mountfsoptions = config.mountfsoptions
1796 if self.mountfsoptions:
1797 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1799 if self.mountfsoptions:
1801 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1803 mountfsoptions = self.mountfsoptions
1805 print 'OST mount options: ' + mountfsoptions
1807 lctl.newdev(self.osdtype, self.name, self.uuid,
1808 setup ="%s %s %s %s" %(blkdev, self.fstype,
1809 self.failover_ost, mountfsoptions))
1810 if not is_prepared('OSS'):
1811 lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
# osd_remaining: true while any obdfilter/obdecho device is still listed.
1813 def osd_remaining(self):
1814 out = lctl.device_list()
1816 if string.split(s)[2] in ('obdfilter', 'obdecho'):
1819 def safe_to_clean(self):
1822 def safe_to_clean_modules(self):
1823 return not self.osd_remaining()
# cleanup: tear down this OSD, then the shared OSS once no OSDs remain;
# release the loop device unless this was a diskless obdecho.
1827 debug(self.uuid, "not active")
1829 if is_prepared(self.name):
1832 lctl.cleanup(self.name, self.uuid, config.force,
1834 except CommandError, e:
1835 log(self.module_name, "cleanup failed: ", self.name)
1838 if not self.osd_remaining() and is_prepared('OSS'):
1840 lctl.cleanup("OSS", "OSS_UUID", config.force,
1842 except CommandError, e:
1843 print "cleanup failed: ", self.name
1846 if not self.osdtype == 'obdecho':
1847 clean_loop(self.devpath)
# Return the management-service UUID for the filesystem behind the named
# mountpoint (looked up via the global toplustreDB), or its absence as
# the config provides — callers treat a false value as "no mgmt service".
1849 def mgmt_uuid_for_fs(mtpt_name):
1852 mtpt_db = toplustreDB.lookup_name(mtpt_name)
1853 fs_uuid = mtpt_db.get_first_ref('filesystem')
1854 fs = toplustreDB.lookup(fs_uuid)
1857 return fs.get_first_ref('mgmt')
1859 # Generic client module, used by OSC and MDC
# Resolves the active target device, derives a unique client device name
# (module_hostname_target_fsname unless self_name overrides it), and on
# prepare() connects locally or via a portals route before lctl newdev.
# NOTE(review): listing has elided lines; code kept byte-identical.
1860 class Client(Module):
1861 def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
1863 self.target_name = tgtdb.getName()
1864 self.target_uuid = tgtdb.getUUID()
1867 self.tgt_dev_uuid = get_active_target(tgtdb)
1868 if not self.tgt_dev_uuid:
1869 panic("No target device found for target:", self.target_name)
1871 self.kmod = kmod(config.lustre, config.portals)
1875 self.module = module
1876 self.module_name = string.upper(module)
1878 self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
1879 self.target_name, fs_name)
1881 self.name = self_name
1883 self.lookup_server(self.tgt_dev_uuid)
1884 mgmt_uuid = mgmt_uuid_for_fs(fs_name)
1886 self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
1889 self.fs_name = fs_name
1892 self.add_lustre_module(module_dir, module)
1894 def lookup_server(self, srv_uuid):
1895 """ Lookup a server's network information """
1896 self._server_nets = get_ost_net(self.db, srv_uuid)
1897 if len(self._server_nets) == 0:
1898 panic ("Unable to find a server for:", srv_uuid)
1900 def get_servers(self):
1901 return self._server_nets
1903 def prepare(self, ignore_connect_failure = 0):
1904 self.info(self.target_uuid)
1905 if is_prepared(self.name):
# Prefer a server on a local cluster; otherwise add portals routes.
1908 srv = choose_local_server(self.get_servers())
1912 routes = find_route(self.get_servers())
1913 if len(routes) == 0:
1914 panic ("no route to", self.target_uuid)
1915 for (srv, r) in routes:
1916 lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
1917 except CommandError, e:
1918 if not ignore_connect_failure:
# --inactive targets (when the subclass permits it) are configured but
# marked inactive instead of connected.
1921 if self.target_uuid in config.inactive and self.permits_inactive():
1922 debug("%s inactive" % self.target_uuid)
1923 inactive_p = "inactive"
1925 debug("%s active" % self.target_uuid)
1927 lctl.newdev(self.module, self.name, self.uuid,
1928 setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
1929 inactive_p, self.mgmt_name))
# cleanup: disconnect from the local server or remove the routes added
# in prepare(); failures are logged, not fatal.
1932 if is_prepared(self.name):
1933 Module.cleanup(self)
1935 srv = choose_local_server(self.get_servers())
1937 lctl.disconnect(srv)
1939 for (srv, r) in find_route(self.get_servers()):
1940 lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
1941 except CommandError, e:
1942 log(self.module_name, "cleanup failed: ", self.name)
# Thin Client subclasses. MDC talks to an MDS, OSC to an OST; each differs
# only in the module name passed to Client.__init__ and in whether the
# target may be marked inactive (permits_inactive bodies elided here).
1948 def __init__(self, db, uuid, fs_name):
1949 Client.__init__(self, db, uuid, 'mdc', fs_name)
1951 def permits_inactive(self):
1955 def __init__(self, db, uuid, fs_name):
1956 Client.__init__(self, db, uuid, 'osc', fs_name)
1958 def permits_inactive(self):
# Deterministic device name for the management client of a given uuid.
1961 def mgmtcli_name_for_uuid(uuid):
1962 return 'MGMTCLI_%s' % uuid
# Management-service client: fixed self_name, modules under 'mgmt'.
1964 class ManagementClient(Client):
1965 def __init__(self, db, uuid):
1966 Client.__init__(self, db, uuid, 'mgmt_cli', '',
1967 self_name = mgmtcli_name_for_uuid(db.getUUID()),
1968 module_dir = 'mgmt')
# COBD (caching OBD): pairs a real OBD with a cache OBD; prepare creates
# the cobd device with both uuids. Class header elided in this listing.
1971 def __init__(self, db):
1972 Module.__init__(self, 'COBD', db)
1973 self.real_uuid = self.db.get_first_ref('realobd')
1974 self.cache_uuid = self.db.get_first_ref('cacheobd')
1975 self.add_lustre_module('cobd' , 'cobd')
1977 # need to check /proc/mounts and /etc/mtab before
1978 # formatting anything.
1979 # FIXME: check if device is already formatted.
1981 if is_prepared(self.name):
1983 self.info(self.real_uuid, self.cache_uuid)
1984 lctl.newdev("cobd", self.name, self.uuid,
1985 setup ="%s %s" %(self.real_uuid, self.cache_uuid))
1988 # virtual interface for OSC and LOV
# VOSC delegates everything to either a LOV (when the db object is class
# 'lov') or a plain OSC, so callers need not care which they talk to.
1990 def __init__(self, db, uuid, fs_name, name_override = None):
1991 Module.__init__(self, 'VOSC', db)
1992 if db.get_class() == 'lov':
1993 self.osc = LOV(db, uuid, fs_name, name_override)
1995 self.osc = get_osc(db, uuid, fs_name)
1997 return self.osc.uuid
1999 return self.osc.name
2004 def load_module(self):
2005 self.osc.load_module()
2006 def cleanup_module(self):
2007 self.osc.cleanup_module()
# ECHO_CLIENT: test client layered on a VOSC; prepare sets up the
# underlying OSC/LOV first, then creates the echo_client device on it.
2010 class ECHO_CLIENT(Module):
2011 def __init__(self,db):
2012 Module.__init__(self, 'ECHO_CLIENT', db)
2013 self.add_lustre_module('obdecho', 'obdecho')
2014 self.obd_uuid = self.db.get_first_ref('obd')
2015 obd = self.db.lookup(self.obd_uuid)
2016 self.uuid = generate_client_uuid(self.name)
2017 self.osc = VOSC(obd, self.uuid, self.name)
2020 if is_prepared(self.name):
2023 self.osc.prepare() # XXX This is so cheating. -p
2024 self.info(self.obd_uuid)
2026 lctl.newdev("echo_client", self.name, self.uuid,
2027 setup = self.osc.get_name())
2030 if is_prepared(self.name):
2031 Module.cleanup(self)
2034 def load_module(self):
2035 self.osc.load_module()
2036 Module.load_module(self)
# Modules are unloaded in reverse of load order: own first, then OSC's.
2038 def cleanup_module(self):
2039 Module.cleanup_module(self)
2040 self.osc.cleanup_module()
# Build a pseudo-random client UUID embedding (up to 19 chars of) the
# given name between random hex fields, truncated to the 36-char UUID
# length. Uses random.random(), so NOT cryptographically unique — fine
# for device naming only. (One format argument line is elided here.)
2043 def generate_client_uuid(name):
2044 client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
2046 int(random.random() * 1048576),
2047 int(random.random() * 1048576))
2048 return client_uuid[:36]
2051 def my_rstrip(s, chars):
2052 """my_rstrip(s, chars) -> strips any instances of the characters
2053 found in chars from the right side of string s"""
2054 # XXX required because python versions pre 2.2.3 don't allow
2055 #string.rstrip() to take alternate char lists
# Fast path: two-argument string.rstrip on 2.2.3+; on older Pythons it
# raises TypeError and the manual scan-from-the-right below is used.
# (The manual-loop body is elided in this listing.)
2059 ns = string.rstrip(s, '/')
2060 except TypeError, e:
2061 for i in range(len(s) - 1, 0, -1):
# Mountpoint: a client mount of the filesystem. Wires up VOSC + MDC
# (and optionally a ManagementClient), then shells out to mount/umount.
# NOTE(review): listing has elided lines; code kept byte-identical.
2070 class Mountpoint(Module):
2071 def __init__(self,db):
2072 Module.__init__(self, 'MTPT', db)
2073 self.path = my_rstrip(self.db.get_val('path'), '/')
2074 self.clientoptions = self.db.get_val('clientoptions', '')
2075 self.fs_uuid = self.db.get_first_ref('filesystem')
2076 fs = self.db.lookup(self.fs_uuid)
2077 self.mds_uuid = fs.get_first_ref('mds')
2078 self.obd_uuid = fs.get_first_ref('obd')
2079 self.mgmt_uuid = fs.get_first_ref('mgmt')
2080 obd = self.db.lookup(self.obd_uuid)
2081 client_uuid = generate_client_uuid(self.name)
2082 self.vosc = VOSC(obd, client_uuid, self.name)
2083 self.mdc = get_mdc(db, client_uuid, self.name, self.mds_uuid)
2085 self.add_lustre_module('mdc', 'mdc')
2086 self.add_lustre_module('llite', 'llite')
2088 self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
2094 if fs_is_mounted(self.path):
2095 log(self.path, "already mounted.")
2099 self.mgmtcli.prepare()
2102 mdc_name = self.mdc.name
2104 self.info(self.path, self.mds_uuid, self.obd_uuid)
# In --record/--lctl_dump mode, only emit the mount_option ioctl; the
# actual mount(8) command below is for live runs.
2105 if config.record or config.lctl_dump:
2106 lctl.mount_option(local_node_name, self.vosc.get_name(), mdc_name)
2109 if config.clientoptions:
2110 if self.clientoptions:
2111 self.clientoptions = self.clientoptions + ',' + config.clientoptions
2113 self.clientoptions = config.clientoptions
2114 if self.clientoptions:
2115 self.clientoptions = ',' + self.clientoptions
2116 # Linux kernel will deal with async and not pass it to ll_fill_super,
2117 # so replace it with Lustre async
2118 self.clientoptions = string.replace(self.clientoptions, "async", "lasync")
2120 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s%s %s %s" % \
2121 (self.vosc.get_name(), mdc_name, self.clientoptions, config.config, self.path)
2122 run("mkdir", self.path)
2127 panic("mount failed:", self.path, ":", string.join(val))
# cleanup: umount (forced first under --force, judging by the -f call),
# then verify the fs is really gone before tearing down the clients.
2130 self.info(self.path, self.mds_uuid,self.obd_uuid)
2132 if config.record or config.lctl_dump:
2133 lctl.del_mount_option(local_node_name)
2135 if fs_is_mounted(self.path):
2137 (rc, out) = run("umount", "-f", self.path)
2139 (rc, out) = run("umount", self.path)
2141 raise CommandError('umount', out, rc)
2143 if fs_is_mounted(self.path):
2144 panic("fs is still mounted:", self.path)
2149 self.mgmtcli.cleanup()
2151 def load_module(self):
2153 self.mgmtcli.load_module()
2154 self.vosc.load_module()
2155 Module.load_module(self)
2157 def cleanup_module(self):
2158 Module.cleanup_module(self)
2159 self.vosc.cleanup_module()
2161 self.mgmtcli.cleanup_module()
2164 # ============================================================
2165 # misc query functions
# Resolve an OSD uuid to the list of Network objects of its hosting node
# (one per configured network interface). Bound as a db method elsewhere;
# panics if the node reference cannot be resolved.
2167 def get_ost_net(self, osd_uuid):
2171 osd = self.lookup(osd_uuid)
2172 node_uuid = osd.get_first_ref('node')
2173 node = self.lookup(node_uuid)
2175 panic("unable to find node for osd_uuid:", osd_uuid,
2176 " node_ref:", node_uuid)
2177 for net_uuid in node.get_networks():
2178 db = node.lookup(net_uuid)
2179 srv_list.append(Network(db))
2183 # the order of initialization is based on level.
# Map a service class to its startup level (network first, mountpoints
# last); services outside [--minlevel, --maxlevel] are filtered out.
# The numeric level constants are elided in this listing.
2184 def getServiceLevel(self):
2185 type = self.get_class()
2187 if type in ('network',):
2189 elif type in ('routetbl',):
2191 elif type in ('ldlm',):
2193 elif type in ('mgmt',):
2195 elif type in ('osd', 'cobd'):
2197 elif type in ('mdsdev',):
2199 elif type in ('mountpoint', 'echoclient'):
2202 panic("Unknown type: ", type)
2204 if ret < config.minlevel or ret > config.maxlevel:
2209 # return list of services in a profile. list is a list of tuples
2210 # [(level, db_object),]
2211 def getServices(self):
2213 for ref_class, ref_uuid in self.get_all_refs():
2214 servdb = self.lookup(ref_uuid)
2216 level = getServiceLevel(servdb)
2218 list.append((level, servdb))
2220 panic('service not found: ' + ref_uuid)
2226 ############################################################
2228 # FIXME: clean this mess up!
2230 # OSC is no longer in the xml, so we have to fake it.
2231 # this is getting ugly and begging for another refactoring
# Factory helpers: OSC is no longer described in the XML config, so it is
# synthesized here from the OST entry; get_mdc likewise builds an MDC
# after resolving (and validating) the MDS uuid.
2232 def get_osc(ost_db, uuid, fs_name):
2233 osc = OSC(ost_db, uuid, fs_name)
2236 def get_mdc(db, uuid, fs_name, mds_uuid):
2237 mds_db = db.lookup(mds_uuid);
2239 panic("no mds:", mds_uuid)
2240 mdc = MDC(mds_db, uuid, fs_name)
2243 ############################################################
2244 # routing ("rooting")
2246 # list of (nettype, cluster_id, nid)
# Record this node's networks as (net_type, cluster_id, nid) tuples in the
# global local_clusters, and register an AcceptorHandler per listening
# port (duplicate ports are a config error).
2249 def find_local_clusters(node_db):
2250 global local_clusters
2251 for netuuid in node_db.get_networks():
2252 net = node_db.lookup(netuuid)
2254 debug("add_local", netuuid)
2255 local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
2257 if acceptors.has_key(srv.port):
2258 panic("duplicate port:", srv.port)
2259 acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type)
2261 # This node is a gateway.
2263 def node_is_router():
2266 # If there are any routers found in the config, then this will be true
2267 # and all nodes will load kptlrouter.
2269 def node_needs_router():
2270 return needs_router or is_router
2272 # list of (nettype, gw, tgt_cluster_id, lo, hi)
2273 # Currently, these local routes are only added to kptlrouter route
2274 # table if they are needed to connect to a specific server. This
2275 # should be changed so all available routes are loaded, and the
2276 # ptlrouter can make all the decisions.
# Scan every 'node' in the config for routers reachable from one of this
# node's local clusters (same nettype + clusterid), and collect their
# routes into the global local_routes list.
2279 def find_local_routes(lustre):
2280 """ Scan the lustre config looking for routers . Build list of
2282 global local_routes, needs_router
2284 list = lustre.lookup_class('node')
2286 if router.get_val_int('router', 0):
2288 for (local_type, local_cluster_id, local_nid) in local_clusters:
2290 for netuuid in router.get_networks():
2291 db = router.lookup(netuuid)
2292 if (local_type == db.get_val('nettype') and
2293 local_cluster_id == db.get_val('clusterid')):
2294 gw = db.get_val('nid')
2297 debug("find_local_routes: gw is", gw)
2298 for route in router.get_local_routes(local_type, gw):
2299 local_routes.append(route)
2300 debug("find_local_routes:", local_routes)
# Routing/topology query helpers operating on the global local_clusters /
# local_routes built above. Route tuples are (nettype, gw, tgt_cluster_id,
# lo, hi); server matching checks nid ranges and cluster ids.
2303 def choose_local_server(srv_list):
2304 for srv in srv_list:
2305 if local_cluster(srv.net_type, srv.cluster_id):
2308 def local_cluster(net_type, cluster_id):
2309 for cluster in local_clusters:
2310 if net_type == cluster[0] and cluster_id == cluster[1]:
2314 def local_interface(net_type, cluster_id, nid):
2315 for cluster in local_clusters:
2316 if (net_type == cluster[0] and cluster_id == cluster[1]
2317 and nid == cluster[2]):
# find_route: for each server, collect routes whose [lo, hi] nid range
# contains the target and whose cluster id matches.
2321 def find_route(srv_list):
2323 frm_type = local_clusters[0][0]
2324 for srv in srv_list:
2325 debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
2326 to_type = srv.net_type
2328 cluster_id = srv.cluster_id
2329 debug ('looking for route to', to_type, to)
2330 for r in local_routes:
2331 debug("find_route: ", r)
2332 if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
2333 result.append((srv, r))
# get_active_target: honor a --select override for this target first,
# otherwise fall back to the config's 'active' reference.
2336 def get_active_target(db):
2337 target_uuid = db.getUUID()
2338 target_name = db.getName()
2339 node_name = get_select(target_name)
2341 tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
2343 tgt_dev_uuid = db.get_first_ref('active')
2346 def get_server_by_nid_uuid(db, nid_uuid):
2347 for n in db.lookup_class("network"):
2349 if net.nid_uuid == nid_uuid:
2353 ############################################################
# Service factory: dispatch on the db object's class string to construct
# the matching Module subclass (def line and several branch bodies are
# elided in this listing; code kept byte-identical).
2357 type = db.get_class()
2358 debug('Service:', type, db.getName(), db.getUUID())
# LOV built here is never prepared as a device, hence the sentinel uuid.
2363 n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
2364 elif type == 'network':
2366 elif type == 'routetbl':
2370 elif type == 'cobd':
2372 elif type == 'mdsdev':
2374 elif type == 'mountpoint':
2376 elif type == 'echoclient':
2378 elif type == 'mgmt':
2381 panic ("unknown service type:", type)
2385 # Prepare the system to run lustre using a particular profile
2386 # in a the configuration.
2387 # * load & the modules
2388 # * setup networking for the current node
2389 # * make sure partitions are in place and prepared
2390 # * initialize devices with lctl
2391 # Levels is important, and needs to be enforced.
# Profile iteration and the per-service operations passed to it. Each
# do* callback receives the (level, db) service list from getServices and
# instantiates the Module via newService before acting on it.
2392 def for_each_profile(db, prof_list, operation):
2393 for prof_uuid in prof_list:
2394 prof_db = db.lookup(prof_uuid)
2396 panic("profile:", prof_uuid, "not found.")
2397 services = getServices(prof_db)
# doWriteconf only touches mdsdev services (config logs live on the MDS).
2400 def doWriteconf(services):
2404 if s[1].get_class() == 'mdsdev':
2405 n = newService(s[1])
2408 def doSetup(services):
2412 n = newService(s[1])
2415 def doModules(services):
2419 n = newService(s[1])
# Cleanup paths consult safe_to_clean()/safe_to_clean_modules() so shared
# devices (MDT/OSS) are not torn down while still in use.
2422 def doCleanup(services):
2427 n = newService(s[1])
2428 if n.safe_to_clean():
2431 def doUnloadModules(services):
2436 n = newService(s[1])
2437 if n.safe_to_clean_modules():
# Top-level per-host driver: find this host's node entry, read its tuning
# values, then run the requested mode (--write_conf, --recover, --cleanup,
# or normal two-phase setup: load modules, then configure with lctl).
# NOTE(review): listing has elided lines; code kept byte-identical.
2442 def doHost(lustreDB, hosts):
2443 global is_router, local_node_name
2446 node_db = lustreDB.lookup_name(h, 'node')
2450 panic('No host entry found.')
2452 local_node_name = node_db.get_val('name', 0)
2453 is_router = node_db.get_val_int('router', 0)
2454 lustre_upcall = node_db.get_val('lustreUpcall', '')
2455 portals_upcall = node_db.get_val('portalsUpcall', '')
2456 timeout = node_db.get_val_int('timeout', 0)
2457 ptldebug = node_db.get_val('ptldebug', '')
2458 subsystem = node_db.get_val('subsystem', '')
2460 find_local_clusters(node_db)
2462 find_local_routes(lustreDB)
2464 # Two step process: (1) load modules, (2) setup lustre
2465 # if not cleaning, load modules first.
2466 prof_list = node_db.get_refs('profile')
2468 if config.write_conf:
2469 for_each_profile(node_db, prof_list, doModules)
2471 for_each_profile(node_db, prof_list, doWriteconf)
2472 for_each_profile(node_db, prof_list, doUnloadModules)
2475 elif config.recover:
2476 if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
2477 raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
2478 "--client_uuid <UUID> --conn_uuid <UUID>")
2479 doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
2481 elif config.cleanup:
2483 # the command line can override this value
2485 # ugly hack, only need to run lctl commands for --dump
2486 if config.lctl_dump or config.record:
2487 for_each_profile(node_db, prof_list, doCleanup)
2490 sys_set_timeout(timeout)
2491 sys_set_ptldebug(ptldebug)
2492 sys_set_subsystem(subsystem)
2493 sys_set_lustre_upcall(lustre_upcall)
2494 sys_set_portals_upcall(portals_upcall)
2496 for_each_profile(node_db, prof_list, doCleanup)
2497 for_each_profile(node_db, prof_list, doUnloadModules)
2501 # ugly hack, only need to run lctl commands for --dump
2502 if config.lctl_dump or config.record:
2503 sys_set_timeout(timeout)
2504 sys_set_lustre_upcall(lustre_upcall)
2505 for_each_profile(node_db, prof_list, doSetup)
# Raise the kernel socket buffer ceilings before loading modules.
2509 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
2510 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
2512 for_each_profile(node_db, prof_list, doModules)
2514 sys_set_debug_path()
2515 sys_set_ptldebug(ptldebug)
2516 sys_set_subsystem(subsystem)
2517 script = config.gdb_script
2518 run(lctl.lctl, ' modules >', script)
2520 log ("The GDB module script is in", script)
2521 # pause, so user has time to break and
2524 sys_set_timeout(timeout)
2525 sys_set_lustre_upcall(lustre_upcall)
2526 sys_set_portals_upcall(portals_upcall)
2528 for_each_profile(node_db, prof_list, doSetup)
# Failover recovery: look up the target's current active device, pick a
# reachable server for it, disconnect the old connection (best-effort),
# and tell lctl to recover the client onto the new nid.
2531 def doRecovery(lustreDB, lctl, tgt_uuid, client_uuid, nid_uuid):
2532 tgt = lustreDB.lookup(tgt_uuid)
2534 raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
2535 new_uuid = get_active_target(tgt)
2537 raise Lustre.LconfError("doRecovery: no active target found for: " +
2539 net = choose_local_server(get_ost_net(lustreDB, new_uuid))
2541 raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
2543 log("Reconnecting", tgt_uuid, " to ", net.nid_uuid);
2545 oldnet = get_server_by_nid_uuid(lustreDB, nid_uuid)
# Disconnect/connect failures are logged, not fatal — recovery proceeds.
2548 lctl.disconnect(oldnet)
2549 except CommandError, e:
2550 log("recover: disconnect", nid_uuid, "failed: ")
2555 except CommandError, e:
2556 log("recover: connect failed")
2559 lctl.recover(client_uuid, net.nid_uuid)
# Derive config.lustre / config.portals module search paths. In
# development mode they default to the source tree next to the lconf
# binary; otherwise an explicit --portals is normalized against --lustre.
2562 def setupModulePath(cmd, portals_dir = PORTALS_DIR):
2563 base = os.path.dirname(cmd)
2564 if development_mode():
2565 if not config.lustre:
2566 debug('using objdir module paths')
2567 config.lustre = (os.path.join(base, ".."))
2568 # normalize the portals dir, using command line arg if set
2570 portals_dir = config.portals
2571 dir = os.path.join(config.lustre, portals_dir)
2572 config.portals = dir
2573 debug('config.portals', config.portals)
2574 elif config.lustre and config.portals:
2576 # if --lustre and --portals, normalize portals
2577 # can ignore PORTALS_DIR here, since it is probably useless here
2578 config.portals = os.path.join(config.lustre, config.portals)
2579 debug('config.portals B', config.portals)
# /proc/sys writers and the upcall/timeout setters layered on them. In
# each setter the command-line value, when given, overrides the node
# config value passed in.
2581 def sysctl(path, val):
2582 debug("+ sysctl", path, val)
2586 fp = open(os.path.join('/proc/sys', path), 'w')
2593 def sys_set_debug_path():
2594 sysctl('portals/debug_path', config.debug_path)
2596 def sys_set_lustre_upcall(upcall):
2597 # the command overrides the value in the node config
2598 if config.lustre_upcall:
2599 upcall = config.lustre_upcall
2601 upcall = config.upcall
2603 lctl.set_lustre_upcall(upcall)
2605 def sys_set_portals_upcall(upcall):
2606 # the command overrides the value in the node config
2607 if config.portals_upcall:
2608 upcall = config.portals_upcall
2610 upcall = config.upcall
2612 sysctl('portals/upcall', upcall)
2614 def sys_set_timeout(timeout):
2615 # the command overrides the value in the node config
2616 if config.timeout and config.timeout > 0:
2617 timeout = config.timeout
2618 if timeout != None and timeout > 0:
2619 lctl.set_timeout(timeout)
# System tuning helpers: socknal/elan knobs, debug masks (symbolic names
# eval'd against the ptldebug_names/subsystem_names tables from the file
# header), socket-buffer ceilings, and the portals/obd device nodes.
2621 def sys_tweak_socknal ():
2622 if config.single_socket:
2623 sysctl("socknal/typed", 0)
2625 def sys_optimize_elan ():
2626 procfiles = ["/proc/elan/config/eventint_punt_loops",
2627 "/proc/qsnet/elan3/config/eventint_punt_loops",
2628 "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]
2630 if os.access(p, os.W_OK):
2631 run ("echo 1 > " + p)
2633 def sys_set_ptldebug(ptldebug):
2635 ptldebug = config.ptldebug
# eval() of an operator-supplied expression against the names table —
# trusted input only (comes from the config / command line).
2638 val = eval(ptldebug, ptldebug_names)
2639 val = "0x%x" % (val)
2640 sysctl('portals/debug', val)
2641 except NameError, e:
2644 def sys_set_subsystem(subsystem):
2645 if config.subsystem:
2646 subsystem = config.subsystem
2649 val = eval(subsystem, subsystem_names)
2650 val = "0x%x" % (val)
2651 sysctl('portals/subsystem_debug', val)
2652 except NameError, e:
# Raise a net.core buffer limit to at least `max` (only-increase check
# is elided in this listing).
2655 def sys_set_netmem_max(path, max):
2656 debug("setting", path, "to at least", max)
2664 fp = open(path, 'w')
2665 fp.write('%d\n' %(max))
2669 def sys_make_devices():
2670 if not os.access('/dev/portals', os.R_OK):
2671 run('mknod /dev/portals c 10 240')
2672 if not os.access('/dev/obd', os.R_OK):
2673 run('mknod /dev/obd c 10 241')
2676 # Add dir to the global PATH, if not already there.
2677 def add_to_path(new_dir):
2678 syspath = string.split(os.environ['PATH'], ':')
2679 if new_dir in syspath:
2681 os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
# Default log/script locations; prefer the ramdisk mirror under /r when
# that tree exists.
2683 def default_debug_path():
2684 path = '/tmp/lustre-log'
2685 if os.path.isdir('/r'):
2690 def default_gdb_script():
2691 script = '/tmp/ogdb'
2692 if os.path.isdir('/r'):
2693 return '/r' + script
2698 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
2699 # ensure basic elements are in the system path
2700 def sanitise_path():
2701 for dir in DEFAULT_PATH:
2704 # global hack for the --select handling
# init_select parses --select args of the form service=node[,service=node]
# into the global tgt_select map consulted by get_select/get_active_target.
2706 def init_select(args):
2707 # args = [service=nodeA,service2=nodeB service3=nodeC]
2710 list = string.split(arg, ',')
2712 srv, node = string.split(entry, '=')
2713 tgt_select[srv] = node
2715 def get_select(srv):
2716 if tgt_select.has_key(srv):
2717 return tgt_select[srv]
# Short aliases for the option-type constants used by the lconf_options
# table that follows.
2721 FLAG = Lustre.Options.FLAG
2722 PARAM = Lustre.Options.PARAM
2723 INTPARAM = Lustre.Options.INTPARAM
2724 PARAMLIST = Lustre.Options.PARAMLIST
2726 ('verbose,v', "Print system commands as they are run"),
2727 ('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM),
2728 ('config', "Cluster config name used for LDAP query", PARAM),
2729 ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
2730 ('node', "Load config for <nodename>", PARAM),
2731 ('cleanup,d', "Cleans up config. (Shutdown)"),
2732 ('force,f', "Forced unmounting and/or obd detach during cleanup",
2734 ('single_socket', "socknal option: only use one socket instead of bundle",
2736 ('failover',"""Used to shut down without saving state.
2737 This will allow this node to "give up" a service to a
2738 another node for failover purposes. This will not
2739 be a clean shutdown.""",
2741 ('gdb', """Prints message after creating gdb module script
2742 and sleeps for 5 seconds."""),
2743 ('noexec,n', """Prints the commands and steps that will be run for a
2744 config without executing them. This can used to check if a
2745 config file is doing what it should be doing"""),
2746 ('nomod', "Skip load/unload module step."),
2747 ('nosetup', "Skip device setup/cleanup step."),
2748 ('reformat', "Reformat all devices (without question)"),
2749 ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
2750 ('mountfsoptions', "Additional options for mount fs command line", PARAM),
2751 ('clientoptions', "Additional options for Lustre", PARAM),
2752 ('dump', "Dump the kernel debug log to file before portals is unloaded",
2754 ('write_conf', "Save all the client config information on mds."),
2755 ('record', "Write config information on mds."),
2756 ('record_log', "Name of config record log.", PARAM),
2757 ('record_device', "MDS device name that will record the config commands",
2759 ('minlevel', "Minimum level of services to configure/cleanup",
2761 ('maxlevel', """Maximum level of services to configure/cleanup
2762 Levels are aproximatly like:
2767 70 - mountpoint, echo_client, osc, mdc, lov""",
2769 ('lustre', """Base directory of lustre sources. This parameter will
2770 cause lconf to load modules from a source tree.""", PARAM),
2771 ('portals', """Portals source directory. If this is a relative path,
2772 then it is assumed to be relative to lustre. """, PARAM),
2773 ('timeout', "Set recovery timeout", INTPARAM),
2774 ('upcall', "Set both portals and lustre upcall script", PARAM),
2775 ('lustre_upcall', "Set lustre upcall script", PARAM),
2776 ('portals_upcall', "Set portals upcall script", PARAM),
2777 ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
2778 ('ptldebug', "Set the portals debug level", PARAM),
2779 ('subsystem', "Set the portals debug subsystem", PARAM),
2780 ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
2781 ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
2782 # Client recovery options
2783 ('recover', "Recover a device"),
2784 ('group', "The group of devices to configure or cleanup", PARAM),
2785 ('tgt_uuid', "The failed target (required for recovery)", PARAM),
2786 ('client_uuid', "The failed client (required for recovery)", PARAM),
2787 ('conn_uuid', "The failed connection (required for recovery)", PARAM),
2789 ('inactive', """The name of an inactive service, to be ignored during
2790 mounting (currently OST-only). Can be repeated.""",
2795 global lctl, config, toplustreDB, CONFIG_FILE
2797 # in the upcall this is set to SIG_IGN
2798 signal.signal(signal.SIGCHLD, signal.SIG_DFL)
2800 cl = Lustre.Options("lconf", "config.xml", lconf_options)
2802 config, args = cl.parse(sys.argv[1:])
2803 except Lustre.OptionError, e:
2807 setupModulePath(sys.argv[0])
2809 host = socket.gethostname()
2811 # the PRNG is normally seeded with time(), which is not so good for starting
2812 # time-synchronized clusters
2813 input = open('/dev/urandom', 'r')
2815 print 'Unable to open /dev/urandom!'
2817 seed = input.read(32)
2823 init_select(config.select)
2826 # allow config to be fetched via HTTP, but only with python2
2827 if sys.version[0] != '1' and args[0].startswith('http://'):
2830 config_file = urllib2.urlopen(args[0])
2831 except (urllib2.URLError, socket.error), err:
2832 if hasattr(err, 'args'):
2834 print "Could not access '%s': %s" %(args[0], err)
2836 elif not os.access(args[0], os.R_OK):
2837 print 'File not found or readable:', args[0]
2841 config_file = open(args[0], 'r')
2843 dom = xml.dom.minidom.parse(config_file)
2845 panic("%s does not appear to be a config file." % (args[0]))
2846 sys.exit(1) # make sure to die here, even in debug mode.
2848 CONFIG_FILE = args[0]
2849 lustreDB = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
2850 if not config.config:
2851 config.config = os.path.basename(args[0])# use full path?
2852 if config.config[-4:] == '.xml':
2853 config.config = config.config[:-4]
2854 elif config.ldapurl:
2855 if not config.config:
2856 panic("--ldapurl requires --config name")
2857 dn = "config=%s,fs=lustre" % (config.config)
2858 lustreDB = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
2859 elif config.ptldebug or config.subsystem:
2860 sys_set_ptldebug(None)
2861 sys_set_subsystem(None)
2864 print 'Missing config file or ldap URL.'
2865 print 'see lconf --help for command summary'
2868 toplustreDB = lustreDB
2870 ver = lustreDB.get_version()
2872 panic("No version found in config data, please recreate.")
2873 if ver != Lustre.CONFIG_VERSION:
2874 panic("Config version", ver, "does not match lconf version",
2875 Lustre.CONFIG_VERSION)
2879 node_list.append(config.node)
2882 node_list.append(host)
2883 node_list.append('localhost')
2885 debug("configuring for host: ", node_list)
2888 config.debug_path = config.debug_path + '-' + host
2889 config.gdb_script = config.gdb_script + '-' + host
2891 lctl = LCTLInterface('lctl')
2893 if config.lctl_dump:
2894 lctl.use_save_file(config.lctl_dump)
2897 if not (config.record_device and config.record_log):
2898 panic("When recording, both --record_log and --record_device must be specified.")
2899 lctl.clear_log(config.record_device, config.record_log)
2900 lctl.record(config.record_device, config.record_log)
2902 doHost(lustreDB, node_list)
2907 if __name__ == "__main__":
2910 except Lustre.LconfError, e:
2912 # traceback.print_exc(file=sys.stdout)
2914 except CommandError, e:
2918 if first_cleanup_error:
2919 sys.exit(first_cleanup_error)