3 # Copyright (C) 2002-2003 Cluster File Systems, Inc.
4 # Authors: Robert Read <rread@clusterfs.com>
5 # Mike Shaver <shaver@clusterfs.com>
6 # This file is part of Lustre, http://www.lustre.org.
8 # Lustre is free software; you can redistribute it and/or
9 # modify it under the terms of version 2 of the GNU General Public
10 # License as published by the Free Software Foundation.
12 # Lustre is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Lustre; if not, write to the Free Software
19 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 # lconf - lustre configuration tool
23 # lconf is the main driver script for starting and stopping
24 # lustre filesystem services.
26 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import sys, getopt, types
29 import string, os, stat, popen2, socket, time, random, fcntl, select
30 import re, exceptions, signal, traceback
31 import xml.dom.minidom
33 if sys.version[0] == '1':
34 from FCNTL import F_GETFL, F_SETFL
36 from fcntl import F_GETFL, F_SETFL
38 PYMOD_DIR = ["/usr/lib/lustre/python", "/usr/lib64/lustre/python"]
40 def development_mode():
41 base = os.path.dirname(sys.argv[0])
42 if os.access(base+"/Makefile", os.R_OK):
46 if development_mode():
47 sys.path.append('../utils')
49 sys.path.extend(PYMOD_DIR)
# Maximum number of devices to search for.
# (the /dev/loop* nodes need to be created beforehand)
MAX_LOOP_DEVICES = 256
# Default location of the portals source tree, relative to this script.
PORTALS_DIR = '../portals'
61 # Needed to call lconf --record
64 # Please keep these in sync with the values in portals/kp30.h
76 "warning" : (1 << 10),
80 "portals" : (1 << 14),
82 "dlmtrace" : (1 << 16),
86 "rpctrace" : (1 << 20),
87 "vfstrace" : (1 << 21),
91 "console" : (1 << 25),
95 "undefined" : (1 << 0),
105 "portals" : (1 << 10),
107 "pinger" : (1 << 12),
108 "filter" : (1 << 13),
113 "ptlrouter" : (1 << 18),
117 "confobd" : (1 << 22),
# Exit status of the first cleanup step that failed (0 means no failure yet).
first_cleanup_error = 0

def cleanup_error(rc):
    """Record a cleanup failure code, keeping only the first one reported."""
    global first_cleanup_error
    if first_cleanup_error == 0:
        first_cleanup_error = rc
129 # ============================================================
130 # debugging and error funcs
def fixme(msg = "this feature"):
    """Abort configuration with a standard "<msg> not implemented yet." error.

    Uses the Python 2 raise syntax; Lustre.LconfError is provided elsewhere
    in this file's imports.
    """
    raise Lustre.LconfError, msg + ' not implemented yet.'
136 msg = string.join(map(str,args))
137 if not config.noexec:
138 raise Lustre.LconfError(msg)
143 msg = string.join(map(str,args))
148 print string.strip(s)
152 msg = string.join(map(str,args))
155 # ack, python's builtin int() does not support '0x123' syntax.
156 # eval can do it, although what a hack!
159 if type(s) is types.IntType:
162 if (s[0:2] == '0x') or (s[0:1] == '0'):
163 return eval(s, {}, {})
166 except SyntaxError, e:
167 raise ValueError("not a number")
169 raise ValueError("not a number")
171 raise ValueError("not a number")
173 # ============================================================
174 # locally defined exceptions
class CommandError (exceptions.Exception):
    """Raised when an external command (lctl, mkfs, losetup, ...) fails.

    cmd_name is the failing command, cmd_err its error output (either a
    single string or a list of output lines), and rc its exit status
    when known.
    """
    def __init__(self, cmd_name, cmd_err, rc=None):
        self.cmd_name = cmd_name   # name/path of the failing command
        self.cmd_err = cmd_err     # error output: string or list of lines
        # NOTE(review): later code reads self.rc; its assignment from rc is
        # not visible in this view of the file — confirm it is stored here.
183 if type(self.cmd_err) == types.StringType:
185 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
187 print "! %s: %s" % (self.cmd_name, self.cmd_err)
188 elif type(self.cmd_err) == types.ListType:
190 print "! %s (error %d):" % (self.cmd_name, self.rc)
192 print "! %s:" % (self.cmd_name)
193 for s in self.cmd_err:
194 print "> %s" %(string.strip(s))
199 # ============================================================
200 # handle daemons, like the acceptor
202 """ Manage starting and stopping a daemon. Assumes daemon manages
203 it's own pid file. """
205 def __init__(self, cmd):
211 log(self.command, "already running.")
213 self.path = find_prog(self.command)
215 panic(self.command, "not found.")
216 ret, out = runcmd(self.path +' '+ self.command_line())
218 # wait for up to 15 seconds checking to see if a competing daemon
219 # starts successfully
221 while (not self.running()) and (loop_count > 0):
222 loop_count = loop_count - 1
225 if not self.running():
226 raise CommandError(self.path, out, ret)
230 pid = self.read_pidfile()
234 log ("killing process", pid)
236 #time.sleep(1) # let daemon die
238 log("unable to kill", self.command, e)
# wait for the daemon to die for up to 15 seconds
# before complaining about it
# BUG(review): `self.read_pidfile == pid` compares the *bound method object*
# to the pid, which is always False, so the wait loop exits immediately and
# the failure report below can never trigger for a surviving daemon.
# It should read `self.read_pidfile() == pid`.
# NOTE(review): lines are elided in this view; a sleep between retries is
# presumably missing here — otherwise this loop would busy-spin.
while self.running() and (self.read_pidfile == pid) and (loop_count > 0):
    loop_count = loop_count - 1
if self.running() and (self.read_pidfile == pid):
    log("unable to kill", self.command, "process", pid)
250 pid = self.read_pidfile()
260 def read_pidfile(self):
262 fp = open(self.pidfile(), 'r')
267 print "WARNING: invalid pid in %s, removed" % self.pidfile()
268 print "WARNING: You may need to stop acceptor by yourself"
269 os.unlink(self.pidfile())
274 def clean_pidfile(self):
275 """ Remove a stale pidfile """
276 log("removing stale pidfile:", self.pidfile())
278 os.unlink(self.pidfile())
280 log(self.pidfile(), e)
282 class AcceptorHandler(DaemonHandler):
283 def __init__(self, port, net_type):
284 DaemonHandler.__init__(self, "acceptor")
286 self.net_type = net_type
290 return "/var/run/%s-%d.pid" % (self.command, self.port)
def command_line(self):
    """Return the acceptor's argument string: "<flags> <port>"."""
    return string.join(map(str,(self.flags, self.port)))
297 # start the acceptors
299 if config.lctl_dump or config.record:
301 for port in acceptors.keys():
302 daemon = acceptors[port]
303 if daemon.net_type == 'tcp' and not daemon.running():
306 def run_one_acceptor(port):
307 if config.lctl_dump or config.record:
309 if acceptors.has_key(port):
310 daemon = acceptors[port]
311 if daemon.net_type == 'tcp' and not daemon.running():
314 panic("run_one_acceptor: No acceptor defined for port:", port)
316 def stop_acceptor(port):
317 if acceptors.has_key(port):
318 daemon = acceptors[port]
319 if daemon.net_type == 'tcp' and daemon.running():
323 # ============================================================
324 # handle lctl interface
327 Manage communication with lctl
330 def __init__(self, cmd):
332 Initialize close by finding the lctl binary.
334 self.lctl = find_prog(cmd)
336 self.record_device = ''
339 debug('! lctl not found')
342 raise CommandError('lctl', "unable to find lctl binary.")
def use_save_file(self, file):
    """Divert subsequent lctl command batches into `file` via a prepended
    'dump' command instead of executing them directly."""
    self.save_file = file
def record(self, dev_name, logname):
    """Start recording subsequent commands into config log `logname` on
    device `dev_name` (consumed when command batches are built)."""
    log("Recording log", logname, "on", dev_name)
    self.record_device = dev_name
    self.record_log = logname
def end_record(self):
    """Stop recording: log the fact and clear the recording device/log."""
    log("End recording log", self.record_log, "on", self.record_device)
    self.record_device = None
    self.record_log = None
def set_nonblock(self, fd):
    """Switch file descriptor `fd` into non-blocking mode via fcntl."""
    flags = fcntl.fcntl(fd, F_GETFL)
    fcntl.fcntl(fd, F_SETFL, flags | os.O_NDELAY)
364 the cmds are written to stdin of lctl
365 lctl doesn't return errors when run in script mode, so
367 should modify command line to accept multiple commands, or
368 create complex command line options
372 cmds = '\n dump ' + self.save_file + '\n' + cmds
373 elif self.record_device:
377 %s""" % (self.record_device, self.record_log, cmds)
379 debug("+", cmd_line, cmds)
380 if config.noexec: return (0, [])
382 child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
383 child.tochild.write(cmds + "\n")
384 child.tochild.close()
386 # From "Python Cookbook" from O'Reilly
387 outfile = child.fromchild
388 outfd = outfile.fileno()
389 self.set_nonblock(outfd)
390 errfile = child.childerr
391 errfd = errfile.fileno()
392 self.set_nonblock(errfd)
394 outdata = errdata = ''
397 ready = select.select([outfd,errfd],[],[]) # Wait for input
398 if outfd in ready[0]:
399 outchunk = outfile.read()
400 if outchunk == '': outeof = 1
401 outdata = outdata + outchunk
402 if errfd in ready[0]:
403 errchunk = errfile.read()
404 if errchunk == '': erreof = 1
405 errdata = errdata + errchunk
406 if outeof and erreof: break
407 # end of "borrowed" code
410 if os.WIFEXITED(ret):
411 rc = os.WEXITSTATUS(ret)
414 if rc or len(errdata):
415 raise CommandError(self.lctl, errdata, rc)
418 def runcmd(self, *args):
420 run lctl using the command line
422 cmd = string.join(map(str,args))
423 debug("+", self.lctl, cmd)
424 rc, out = run(self.lctl, cmd)
426 raise CommandError(self.lctl, out, rc)
430 def clear_log(self, dev, log):
431 """ clear an existing log """
436 quit """ % (dev, log)
439 def network(self, net, nid):
444 quit """ % (net, nid)
448 def add_interface(self, net, ip, netmask = ""):
449 """ add an interface """
453 quit """ % (net, ip, netmask)
456 # delete an interface
457 def del_interface(self, net, ip):
458 """ delete an interface """
465 # create a new connection
466 def add_uuid(self, net_type, uuid, nid):
467 cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type)
470 def add_peer(self, net_type, nid, hostaddr, port):
471 if net_type in ('tcp','openib','ra') and not config.lctl_dump:
476 nid, hostaddr, port )
478 elif net_type in ('iib',) and not config.lctl_dump:
485 elif net_type in ('vib',) and not config.lctl_dump:
493 def connect(self, srv):
494 self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
495 if srv.net_type in ('tcp','openib','iib','vib','ra') and not config.lctl_dump:
497 hostaddr = string.split(srv.hostaddr[0], '/')[0]
498 self.add_peer(srv.net_type, srv.nid, hostaddr, srv.port)
501 def recover(self, dev_name, new_conn):
504 recover %s""" %(dev_name, new_conn)
507 # add a route to a range
508 def add_route(self, net, gw, lo, hi):
516 except CommandError, e:
520 def del_route(self, net, gw, lo, hi):
525 quit """ % (net, gw, lo, hi)
528 # add a route to a host
529 def add_route_host(self, net, uuid, gw, tgt):
530 self.add_uuid(net, uuid, tgt)
538 except CommandError, e:
542 # add a route to a range
543 def del_route_host(self, net, uuid, gw, tgt):
549 quit """ % (net, gw, tgt)
553 def del_peer(self, net_type, nid, hostaddr):
554 if net_type in ('tcp',) and not config.lctl_dump:
558 del_peer %s %s single_share
562 elif net_type in ('openib','iib','vib','ra') and not config.lctl_dump:
566 del_peer %s single_share
571 # disconnect one connection
572 def disconnect(self, srv):
573 self.del_uuid(srv.nid_uuid)
574 if srv.net_type in ('tcp','openib','iib','vib','ra') and not config.lctl_dump:
576 hostaddr = string.split(srv.hostaddr[0], '/')[0]
577 self.del_peer(srv.net_type, srv.nid, hostaddr)
579 def del_uuid(self, uuid):
587 def disconnectAll(self, net):
595 def attach(self, type, name, uuid):
598 quit""" % (type, name, uuid)
601 def setup(self, name, setup = ""):
605 quit""" % (name, setup)
609 # create a new device with lctl
610 def newdev(self, type, name, uuid, setup = ""):
611 self.attach(type, name, uuid);
613 self.setup(name, setup)
614 except CommandError, e:
615 self.cleanup(name, uuid, 0)
620 def cleanup(self, name, uuid, force, failover = 0):
621 if failover: force = 1
627 quit""" % (name, ('', 'force')[force],
628 ('', 'failover')[failover])
632 def lov_setup(self, name, uuid, desc_uuid, mdsuuid, stripe_cnt,
633 stripe_sz, stripe_off,
637 lov_setup %s %d %d %d %s %s
638 quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off,
643 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off,
647 lov_setconfig %s %d %d %d %s %s
648 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
652 def dump(self, dump_file):
655 quit""" % (dump_file)
658 # get list of devices
659 def device_list(self):
660 devices = '/proc/fs/lustre/devices'
662 if os.access(devices, os.R_OK):
664 fp = open(devices, 'r')
672 def lustre_version(self):
673 rc, out = self.runcmd('version')
677 def mount_option(self, profile, osc, mdc):
679 mount_option %s %s %s
680 quit""" % (profile, osc, mdc)
683 # delete mount options
684 def del_mount_option(self, profile):
690 def set_timeout(self, timeout):
697 def set_lustre_upcall(self, upcall):
702 # ============================================================
703 # Various system-level functions
704 # (ideally moved to their own module)
706 # Run a command and return the output and status.
707 # stderr is sent to /dev/null, could use popen3 to
708 # save it if necessary
711 if config.noexec: return (0, [])
712 f = os.popen(cmd + ' 2>&1')
722 cmd = string.join(map(str,args))
725 # Run a command in the background.
726 def run_daemon(*args):
727 cmd = string.join(map(str,args))
729 if config.noexec: return 0
730 f = os.popen(cmd + ' 2>&1')
738 # Determine full path to use for an external command
739 # searches dirname(argv[0]) first, then PATH
741 syspath = string.split(os.environ['PATH'], ':')
742 cmdpath = os.path.dirname(sys.argv[0])
743 syspath.insert(0, cmdpath);
745 syspath.insert(0, os.path.join(config.portals, 'utils/'))
747 prog = os.path.join(d,cmd)
748 if os.access(prog, os.X_OK):
752 # Recursively look for file starting at base dir
753 def do_find_file(base, mod):
754 fullname = os.path.join(base, mod)
755 if os.access(fullname, os.R_OK):
757 for d in os.listdir(base):
758 dir = os.path.join(base,d)
759 if os.path.isdir(dir):
760 module = do_find_file(dir, mod)
764 def find_module(src_dir, dev_dir, modname):
765 modbase = src_dir +'/'+ dev_dir +'/'+ modname
766 for modext in '.ko', '.o':
767 module = modbase + modext
769 if os.access(module, os.R_OK):
775 # is the path a block device?
782 return stat.S_ISBLK(s[stat.ST_MODE])
784 # build fs according to type
786 def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
792 panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
794 # devsize is in 1k, and fs block count is in 4k
795 block_cnt = devsize/4
797 if fstype in ('ext3', 'extN', 'ldiskfs'):
798 # ext3 journal size is in megabytes
801 if not is_block(dev):
802 ret, out = runcmd("ls -l %s" %dev)
803 devsize = int(string.split(out[0])[4]) / 1024
805 # sfdisk works for symlink, hardlink, and realdev
806 ret, out = runcmd("sfdisk -s %s" %dev)
808 devsize = int(out[0])
810 # sfdisk -s will fail for too large block device,
811 # then, read the size of partition from /proc/partitions
813 # get the realpath of the device
814 # it may be the real device, such as /dev/hda7
815 # or the hardlink created via mknod for a device
816 if 'realpath' in dir(os.path):
817 real_dev = os.path.realpath(dev)
821 while os.path.islink(real_dev) and (link_count < 20):
822 link_count = link_count + 1
823 dev_link = os.readlink(real_dev)
824 if os.path.isabs(dev_link):
827 real_dev = os.path.join(os.path.dirname(real_dev), dev_link)
829 panic("Encountered too many symbolic links resolving block device:", dev)
831 # get the major and minor number of the realpath via ls
832 # it seems python(os.stat) does not return
833 # the st_rdev member of the stat structure
834 ret, out = runcmd("ls -l %s" %real_dev)
835 major = string.split(string.split(out[0])[4], ",")[0]
836 minor = string.split(out[0])[5]
838 # get the devsize from /proc/partitions with the major and minor number
839 ret, out = runcmd("cat /proc/partitions")
842 if string.split(line)[0] == major and string.split(line)[1] == minor:
843 devsize = int(string.split(line)[2])
846 if devsize > 1024 * 1024:
847 jsize = ((devsize / 102400) * 4)
850 if jsize: jopt = "-J size=%d" %(jsize,)
851 if isize: iopt = "-I %d" %(isize,)
852 mkfs = 'mkfs.ext2 -j -b 4096 '
853 if not isblock or config.force:
855 elif fstype == 'reiserfs':
856 # reiserfs journal size is in blocks
857 if jsize: jopt = "--journal_size %d" %(jsize,)
858 mkfs = 'mkreiserfs -ff'
860 panic('unsupported fs type: ', fstype)
862 if config.mkfsoptions != None:
863 mkfs = mkfs + ' ' + config.mkfsoptions
864 if mkfsoptions != None:
865 mkfs = mkfs + ' ' + mkfsoptions
866 (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt)
868 panic("Unable to build fs:", dev, string.join(out))
869 # enable hash tree indexing on fsswe
870 if fstype in ('ext3', 'extN', 'ldiskfs'):
871 htree = 'echo "feature FEATURE_C5" | debugfs -w'
872 (ret, out) = run (htree, dev)
874 panic("Unable to enable htree:", dev)
876 # some systems use /dev/loopN, some /dev/loop/N
880 if not os.access(loop + str(0), os.R_OK):
882 if not os.access(loop + str(0), os.R_OK):
883 panic("can't access loop devices")
# find the loop device assigned to the file
889 for n in xrange(0, MAX_LOOP_DEVICES):
891 if os.access(dev, os.R_OK):
892 (stat, out) = run('losetup', dev)
893 if out and stat == 0:
894 m = re.search(r'\((.*)\)', out[0])
895 if m and file == m.group(1):
901 # create file if necessary and assign the first free loop device
902 def init_loop(file, size, fstype, journal_size, inode_size, mkfsoptions, reformat):
903 dev = find_loop(file)
905 print 'WARNING file:', file, 'already mapped to', dev
907 if reformat or not os.access(file, os.R_OK | os.W_OK):
909 panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (file,size))
910 (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size,
913 panic("Unable to create backing store:", file)
914 mkfs(file, size, fstype, journal_size, inode_size, mkfsoptions, isblock=0)
917 # find next free loop
918 for n in xrange(0, MAX_LOOP_DEVICES):
920 if os.access(dev, os.R_OK):
921 (stat, out) = run('losetup', dev)
923 run('losetup', dev, file)
926 print "out of loop devices"
928 print "out of loop devices"
931 # undo loop assignment
932 def clean_loop(file):
933 dev = find_loop(file)
935 ret, out = run('losetup -d', dev)
937 log('unable to clean loop device:', dev, 'for file:', file)
940 # determine if dev is formatted as a <fstype> filesystem
941 def need_format(fstype, dev):
942 # FIXME don't know how to implement this
945 # initialize a block device if needed
946 def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
947 inode_size, mkfsoptions):
948 if config.noexec: return dev
949 if not is_block(dev):
950 dev = init_loop(dev, size, fstype, journal_size, inode_size,
951 mkfsoptions, reformat)
952 elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
953 mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
956 # panic("device:", dev,
957 # "not prepared, and autoformat is not set.\n",
958 # "Rerun with --reformat option to format ALL filesystems")
963 """lookup IP address for an interface"""
964 rc, out = run("/sbin/ifconfig", iface)
967 addr = string.split(out[1])[1]
968 ip = string.split(addr, ':')[1]
def def_mount_options(fstype, target):
    """Return the default mount option string for the given fstype and
    target role ('mds' or 'ost').

    Only ext3/ldiskfs have defaults here; any other fstype yields None.
    """
    if fstype in ('ext3', 'ldiskfs'):
        opts = "errors=remount-ro"
        # 2.4-series kernels want asynchronous deletes on OSTs.
        if target == 'ost' and sys_get_branch() == '2.4':
            opts = opts + ",asyncdel"
        return opts
980 def sys_get_elan_position_file():
981 procfiles = ["/proc/elan/device0/position",
982 "/proc/qsnet/elan4/device0/position",
983 "/proc/qsnet/elan3/device0/position"]
985 if os.access(p, os.R_OK):
989 def sys_get_local_nid(net_type, wildcard, cluster_id):
990 """Return the local nid."""
992 if sys_get_elan_position_file() and net_type == 'elan':
993 local = sys_get_local_address('elan', '*', cluster_id)
995 local = sys_get_local_address(net_type, wildcard, cluster_id)
998 def sys_get_local_address(net_type, wildcard, cluster_id):
999 """Return the local address for the network type."""
1001 if net_type in ('tcp','openib','iib','vib','ra'):
1003 iface, star = string.split(wildcard, ':')
1004 local = if2addr(iface)
1006 panic("unable to determine ip for:", wildcard)
1008 host = socket.gethostname()
1009 local = socket.gethostbyname(host)
1010 elif net_type == 'elan':
1011 # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()'
1012 f = sys_get_elan_position_file()
1014 panic("unable to determine local Elan ID")
1017 lines = fp.readlines()
1021 if a[0] == 'NodeId':
1025 nid = my_int(cluster_id) + my_int(elan_id)
1026 local = "%d" % (nid)
1027 except ValueError, e:
1031 elif net_type == 'lo':
1032 fixme("automatic local address for loopback")
1033 elif net_type == 'gm':
1034 gmnalnid = '/usr/sbin/gmnalnid'
1035 if os.path.exists(gmnalnid) and os.access(gmnalnid, os.X_OK):
1036 (rc, local) = run(gmnalnid, "-l")
1038 panic (gmnalnid, " not found or not executable on node with GM networking")
1040 panic (gmnalnid, " failed")
1041 local=string.rstrip(local[0])
1043 fixme("automatic local address for net type %s" % net_type)
1047 def sys_get_branch():
1048 """Returns kernel release"""
1050 fp = open('/proc/sys/kernel/osrelease')
1051 lines = fp.readlines()
1055 version = string.split(l)
1056 a = string.split(version[0], '.')
1057 return a[0] + '.' + a[1]
1062 def mod_loaded(modname):
1063 """Check if a module is already loaded. Look in /proc/modules for it."""
1065 fp = open('/proc/modules')
1066 lines = fp.readlines()
1068 # please forgive my tired fingers for this one
1069 ret = filter(lambda word, mod=modname: word == mod,
1070 map(lambda line: string.split(line)[0], lines))
1072 except Exception, e:
1075 # XXX: instead of device_list, ask for $name and see what we get
1076 def is_prepared(name):
1077 """Return true if a device exists for the name"""
1078 if config.lctl_dump:
1080 if (config.noexec or config.record) and config.cleanup:
1083 # expect this format:
1084 # 1 UP ldlm ldlm ldlm_UUID 2
1085 out = lctl.device_list()
1087 if name == string.split(s)[3]:
1089 except CommandError, e:
1093 def is_network_prepared():
1094 """If the any device exists, then assume that all networking
1095 has been configured"""
1096 out = lctl.device_list()
1099 def fs_is_mounted(path):
1100 """Return true if path is a mounted lustre filesystem"""
1102 fp = open('/proc/mounts')
1103 lines = fp.readlines()
1107 if a[1] == path and a[2] == 'lustre_lite':
1115 """Manage kernel modules"""
def __init__(self, lustre_dir, portals_dir):
    # Roots of the lustre and portals module trees, used to locate .ko/.o files.
    self.lustre_dir = lustre_dir
    self.portals_dir = portals_dir
    # Ordered list of (src_dir, dev_dir, modname) tuples to load.
    self.kmodule_list = []
def add_portals_module(self, dev_dir, modname):
    """Append a portals-tree module (as a (src, dev_dir, name) tuple) to the
    list of modules to load."""
    self.kmodule_list.append((self.portals_dir, dev_dir, modname))
def add_lustre_module(self, dev_dir, modname):
    """Append a lustre-tree module (as a (src, dev_dir, name) tuple) to the
    list of modules to load."""
    self.kmodule_list.append((self.lustre_dir, dev_dir, modname))
1129 def load_module(self):
1130 """Load all the modules in the list in the order they appear."""
1131 for src_dir, dev_dir, mod in self.kmodule_list:
1132 if mod_loaded(mod) and not config.noexec:
1134 log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
1136 module = find_module(src_dir, dev_dir, mod)
1138 panic('module not found:', mod)
1139 (rc, out) = run('/sbin/insmod', module)
1140 if rc and not mod_loaded(mod):
1141 raise CommandError('insmod', out, rc)
1143 (rc, out) = run('/sbin/modprobe', mod)
1144 if rc and not mod_loaded(mod):
1145 raise CommandError('modprobe', out, rc)
1147 def cleanup_module(self):
1148 """Unload the modules in the list in reverse order."""
1150 rev = self.kmodule_list[:] # make *copy* of list
1152 for src_dir, dev_dir, mod in rev:
1153 if not mod_loaded(mod) and not config.noexec:
1156 if mod == 'portals' and config.dump:
1157 lctl.dump(config.dump)
1158 log('unloading module:', mod)
1159 (rc, out) = run('/sbin/rmmod', mod)
1161 log('! unable to unload module:', mod)
1164 # ============================================================
1165 # Classes to prepare and cleanup the various objects
1168 """ Base class for the rest of the modules. The default cleanup method is
1169 defined here, as well as some utilitiy funcs.
1171 def __init__(self, module_name, db):
1173 self.module_name = module_name
1174 self.name = self.db.getName()
1175 self.uuid = self.db.getUUID()
1178 self.kmod = kmod(config.lustre, config.portals)
def info(self, *args):
    """Print a progress line prefixed with this module's type, name and uuid."""
    msg = string.join(map(str,args))
    print self.module_name + ":", self.name, self.uuid, msg
1185 """ default cleanup, used for most modules """
1188 lctl.cleanup(self.name, self.uuid, config.force)
1189 except CommandError, e:
1190 log(self.module_name, "cleanup failed: ", self.name)
def add_portals_module(self, dev_dir, modname):
    """Append a module to list of modules to load (delegates to self.kmod)."""
    self.kmod.add_portals_module(dev_dir, modname)
def add_lustre_module(self, dev_dir, modname):
    """Append a module to list of modules to load (delegates to self.kmod)."""
    self.kmod.add_lustre_module(dev_dir, modname)
def load_module(self):
    """Load all the modules in the list in the order they appear
    (delegates to self.kmod)."""
    self.kmod.load_module()
def cleanup_module(self):
    """Unload the modules in the list in reverse order, but only when this
    module reports it is safe to do so."""
    if self.safe_to_clean():
        self.kmod.cleanup_module()
1211 def safe_to_clean(self):
def safe_to_clean_modules(self):
    """By default, modules are unloadable exactly when the device is."""
    return self.safe_to_clean()
1217 class Network(Module):
1218 def __init__(self,db):
1219 Module.__init__(self, 'NETWORK', db)
1220 self.net_type = self.db.get_val('nettype')
1221 self.nid = self.db.get_val('nid', '*')
1222 self.cluster_id = self.db.get_val('clusterid', "0")
1223 self.port = self.db.get_val_int('port', 0)
1226 self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
1228 panic("unable to set nid for", self.net_type, self.nid, self.cluster_id)
1229 self.generic_nid = 1
1230 debug("nid:", self.nid)
1232 self.generic_nid = 0
1234 self.nid_uuid = self.nid_to_uuid(self.nid)
1236 self.hostaddr = self.db.get_hostaddr()
1237 if len(self.hostaddr) == 0:
1238 self.hostaddr.append(self.nid)
1239 if '*' in self.hostaddr[0]:
1240 self.hostaddr[0] = sys_get_local_address(self.net_type, self.hostaddr[0], self.cluster_id)
1241 if not self.hostaddr[0]:
1242 panic("unable to set hostaddr for", self.net_type, self.hostaddr[0], self.cluster_id)
1243 debug("hostaddr:", self.hostaddr[0])
1245 self.add_portals_module("libcfs", 'libcfs')
1246 self.add_portals_module("portals", 'portals')
1247 if node_needs_router():
1248 self.add_portals_module("router", 'kptlrouter')
1249 if self.net_type == 'tcp':
1250 self.add_portals_module("knals/socknal", 'ksocknal')
1251 if self.net_type == 'elan':
1252 self.add_portals_module("knals/qswnal", 'kqswnal')
1253 if self.net_type == 'gm':
1254 self.add_portals_module("knals/gmnal", 'kgmnal')
1255 if self.net_type == 'openib':
1256 self.add_portals_module("knals/openibnal", 'kopenibnal')
1257 if self.net_type == 'iib':
1258 self.add_portals_module("knals/iibnal", 'kiibnal')
1259 if self.net_type == 'vib':
1260 self.add_portals_module("knals/vibnal", 'kvibnal')
1261 if self.net_type == 'lo':
1262 self.add_portals_module("knals/lonal", 'klonal')
1263 if self.net_type == 'ra':
1264 self.add_portals_module("knals/ranal", 'kranal')
def nid_to_uuid(self, nid):
    """Return the canonical UUID string used to register `nid` with lctl."""
    return "NID_" + str(nid) + "_UUID"
1270 if is_network_prepared():
1272 self.info(self.net_type, self.nid, self.port)
1273 if not (config.record and self.generic_nid):
1274 lctl.network(self.net_type, self.nid)
1275 if self.net_type == 'tcp':
1277 for hostaddr in self.db.get_hostaddr():
1278 ip = string.split(hostaddr, '/')[0]
1279 if len(string.split(hostaddr, '/')) == 2:
1280 netmask = string.split(hostaddr, '/')[1]
1283 lctl.add_interface(self.net_type, ip, netmask)
1284 if self.net_type == 'elan':
1286 if self.net_type == 'openib':
1288 panic("no port set for", self.net_type, self.hostaddr[0])
1289 sysctl('/proc/sys/openibnal/port', self.port)
1290 if self.net_type == 'ra':
1292 panic("no port set for", self.net_type, self.hostaddr[0])
1293 sysctl('/proc/sys/ranal/port', self.port)
1294 if self.port and node_is_router():
1295 run_one_acceptor(self.port)
1296 self.connect_peer_gateways()
1298 def connect_peer_gateways(self):
1299 for router in self.db.lookup_class('node'):
1300 if router.get_val_int('router', 0):
1301 for netuuid in router.get_networks():
1302 net = self.db.lookup(netuuid)
1304 if (gw.cluster_id == self.cluster_id and
1305 gw.net_type == self.net_type):
1306 if gw.nid != self.nid:
1309 def disconnect_peer_gateways(self):
1310 for router in self.db.lookup_class('node'):
1311 if router.get_val_int('router', 0):
1312 for netuuid in router.get_networks():
1313 net = self.db.lookup(netuuid)
1315 if (gw.cluster_id == self.cluster_id and
1316 gw.net_type == self.net_type):
1317 if gw.nid != self.nid:
1320 except CommandError, e:
1321 print "disconnect failed: ", self.name
1325 def safe_to_clean(self):
1326 return not is_network_prepared()
1329 self.info(self.net_type, self.nid, self.port)
1331 stop_acceptor(self.port)
1332 if node_is_router():
1333 self.disconnect_peer_gateways()
1334 if self.net_type == 'tcp':
1335 for hostaddr in self.db.get_hostaddr():
1336 ip = string.split(hostaddr, '/')[0]
1337 lctl.del_interface(self.net_type, ip)
1339 class RouteTable(Module):
1340 def __init__(self,db):
1341 Module.__init__(self, 'ROUTES', db)
1343 def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
1345 # only setup connections for tcp, ib, and ra NALs
1347 if not net_type in ('tcp','openib','iib','vib','ra'):
1350 # connect to target if route is to single node and this node is the gw
1351 if lo == hi and local_interface(net_type, gw_cluster_id, gw):
1352 if not local_cluster(net_type, tgt_cluster_id):
1353 panic("target", lo, " not on the local cluster")
1354 srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
1355 # connect to gateway if this node is not the gw
1356 elif (local_cluster(net_type, gw_cluster_id)
1357 and not local_interface(net_type, gw_cluster_id, gw)):
1358 srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
1363 panic("no server for nid", lo)
1366 return Network(srvdb)
1369 if is_network_prepared():
1372 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1373 lctl.add_route(net_type, gw, lo, hi)
1374 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1378 def safe_to_clean(self):
1379 return not is_network_prepared()
1382 if is_network_prepared():
1383 # the network is still being used, don't clean it up
1385 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1386 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1389 lctl.disconnect(srv)
1390 except CommandError, e:
1391 print "disconnect failed: ", self.name
1396 lctl.del_route(net_type, gw, lo, hi)
1397 except CommandError, e:
1398 print "del_route failed: ", self.name
1402 class Management(Module):
1403 def __init__(self, db):
1404 Module.__init__(self, 'MGMT', db)
1405 self.add_lustre_module('lvfs', 'lvfs')
1406 self.add_lustre_module('obdclass', 'obdclass')
1407 self.add_lustre_module('ptlrpc', 'ptlrpc')
1408 self.add_lustre_module('mgmt', 'mgmt_svc')
1411 if is_prepared(self.name):
1414 lctl.newdev("mgmt", self.name, self.uuid)
1416 def safe_to_clean(self):
1420 if is_prepared(self.name):
1421 Module.cleanup(self)
1423 # This is only needed to load the modules; the LDLM device
1424 # is now created automatically.
1426 def __init__(self,db):
1427 Module.__init__(self, 'LDLM', db)
1428 self.add_lustre_module('lvfs', 'lvfs')
1429 self.add_lustre_module('obdclass', 'obdclass')
1430 self.add_lustre_module('ptlrpc', 'ptlrpc')
# LOV (logical object volume) driver: aggregates a list of OSCs and passes
# the striping parameters (size/offset/pattern/count) to lctl.lov_setup.
# NOTE(review): the LOV class header and several interior lines are elided
# from this numbered listing.
1439 def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
1440 Module.__init__(self, 'LOV', db)
1441 if name_override != None:
1442 self.name = "lov_%s" % name_override
1443 self.add_lustre_module('lov', 'lov')
1444 self.mds_uuid = self.db.get_first_ref('mds')
# striping defaults: 1MB stripe size, offset 0, pattern 0, count = #OBDs
1445 self.stripe_sz = self.db.get_val_int('stripesize', 1048576)
1446 self.stripe_off = self.db.get_val_int('stripeoffset', 0)
1447 self.pattern = self.db.get_val_int('stripepattern', 0)
1448 self.devlist = self.db.get_refs('obd')
1449 self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
1451 self.desc_uuid = self.uuid
1452 self.uuid = generate_client_uuid(self.name)
1453 self.fs_name = fs_name
# config_only LOVs are placeholders used while writing config logs; they
# refuse prepare/cleanup/module operations below.
1455 self.config_only = 1
1457 self.config_only = None
1458 mds= self.db.lookup(self.mds_uuid)
1459 self.mds_name = mds.getName()
# build one OSC per referenced OBD
1460 for obd_uuid in self.devlist:
1461 obd = self.db.lookup(obd_uuid)
1462 osc = get_osc(obd, self.uuid, fs_name)
1464 self.osclist.append(osc)
1466 panic('osc not found:', obd_uuid)
# prepare(): set up each OSC, then the LOV device itself.
1469 if is_prepared(self.name):
1471 if self.config_only:
1472 panic("Can't prepare config_only LOV ", self.name)
1474 for osc in self.osclist:
1476 # Only ignore connect failures with --force, which
1477 # isn't implemented here yet.
1478 osc.prepare(ignore_connect_failure=0)
1479 except CommandError, e:
1480 print "Error preparing OSC %s\n" % osc.uuid
1482 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
1483 self.stripe_off, self.pattern, self.devlist, self.mds_name)
1484 lctl.lov_setup(self.name, self.uuid,
1485 self.desc_uuid, self.mds_name, self.stripe_cnt,
1486 self.stripe_sz, self.stripe_off, self.pattern,
1487 string.join(self.devlist))
# cleanup(): tear down the LOV device and then each member OSC.
1490 if is_prepared(self.name):
1491 Module.cleanup(self)
1492 if self.config_only:
1493 panic("Can't clean up config_only LOV ", self.name)
1494 for osc in self.osclist:
1497 def load_module(self):
1498 if self.config_only:
1499 panic("Can't load modules for config_only LOV ", self.name)
1500 for osc in self.osclist:
1503 Module.load_module(self)
1505 def cleanup_module(self):
1506 if self.config_only:
1507 panic("Can't cleanup modules for config_only LOV ", self.name)
1508 Module.cleanup_module(self)
1509 for osc in self.osclist:
1510 osc.cleanup_module()
# MDSDEV: the metadata server device.  Reads device/filesystem parameters
# from the config DB, formats/mounts the backing block device, and records
# the client configuration logs used for zero-conf mounting.
# NOTE(review): this listing is elided (embedded line numbers skip), so
# method bodies below are missing lines (try:/else:/return statements etc.).
1513 class MDSDEV(Module):
1514 def __init__(self,db):
1515 Module.__init__(self, 'MDSDEV', db)
1516 self.devpath = self.db.get_val('devpath','')
1517 self.size = self.db.get_val_int('devsize', 0)
1518 self.journal_size = self.db.get_val_int('journalsize', 0)
1519 self.fstype = self.db.get_val('fstype', '')
1520 self.nspath = self.db.get_val('nspath', '')
1521 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1522 self.mountfsoptions = self.db.get_val('mountfsoptions', '')
1523 # overwrite the original MDSDEV name and uuid with the MDS name and uuid
1524 target_uuid = self.db.get_first_ref('target')
1525 mds = self.db.lookup(target_uuid)
1526 self.name = mds.getName()
1527 self.filesystem_uuids = mds.get_refs('filesystem')
1528 # FIXME: if fstype not set, then determine based on kernel version
1529 self.format = self.db.get_val('autoformat', "no")
1530 if mds.get_val('failover', 0):
1531 self.failover_mds = 'f'
1533 self.failover_mds = 'n'
1534 active_uuid = get_active_target(mds)
1536 panic("No target device found:", target_uuid)
1537 if active_uuid == self.uuid:
1541 if self.active and config.group and config.group != mds.get_val('group'):
# If no explicit inode size was configured, derive one from the default
# stripe count of the LOV attached to this MDS (wider stripes need larger
# inodes to hold the striping EA).
1544 self.inode_size = self.db.get_val_int('inodesize', 0)
1545 if self.inode_size == 0:
1546 # find the LOV for this MDS
1547 lovconfig_uuid = mds.get_first_ref('lovconfig')
1548 if not lovconfig_uuid:
1549 panic("No LOV config found for MDS ", mds.name)
1550 lovconfig = mds.lookup(lovconfig_uuid)
1551 lov_uuid = lovconfig.get_first_ref('lov')
1553 panic("No LOV found for lovconfig ", lovconfig.name)
1554 lov = LOV(self.db.lookup(lov_uuid), lov_uuid, 'FS_name', config_only = 1)
1556 # default stripe count controls default inode_size
1557 if (lov.stripe_cnt > 0):
1558 stripe_count = lov.stripe_cnt
1560 stripe_count = len(lov.devlist)
1561 if stripe_count > 77:
1562 self.inode_size = 4096
1563 elif stripe_count > 34:
1564 self.inode_size = 2048
1565 elif stripe_count > 13:
1566 self.inode_size = 1024
1567 elif stripe_count > 2:
1568 self.inode_size = 512
1570 self.inode_size = 256
1572 self.target_dev_uuid = self.uuid
1573 self.uuid = target_uuid
# module stack needed by the MDS, plus the fsfilt glue for the backing fs
1576 self.add_lustre_module('mdc', 'mdc')
1577 self.add_lustre_module('osc', 'osc')
1578 self.add_lustre_module('lov', 'lov')
1579 self.add_lustre_module('mds', 'mds')
1580 if self.fstype == 'ldiskfs':
1581 self.add_lustre_module('ldiskfs', 'ldiskfs')
1583 self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
1585 def load_module(self):
1587 Module.load_module(self)
# prepare(): attach the block device and create MDT + MDS devices.
1590 if is_prepared(self.name):
1593 debug(self.uuid, "not active")
1596 # run write_conf automatically, if --reformat used
1598 self.info(self.devpath, self.fstype, self.size, self.format)
1600 # never reformat here
1601 blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
1602 self.format, self.journal_size, self.inode_size,
1604 if not is_prepared('MDT'):
1605 lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
# merge mount options: fs-type defaults, then --mountfsoptions from the
# command line, then per-device options from the config
1607 mountfsoptions = def_mount_options(self.fstype, 'mds')
1609 if config.mountfsoptions:
1611 mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
1613 mountfsoptions = config.mountfsoptions
1614 if self.mountfsoptions:
1615 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1617 if self.mountfsoptions:
1619 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1621 mountfsoptions = self.mountfsoptions
1623 print 'MDS mount options: ' + mountfsoptions
1625 lctl.newdev("mds", self.name, self.uuid,
1626 setup ="%s %s %s %s" %(blkdev, self.fstype, self.name, mountfsoptions))
1627 except CommandError, e:
1629 panic("MDS is missing the config log. Need to run " +
1630 "lconf --write_conf.")
# write_conf(): set up the MDS device, then record the LOV/client llogs.
1634 def write_conf(self):
1635 if is_prepared(self.name):
1637 self.info(self.devpath, self.fstype, self.format)
1638 blkdev = block_dev(self.devpath, self.size, self.fstype,
1639 config.reformat, self.format, self.journal_size,
1640 self.inode_size, self.mkfsoptions)
1641 lctl.newdev("mds", self.name, self.uuid,
1642 setup ="%s %s" %(blkdev, self.fstype))
1644 # record logs for the MDS lov
1645 for uuid in self.filesystem_uuids:
1646 log("recording clients for filesystem:", uuid)
1647 fs = self.db.lookup(uuid)
1648 obd_uuid = fs.get_first_ref('obd')
1649 client_uuid = generate_client_uuid(self.name)
1650 client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
# record the setup log and a matching "-clean" teardown log
1653 lctl.clear_log(self.name, self.name)
1654 lctl.record(self.name, self.name)
1656 lctl.mount_option(self.name, client.get_name(), "")
1660 lctl.clear_log(self.name, self.name + '-clean')
1661 lctl.record(self.name, self.name + '-clean')
1663 lctl.del_mount_option(self.name)
1668 # record logs for each client
1670 config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
1672 config_options = CONFIG_FILE
1674 for node_db in self.db.lookup_class('node'):
1675 client_name = node_db.getName()
1676 for prof_uuid in node_db.get_refs('profile'):
1677 prof_db = node_db.lookup(prof_uuid)
1678 # refactor this into a function to test "clientness"
1680 for ref_class, ref_uuid in prof_db.get_all_refs():
1681 if ref_class in ('mountpoint','echoclient'):
1682 debug("recording", client_name)
1683 old_noexec = config.noexec
# re-invoke lconf itself with --record to generate the client llog
1685 noexec_opt = ('', '-n')
1686 ret, out = run (sys.argv[0],
1687 noexec_opt[old_noexec == 1],
1688 " -v --record --nomod",
1689 "--record_log", client_name,
1690 "--record_device", self.name,
1691 "--node", client_name,
1694 lctl.clear_log(self.name, client_name)
1697 panic("Record client log %s on %s failed" %(
1698 client_name, self.name))
1700 for s in out: log("record> ", string.strip(s))
1701 ret, out = run (sys.argv[0],
1702 noexec_opt[old_noexec == 1],
1703 "--cleanup -v --record --nomod",
1704 "--record_log", client_name + "-clean",
1705 "--record_device", self.name,
1706 "--node", client_name,
1709 # If recording the cleanup log fails, 0-conf mount would still work
1710 # but 0-conf umount would not, forcing the user to clean up the
1711 # client service manually every time.  So delete both llogs
1712 # together and make the user re-run --write_conf instead.
1713 lctl.clear_log(self.name, client_name)
1714 lctl.clear_log(self.name, client_name + '-clean')
1717 panic("Record client log %s on %s failed" %(
1718 client_name + '-clean', self.name))
1720 for s in out: log("record> ", string.strip(s))
1721 config.noexec = old_noexec
1723 lctl.cleanup(self.name, self.uuid, config.force, config.failover)
1724 except CommandError, e:
1725 log(self.module_name, "cleanup failed: ", self.name)
1728 Module.cleanup(self)
1729 clean_loop(self.devpath)
# msd_remaining(): true while any 'mds' device is still listed by lctl.
1731 def msd_remaining(self):
1732 out = lctl.device_list()
1734 if string.split(s)[2] in ('mds',):
1737 def safe_to_clean(self):
1740 def safe_to_clean_modules(self):
1741 return not self.msd_remaining()
# cleanup(): tear down this MDS; remove the shared MDT device only when no
# other MDS devices remain.
1745 debug(self.uuid, "not active")
1748 if is_prepared(self.name):
1750 lctl.cleanup(self.name, self.uuid, config.force,
1752 except CommandError, e:
1753 log(self.module_name, "cleanup failed: ", self.name)
1756 Module.cleanup(self)
1757 if not self.msd_remaining() and is_prepared('MDT'):
1759 lctl.cleanup("MDT", "MDT_UUID", config.force,
1761 except CommandError, e:
1762 print "cleanup failed: ", self.name
1765 clean_loop(self.devpath)
# OSD: object storage device (obdfilter or obdecho).  Mirrors MDSDEV's
# structure: read config, attach/format backing device, create OSS device.
# NOTE(review): the OSD class header and interior lines are elided from this
# numbered listing.
1768 def __init__(self, db):
1769 Module.__init__(self, 'OSD', db)
1770 self.osdtype = self.db.get_val('osdtype')
1771 self.devpath = self.db.get_val('devpath', '')
1772 self.size = self.db.get_val_int('devsize', 0)
1773 self.journal_size = self.db.get_val_int('journalsize', 0)
1774 self.inode_size = self.db.get_val_int('inodesize', 0)
1775 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1776 self.mountfsoptions = self.db.get_val('mountfsoptions', '')
1777 self.fstype = self.db.get_val('fstype', '')
1778 self.nspath = self.db.get_val('nspath', '')
# take the name/uuid of the OST target this device backs
1779 target_uuid = self.db.get_first_ref('target')
1780 ost = self.db.lookup(target_uuid)
1781 self.name = ost.getName()
1782 self.format = self.db.get_val('autoformat', 'yes')
1783 if ost.get_val('failover', 0):
1784 self.failover_ost = 'f'
1786 self.failover_ost = 'n'
1788 active_uuid = get_active_target(ost)
1790 panic("No target device found:", target_uuid)
1791 if active_uuid == self.uuid:
1795 if self.active and config.group and config.group != ost.get_val('group'):
1798 self.target_dev_uuid = self.uuid
1799 self.uuid = target_uuid
1801 self.add_lustre_module('ost', 'ost')
1802 # FIXME: should we default to ext3 here?
1803 if self.fstype == 'ldiskfs':
1804 self.add_lustre_module('ldiskfs', 'ldiskfs')
1806 self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
1807 self.add_lustre_module(self.osdtype, self.osdtype)
1809 def load_module(self):
1811 Module.load_module(self)
# prepare(): attach the backing device and create the OSD + OSS devices.
1813 # need to check /proc/mounts and /etc/mtab before
1814 # formatting anything.
1815 # FIXME: check if device is already formatted.
1817 if is_prepared(self.name):
1820 debug(self.uuid, "not active")
1822 self.info(self.osdtype, self.devpath, self.size, self.fstype,
1823 self.format, self.journal_size, self.inode_size)
# obdecho has no backing block device
1825 if self.osdtype == 'obdecho':
1828 blkdev = block_dev(self.devpath, self.size, self.fstype,
1829 config.reformat, self.format, self.journal_size,
1830 self.inode_size, self.mkfsoptions)
# merge mount options the same way MDSDEV.prepare does
1832 mountfsoptions = def_mount_options(self.fstype, 'ost')
1834 if config.mountfsoptions:
1836 mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
1838 mountfsoptions = config.mountfsoptions
1839 if self.mountfsoptions:
1840 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1842 if self.mountfsoptions:
1844 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1846 mountfsoptions = self.mountfsoptions
1848 print 'OST mount options: ' + mountfsoptions
1850 lctl.newdev(self.osdtype, self.name, self.uuid,
1851 setup ="%s %s %s %s" %(blkdev, self.fstype,
1852 self.failover_ost, mountfsoptions))
1853 if not is_prepared('OSS'):
1854 lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
# osd_remaining(): true while any obdfilter/obdecho device is still listed.
1856 def osd_remaining(self):
1857 out = lctl.device_list()
1859 if string.split(s)[2] in ('obdfilter', 'obdecho'):
1862 def safe_to_clean(self):
1865 def safe_to_clean_modules(self):
1866 return not self.osd_remaining()
# cleanup(): tear down this OSD; remove the shared OSS device only when no
# other OSD devices remain; release the loop device unless obdecho.
1870 debug(self.uuid, "not active")
1872 if is_prepared(self.name):
1875 lctl.cleanup(self.name, self.uuid, config.force,
1877 except CommandError, e:
1878 log(self.module_name, "cleanup failed: ", self.name)
1881 if not self.osd_remaining() and is_prepared('OSS'):
1883 lctl.cleanup("OSS", "OSS_UUID", config.force,
1885 except CommandError, e:
1886 print "cleanup failed: ", self.name
1889 if not self.osdtype == 'obdecho':
1890 clean_loop(self.devpath)
# Resolve the management-service uuid for the filesystem behind a mountpoint
# name, via mountpoint -> filesystem -> mgmt references in the top-level DB.
# NOTE(review): elided listing — guard/return lines are missing here.
1892 def mgmt_uuid_for_fs(mtpt_name):
1895 mtpt_db = toplustreDB.lookup_name(mtpt_name)
1896 fs_uuid = mtpt_db.get_first_ref('filesystem')
1897 fs = toplustreDB.lookup(fs_uuid)
1900 return fs.get_first_ref('mgmt')
1902 # Generic client module, used by OSC and MDC
# Connects a local device to a remote target: picks a local server on the
# same cluster or sets up portals routes, then creates the client device.
# NOTE(review): elided listing — lines are missing inside these methods.
1903 class Client(Module):
1904 def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
1906 self.target_name = tgtdb.getName()
1907 self.target_uuid = tgtdb.getUUID()
1910 self.tgt_dev_uuid = get_active_target(tgtdb)
1911 if not self.tgt_dev_uuid:
1912 panic("No target device found for target:", self.target_name)
1914 self.kmod = kmod(config.lustre, config.portals)
1918 self.module = module
1919 self.module_name = string.upper(module)
# default device name encodes module, host, target and filesystem;
# self_name (when given) overrides it
1921 self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
1922 self.target_name, fs_name)
1924 self.name = self_name
1926 self.lookup_server(self.tgt_dev_uuid)
1927 mgmt_uuid = mgmt_uuid_for_fs(fs_name)
1929 self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
1932 self.fs_name = fs_name
1935 self.add_lustre_module(module_dir, module)
1937 def lookup_server(self, srv_uuid):
1938 """ Lookup a server's network information """
1939 self._server_nets = get_ost_net(self.db, srv_uuid)
1940 if len(self._server_nets) == 0:
1941 panic("Unable to find a server for:", srv_uuid)
1943 def get_servers(self):
1944 return self._server_nets
1946 def prepare(self, ignore_connect_failure = 0):
1947 self.info(self.target_uuid)
1948 if is_prepared(self.name):
# prefer a server on a local cluster; otherwise add portals routes
1951 srv = choose_local_server(self.get_servers())
1955 routes = find_route(self.get_servers())
1956 if len(routes) == 0:
1957 panic("no route to", self.target_uuid)
1958 for (srv, r) in routes:
1959 lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
1960 except CommandError, e:
1961 if not ignore_connect_failure:
# targets listed in --inactive start as "inactive" if the subclass allows it
1965 if self.target_uuid in config.inactive and self.permits_inactive():
1966 debug("%s inactive" % self.target_uuid)
1967 inactive_p = "inactive"
1969 debug("%s active" % self.target_uuid)
1971 lctl.newdev(self.module, self.name, self.uuid,
1972 setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
1973 inactive_p, self.mgmt_name))
# cleanup(): tear down the device, disconnect, and drop any routes added.
1976 if is_prepared(self.name):
1977 Module.cleanup(self)
1979 srv = choose_local_server(self.get_servers())
1981 lctl.disconnect(srv)
1983 for (srv, r) in find_route(self.get_servers()):
1984 lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
1985 except CommandError, e:
1986 log(self.module_name, "cleanup failed: ", self.name)
# MDC: metadata client — thin Client subclass for the 'mdc' module.
# NOTE(review): the MDC class header and the permits_inactive() body are
# elided from this listing.
1992 def __init__(self, db, uuid, fs_name):
1993 Client.__init__(self, db, uuid, 'mdc', fs_name)
1995 def permits_inactive(self):
# OSC: object storage client — thin Client subclass for the 'osc' module.
# NOTE(review): the OSC class header and the permits_inactive() body are
# elided from this listing.
1999 def __init__(self, db, uuid, fs_name):
2000 Client.__init__(self, db, uuid, 'osc', fs_name)
2002 def permits_inactive(self):
def mgmtcli_name_for_uuid(uuid):
    """Return the canonical management-client device name for *uuid*.

    The name is simply the uuid with a fixed 'MGMTCLI_' prefix, so the
    same uuid always maps to the same device name.
    """
    devname = 'MGMTCLI_%s' % uuid
    return devname
# Client for the management service; its device name is derived from the
# mgmt service uuid so Client.__init__ and mgmt_uuid_for_fs() agree on it.
2008 class ManagementClient(Client):
2009 def __init__(self, db, uuid):
2010 Client.__init__(self, db, uuid, 'mgmt_cli', '',
2011 self_name = mgmtcli_name_for_uuid(db.getUUID()),
2012 module_dir = 'mgmt')
# COBD: caching OBD layered over a "real" OBD and a "cache" OBD.
# NOTE(review): the COBD class header and the prepare() header are elided
# from this listing.
2015 def __init__(self, db):
2016 Module.__init__(self, 'COBD', db)
2017 self.real_uuid = self.db.get_first_ref('realobd')
2018 self.cache_uuid = self.db.get_first_ref('cacheobd')
2019 self.add_lustre_module('cobd' , 'cobd')
2021 # need to check /proc/mounts and /etc/mtab before
2022 # formatting anything.
2023 # FIXME: check if device is already formatted.
2025 if is_prepared(self.name):
2027 self.info(self.real_uuid, self.cache_uuid)
2028 lctl.newdev("cobd", self.name, self.uuid,
2029 setup ="%s %s" %(self.real_uuid, self.cache_uuid))
2032 # virtual interface for OSC and LOV
# VOSC wraps either a LOV (when the config class is 'lov') or a plain OSC
# and delegates all operations to the wrapped object.
# NOTE(review): the class header and several delegating method headers are
# elided from this listing.
2034 def __init__(self, db, uuid, fs_name, name_override = None):
2035 Module.__init__(self, 'VOSC', db)
2036 if db.get_class() == 'lov':
2037 self.osc = LOV(db, uuid, fs_name, name_override)
2039 self.osc = get_osc(db, uuid, fs_name)
2041 return self.osc.uuid
2043 return self.osc.name
2048 def load_module(self):
2049 self.osc.load_module()
2050 def cleanup_module(self):
2051 self.osc.cleanup_module()
# ECHO_CLIENT: test client that attaches an echo_client device on top of a
# VOSC (OSC or LOV) for I/O testing with obdecho.
# NOTE(review): elided listing — prepare()/cleanup() headers are missing.
2054 class ECHO_CLIENT(Module):
2055 def __init__(self,db):
2056 Module.__init__(self, 'ECHO_CLIENT', db)
2057 self.add_lustre_module('obdecho', 'obdecho')
2058 self.obd_uuid = self.db.get_first_ref('obd')
2059 obd = self.db.lookup(self.obd_uuid)
2060 self.uuid = generate_client_uuid(self.name)
2061 self.osc = VOSC(obd, self.uuid, self.name)
2064 if is_prepared(self.name):
2067 self.osc.prepare() # XXX This is so cheating. -p
2068 self.info(self.obd_uuid)
2070 lctl.newdev("echo_client", self.name, self.uuid,
2071 setup = self.osc.get_name())
2074 if is_prepared(self.name):
2075 Module.cleanup(self)
2078 def load_module(self):
2079 self.osc.load_module()
2080 Module.load_module(self)
2082 def cleanup_module(self):
2083 Module.cleanup_module(self)
2084 self.osc.cleanup_module()
# Build a pseudo-random client uuid embedding up to 19 characters of the
# given name, truncated to the 36-character uuid length limit.
# NOTE(review): line 2089 (one tuple element, presumably the name) is elided.
2087 def generate_client_uuid(name):
2088 client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
2090 int(random.random() * 1048576),
2091 int(random.random() * 1048576))
2092 return client_uuid[:36]
2095 def my_rstrip(s, chars):
2096 """my_rstrip(s, chars) -> strips any instances of the characters
2097 found in chars from the right side of string s"""
2098 # XXX required because python versions pre 2.2.3 don't allow
2099 # string.rstrip() to take alternate char lists
# On old interpreters string.rstrip(s, '/') raises TypeError; fall back to
# a manual right-to-left scan.  NOTE(review): the fallback body and returns
# are elided from this listing.
2103 ns = string.rstrip(s, '/')
2104 except TypeError, e:
2105 for i in range(len(s) - 1, 0, -1):
# Mountpoint: a llite client mount.  Builds the VOSC/MDC stack and runs the
# actual mount(8)/umount(8) commands for the lustre_lite filesystem.
# NOTE(review): elided listing — method headers and interior lines missing.
2114 class Mountpoint(Module):
2115 def __init__(self,db):
2116 Module.__init__(self, 'MTPT', db)
2117 self.path = my_rstrip(self.db.get_val('path'), '/')
2118 self.clientoptions = self.db.get_val('clientoptions', '')
2119 self.fs_uuid = self.db.get_first_ref('filesystem')
2120 fs = self.db.lookup(self.fs_uuid)
2121 self.mds_uuid = fs.get_first_ref('mds')
2122 self.obd_uuid = fs.get_first_ref('obd')
2123 self.mgmt_uuid = fs.get_first_ref('mgmt')
2124 obd = self.db.lookup(self.obd_uuid)
2125 client_uuid = generate_client_uuid(self.name)
2126 self.vosc = VOSC(obd, client_uuid, self.name)
2127 self.mdc = get_mdc(db, client_uuid, self.name, self.mds_uuid)
2129 self.add_lustre_module('mdc', 'mdc')
2130 self.add_lustre_module('llite', 'llite')
# management client is optional — only when the filesystem references one
2132 self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
# prepare(): skip if already mounted, otherwise set up the stack and mount.
2138 if fs_is_mounted(self.path):
2139 log(self.path, "already mounted.")
2143 self.mgmtcli.prepare()
2146 mdc_name = self.mdc.name
2148 self.info(self.path, self.mds_uuid, self.obd_uuid)
2149 if config.record or config.lctl_dump:
2150 lctl.mount_option(local_node_name, self.vosc.get_name(), mdc_name)
# merge client options: config-file options plus --clientoptions overrides
2153 if config.clientoptions:
2154 if self.clientoptions:
2155 self.clientoptions = self.clientoptions + ',' + config.clientoptions
2157 self.clientoptions = config.clientoptions
2158 if self.clientoptions:
2159 self.clientoptions = ',' + self.clientoptions
2160 # Linux kernel will deal with async and not pass it to ll_fill_super,
2161 # so replace it with Lustre async
2162 self.clientoptions = string.replace(self.clientoptions, "async", "lasync")
2164 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s%s %s %s" % \
2165 (self.vosc.get_name(), mdc_name, self.clientoptions, config.config, self.path)
2166 run("mkdir", self.path)
2171 panic("mount failed:", self.path, ":", string.join(val))
# cleanup(): umount (forced with --force), then tear down the stack.
2174 self.info(self.path, self.mds_uuid,self.obd_uuid)
2176 if config.record or config.lctl_dump:
2177 lctl.del_mount_option(local_node_name)
2179 if fs_is_mounted(self.path):
2181 (rc, out) = run("umount", "-f", self.path)
2183 (rc, out) = run("umount", self.path)
2185 raise CommandError('umount', out, rc)
2187 if fs_is_mounted(self.path):
2188 panic("fs is still mounted:", self.path)
2193 self.mgmtcli.cleanup()
2195 def load_module(self):
2197 self.mgmtcli.load_module()
2198 self.vosc.load_module()
2199 Module.load_module(self)
2201 def cleanup_module(self):
2202 Module.cleanup_module(self)
2203 self.vosc.cleanup_module()
2205 self.mgmtcli.cleanup_module()
2208 # ============================================================
2209 # misc query functions
# Collect the Network objects for every network interface of the node that
# hosts the given OSD.  Defined as a method on the config-DB class (self is
# the DB).  NOTE(review): elided listing — guard and return lines missing.
2211 def get_ost_net(self, osd_uuid):
2215 osd = self.lookup(osd_uuid)
2216 node_uuid = osd.get_first_ref('node')
2217 node = self.lookup(node_uuid)
2219 panic("unable to find node for osd_uuid:", osd_uuid,
2220 " node_ref:", node_uuid)
2221 for net_uuid in node.get_networks():
2222 db = node.lookup(net_uuid)
2223 srv_list.append(Network(db))
2227 # the order of initialization is based on level.
# Map a service class to its startup level (network first, mountpoints
# last); levels outside --minlevel/--maxlevel are filtered out.
# NOTE(review): elided listing — the per-branch level assignments are missing.
2228 def getServiceLevel(self):
2229 type = self.get_class()
2231 if type in ('network',):
2233 elif type in ('routetbl',):
2235 elif type in ('ldlm',):
2237 elif type in ('mgmt',):
2239 elif type in ('osd', 'cobd'):
2241 elif type in ('mdsdev',):
2243 elif type in ('mountpoint', 'echoclient'):
2246 panic("Unknown type: ", type)
2248 if ret < config.minlevel or ret > config.maxlevel:
2253 # return list of services in a profile. list is a list of tuples
2254 # [(level, db_object),]
# NOTE(review): elided listing — list initialization/sort/return lines are
# missing below.
2255 def getServices(self):
2257 for ref_class, ref_uuid in self.get_all_refs():
2258 servdb = self.lookup(ref_uuid)
2260 level = getServiceLevel(servdb)
2262 list.append((level, servdb))
2264 panic('service not found: ' + ref_uuid)
2270 ############################################################
2272 # FIXME: clean this mess up!
2274 # OSC is no longer in the xml, so we have to fake it.
2275 # this is getting ugly and begging for another refactoring
# Build an OSC for an OST config entry (OSC entries no longer exist in the
# XML).  NOTE(review): the return line (2278) is elided from this listing.
2276 def get_osc(ost_db, uuid, fs_name):
2277 osc = OSC(ost_db, uuid, fs_name)
# Build an MDC for the MDS referenced by mds_uuid.
# NOTE(review): the return line is elided from this listing.
2280 def get_mdc(db, uuid, fs_name, mds_uuid):
2281 mds_db = db.lookup(mds_uuid);
2283 panic("no mds:", mds_uuid)
2284 mdc = MDC(mds_db, uuid, fs_name)
2287 ############################################################
2288 # routing ("rooting")
2290 # list of (nettype, cluster_id, nid)
# Record this node's (nettype, cluster_id, nid) tuples in the global
# local_clusters list, and create an AcceptorHandler per listening port.
# NOTE(review): elided listing — interior lines are missing.
2293 def find_local_clusters(node_db):
2294 global local_clusters
2295 for netuuid in node_db.get_networks():
2296 net = node_db.lookup(netuuid)
2298 debug("add_local", netuuid)
2299 local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
2301 if not acceptors.has_key(srv.port):
2302 acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type)
2304 # This node is a gateway.
# NOTE(review): node_is_router()'s body (lines 2307-2308) is elided.
2306 def node_is_router():
2309 # If there are any routers found in the config, then this will be true
2310 # and all nodes will load kptlrouter.
2312 def node_needs_router():
2313 return needs_router or is_router
2315 # list of (nettype, gw, tgt_cluster_id, lo, hi)
2316 # Currently, these local routes are only added to kptlrouter route
2317 # table if they are needed to connect to a specific server. This
2318 # should be changed so all available routes are loaded, and the
2319 # ptlrouter can make all the decisions.
# Populate the global local_routes list from router nodes reachable on one
# of this node's local clusters.  NOTE(review): elided listing — loop
# headers and guards are missing below.
2322 def find_local_routes(lustre):
2323 """ Scan the lustre config looking for routers. Build list of
2325 global local_routes, needs_router
2327 list = lustre.lookup_class('node')
2329 if router.get_val_int('router', 0):
2331 for (local_type, local_cluster_id, local_nid) in local_clusters:
2333 for netuuid in router.get_networks():
2334 db = router.lookup(netuuid)
# the router qualifies as a gateway only if it has an interface on one of
# our local (nettype, cluster_id) pairs
2335 if (local_type == db.get_val('nettype') and
2336 local_cluster_id == db.get_val('clusterid')):
2337 gw = db.get_val('nid')
2340 debug("find_local_routes: gw is", gw)
2341 for route in router.get_local_routes(local_type, gw):
2342 local_routes.append(route)
2343 debug("find_local_routes:", local_routes)
# Return the first server in srv_list that lives on one of our local
# clusters.  NOTE(review): the return lines are elided from this listing.
2346 def choose_local_server(srv_list):
2347 for srv in srv_list:
2348 if local_cluster(srv.net_type, srv.cluster_id):
# True when (net_type, cluster_id) matches one of this node's clusters.
# NOTE(review): the return lines are elided from this listing.
2351 def local_cluster(net_type, cluster_id):
2352 for cluster in local_clusters:
2353 if net_type == cluster[0] and cluster_id == cluster[1]:
# True when (net_type, cluster_id, nid) names one of this node's own
# interfaces.  NOTE(review): the return lines are elided from this listing.
2357 def local_interface(net_type, cluster_id, nid):
2358 for cluster in local_clusters:
2359 if (net_type == cluster[0] and cluster_id == cluster[1]
2360 and nid == cluster[2]):
# For each server, find local routes whose nid range [r[3], r[4]] contains
# the server nid and whose target cluster matches; returns (srv, route)
# pairs.  NOTE(review): elided listing — result init/return lines missing.
2364 def find_route(srv_list):
2366 frm_type = local_clusters[0][0]
2367 for srv in srv_list:
2368 debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
2369 to_type = srv.net_type
2371 cluster_id = srv.cluster_id
2372 debug ('looking for route to', to_type, to)
2373 for r in local_routes:
2374 debug("find_route: ", r)
2375 if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
2376 result.append((srv, r))
# Resolve the active device uuid for a target: a --select override picks
# the node-specific device, otherwise the 'active' reference is used.
# NOTE(review): the branch keywords and return are elided from this listing.
2379 def get_active_target(db):
2380 target_uuid = db.getUUID()
2381 target_name = db.getName()
2382 node_name = get_select(target_name)
2384 tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
2386 tgt_dev_uuid = db.get_first_ref('active')
# Find the Network whose nid_uuid matches, scanning all 'network' entries.
# NOTE(review): the Network construction and return lines are elided.
2389 def get_server_by_nid_uuid(db, nid_uuid):
2390 for n in db.lookup_class("network"):
2392 if net.nid_uuid == nid_uuid:
2396 ############################################################
# Factory: instantiate the Module subclass matching the config entry's
# class (network/routetbl/lov/cobd/mdsdev/mountpoint/echoclient/mgmt...).
# NOTE(review): the def line and most constructor calls are elided from
# this listing.
2400 type = db.get_class()
2401 debug('Service:', type, db.getName(), db.getUUID())
2406 n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
2407 elif type == 'network':
2409 elif type == 'routetbl':
2413 elif type == 'cobd':
2415 elif type == 'mdsdev':
2417 elif type == 'mountpoint':
2419 elif type == 'echoclient':
2421 elif type == 'mgmt':
2424 panic("unknown service type:", type)
2428 # Prepare the system to run lustre using a particular profile
2429 # in the configuration.
2430 # * load the modules
2431 # * setup networking for the current node
2432 # * make sure partitions are in place and prepared
2433 # * initialize devices with lctl
2434 # Levels are important, and need to be enforced.
# Apply `operation` (doSetup/doCleanup/...) to the services of each profile.
# NOTE(review): the operation(services) call line is elided from this listing.
2435 def for_each_profile(db, prof_list, operation):
2436 for prof_uuid in prof_list:
2437 prof_db = db.lookup(prof_uuid)
2439 panic("profile:", prof_uuid, "not found.")
2440 services = getServices(prof_db)
# Run write_conf() on every mdsdev service (only the MDS records config
# logs).  NOTE(review): loop header and the write_conf call are elided.
2443 def doWriteconf(services):
2447 if s[1].get_class() == 'mdsdev':
2448 n = newService(s[1])
# Prepare each service in level order.
# NOTE(review): loop header and the prepare() call are elided.
2451 def doSetup(services):
2455 n = newService(s[1])
# Load kernel modules for each service.
# NOTE(review): loop header and the load_module() call are elided.
2458 def doModules(services):
2462 n = newService(s[1])
# Clean up services in reverse level order, honoring safe_to_clean().
# NOTE(review): loop header and the cleanup() call are elided.
2465 def doCleanup(services):
2470 n = newService(s[1])
2471 if n.safe_to_clean():
# Unload kernel modules, honoring safe_to_clean_modules().
# NOTE(review): loop header and the cleanup_module() call are elided.
2474 def doUnloadModules(services):
2479 n = newService(s[1])
2480 if n.safe_to_clean_modules():
# Main per-host driver: locate this host's node entry, read its tunables,
# then dispatch on the requested mode (--write_conf, --recover, --cleanup,
# or normal setup).  NOTE(review): elided listing — loop headers, else
# branches and several statements are missing below.
2485 def doHost(lustreDB, hosts):
2486 global is_router, local_node_name
2489 node_db = lustreDB.lookup_name(h, 'node')
2493 panic('No host entry found.')
# per-node tunables from the config; command-line flags can override later
2495 local_node_name = node_db.get_val('name', 0)
2496 is_router = node_db.get_val_int('router', 0)
2497 lustre_upcall = node_db.get_val('lustreUpcall', '')
2498 portals_upcall = node_db.get_val('portalsUpcall', '')
2499 timeout = node_db.get_val_int('timeout', 0)
2500 ptldebug = node_db.get_val('ptldebug', '')
2501 subsystem = node_db.get_val('subsystem', '')
2503 find_local_clusters(node_db)
2505 find_local_routes(lustreDB)
2507 # Two step process: (1) load modules, (2) setup lustre
2508 # if not cleaning, load modules first.
2509 prof_list = node_db.get_refs('profile')
2511 if config.write_conf:
2512 for_each_profile(node_db, prof_list, doModules)
2514 for_each_profile(node_db, prof_list, doWriteconf)
2515 for_each_profile(node_db, prof_list, doUnloadModules)
2518 elif config.recover:
2519 if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
2520 raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
2521 "--client_uuid <UUID> --conn_uuid <UUID>")
2522 doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
2524 elif config.cleanup:
2525 if not mod_loaded('portals'):
2529 # the command line can override this value
2531 # ugly hack, only need to run lctl commands for --dump
2532 if config.lctl_dump or config.record:
2533 for_each_profile(node_db, prof_list, doCleanup)
# push the tunables into the kernel before tearing services down
2536 sys_set_timeout(timeout)
2537 sys_set_ptldebug(ptldebug)
2538 sys_set_subsystem(subsystem)
2539 sys_set_lustre_upcall(lustre_upcall)
2540 sys_set_portals_upcall(portals_upcall)
2542 for_each_profile(node_db, prof_list, doCleanup)
2543 for_each_profile(node_db, prof_list, doUnloadModules)
2547 # ugly hack, only need to run lctl commands for --dump
2548 if config.lctl_dump or config.record:
2549 sys_set_timeout(timeout)
2550 sys_set_lustre_upcall(lustre_upcall)
2551 for_each_profile(node_db, prof_list, doSetup)
# normal setup path: raise socket buffer limits, load modules, configure
# debugging, then bring the services up
2555 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
2556 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
2558 for_each_profile(node_db, prof_list, doModules)
2560 sys_set_debug_path()
2561 sys_set_ptldebug(ptldebug)
2562 sys_set_subsystem(subsystem)
2563 script = config.gdb_script
2564 run(lctl.lctl, ' modules >', script)
2566 log ("The GDB module script is in", script)
2567 # pause, so user has time to break and
2570 sys_set_timeout(timeout)
2571 sys_set_lustre_upcall(lustre_upcall)
2572 sys_set_portals_upcall(portals_upcall)
2574 for_each_profile(node_db, prof_list, doSetup)
# Failover recovery: find the (possibly new) active device for the target,
# disconnect the old connection, and point the client at the new nid.
# NOTE(review): elided listing — guards and try: lines are missing below.
2577 def doRecovery(lustreDB, lctl, tgt_uuid, client_uuid, nid_uuid):
2578 tgt = lustreDB.lookup(tgt_uuid)
2580 raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
2581 new_uuid = get_active_target(tgt)
2583 raise Lustre.LconfError("doRecovery: no active target found for: " +
2585 net = choose_local_server(get_ost_net(lustreDB, new_uuid))
2587 raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
2589 log("Reconnecting", tgt_uuid, " to ", net.nid_uuid);
2591 oldnet = get_server_by_nid_uuid(lustreDB, nid_uuid)
# best effort: a failed disconnect of the dead connection is only logged
2594 lctl.disconnect(oldnet)
2595 except CommandError, e:
2596 log("recover: disconnect", nid_uuid, "failed: ")
2601 except CommandError, e:
2602 log("recover: connect failed")
2605 lctl.recover(client_uuid, net.nid_uuid)
# Normalize config.lustre/config.portals module search paths, deriving them
# from the lconf binary's location in development mode.
# NOTE(review): elided listing — some guard lines are missing below.
2608 def setupModulePath(cmd, portals_dir = PORTALS_DIR):
2609 base = os.path.dirname(cmd)
2610 if development_mode():
2611 if not config.lustre:
2612 debug('using objdir module paths')
2613 config.lustre = (os.path.join(base, ".."))
2614 # normalize the portals dir, using command line arg if set
2616 portals_dir = config.portals
2617 dir = os.path.join(config.lustre, portals_dir)
2618 config.portals = dir
2619 debug('config.portals', config.portals)
2620 elif config.lustre and config.portals:
2622 # if --lustre and --portals, normalize portals
2623 # can ignore PORTALS_DIR here, since it is probably useless here
2624 config.portals = os.path.join(config.lustre, config.portals)
2625 debug('config.portals B', config.portals)
# Write val to /proc/sys/<path> (the manual equivalent of sysctl(8)).
# NOTE(review): the write/close lines and the --noexec guard are elided.
2627 def sysctl(path, val):
2628 debug("+ sysctl", path, val)
2632 fp = open(os.path.join('/proc/sys', path), 'w')
# Point the portals debug log at the configured --debug_path.
2639 def sys_set_debug_path():
2640 sysctl('portals/debug_path', config.debug_path)
# Warn (but do not fail) if an upcall script is missing or not executable;
# the literal 'DEFAULT' value is always accepted.
# NOTE(review): elided listing — some branch lines are missing.
2642 def validate_upcall(upcall):
2644 if upcall in ('DEFAULT',):
2646 elif os.path.exists(upcall):
2647 if not os.access(upcall, os.X_OK):
2648 print "WARNING upcall script not executable: %s" % upcall
2650 print "WARNING invalid upcall script specified: %s" % upcall
# Install the lustre upcall script; command-line --lustre_upcall (or the
# generic --upcall) overrides the node-config value.
# NOTE(review): elided listing — the elif/guard lines are missing.
2652 def sys_set_lustre_upcall(upcall):
2653 # the command overrides the value in the node config
2654 if config.lustre_upcall:
2655 upcall = config.lustre_upcall
2657 upcall = config.upcall
2659 validate_upcall(upcall)
2660 lctl.set_lustre_upcall(upcall)
# Install the portals upcall script via sysctl; mirrors
# sys_set_lustre_upcall's override precedence.
# NOTE(review): elided listing — the elif/guard lines are missing.
2662 def sys_set_portals_upcall(upcall):
2663 # the command overrides the value in the node config
2664 if config.portals_upcall:
2665 upcall = config.portals_upcall
2667 upcall = config.upcall
2669 validate_upcall(upcall)
2670 sysctl('portals/upcall', upcall)
# Set the lustre timeout; a positive --timeout on the command line wins
# over the node-config value, and non-positive values are ignored.
2672 def sys_set_timeout(timeout):
2673 # the command overrides the value in the node config
2674 if config.timeout and config.timeout > 0:
2675 timeout = config.timeout
2676 if timeout != None and timeout > 0:
2677 lctl.set_timeout(timeout)
# With --single_socket, disable socknal's typed sockets so only one socket
# is used per connection.
2679 def sys_tweak_socknal ():
2680 if config.single_socket:
2681 sysctl("socknal/typed", 0)
# Tune Quadrics Elan interrupt punt loops on whichever proc file exists
# and is writable for this Elan generation.
# NOTE(review): the loop header over procfiles is elided from this listing.
2683 def sys_optimize_elan ():
2684 procfiles = ["/proc/elan/config/eventint_punt_loops",
2685 "/proc/qsnet/elan3/config/eventint_punt_loops",
2686 "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]
2688 if os.access(p, os.W_OK):
2689 run ("echo 1 > " + p)
# Set the portals debug mask.  The value is a symbolic expression evaluated
# against ptldebug_names (defined in the file header); a NameError means an
# unknown flag name.  NOTE(review): guard/try:/except-body lines are elided.
2691 def sys_set_ptldebug(ptldebug):
2693 ptldebug = config.ptldebug
2696 val = eval(ptldebug, ptldebug_names)
2697 val = "0x%x" % (val)
2698 sysctl('portals/debug', val)
2699 except NameError, e:
# Set the portals subsystem debug mask; same symbolic-expression scheme as
# sys_set_ptldebug, evaluated against subsystem_names.
# NOTE(review): guard/try:/except-body lines are elided from this listing.
2702 def sys_set_subsystem(subsystem):
2703 if config.subsystem:
2704 subsystem = config.subsystem
2707 val = eval(subsystem, subsystem_names)
2708 val = "0x%x" % (val)
2709 sysctl('portals/subsystem_debug', val)
2710 except NameError, e:
# Raise a /proc/sys/net value (e.g. rmem_max/wmem_max) to at least `max`.
# NOTE(review): the read/compare lines and close are elided from this
# listing; only the final write is visible.
2713 def sys_set_netmem_max(path, max):
2714 debug("setting", path, "to at least", max)
2722 fp = open(path, 'w')
2723 fp.write('%d\n' %(max))
def sys_make_devices():
    """Create the lustre character device nodes if they do not exist."""
    for dev, cmd in (('/dev/portals', 'mknod /dev/portals c 10 240'),
                     ('/dev/obd',     'mknod /dev/obd c 10 241')):
        if not os.access(dev, os.R_OK):
            run(cmd)
# Add dir to the global PATH, if not already there.
def add_to_path(new_dir):
    """Append new_dir to the process PATH unless it is already present."""
    # str.split works on both Python 1/2 and 3, unlike string.split()
    syspath = os.environ['PATH'].split(':')
    if new_dir in syspath:
        return
    os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
def default_debug_path():
    """Return the default debug-dump path, rooted under /r when a /r
    chroot-style root directory exists."""
    path = '/tmp/lustre-log'
    if os.path.isdir('/r'):
        return '/r' + path
    else:
        return path
def default_gdb_script():
    """Return the default gdb module-script path, rooted under /r when a
    /r chroot-style root directory exists."""
    script = '/tmp/ogdb'
    if os.path.isdir('/r'):
        return '/r' + script
    else:
        return script
DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
# ensure basic elements are in the system path
def sanitise_path():
    """Make sure the standard sbin/bin directories are on PATH, since
    lconf shells out to system tools (mknod, mount, etc.)."""
    for dir in DEFAULT_PATH:
        add_to_path(dir)
# global hack for the --select handling
tgt_select = {}
def init_select(args):
    """Populate the global tgt_select map from --select arguments.

    Each argument is a comma-separated list of service=node pairs,
    e.g. ['service=nodeA,service2=nodeB', 'service3=nodeC'].
    """
    # args = [service=nodeA,service2=nodeB service3=nodeC]
    global tgt_select
    for arg in args:
        # str.split works on both Python 2 and 3, unlike string.split()
        for entry in arg.split(','):
            srv, node = entry.split('=')
            tgt_select[srv] = node
def get_select(srv):
    """Return the node selected for service `srv`, or None if no
    --select override was given for it."""
    # dict.get covers the missing-key case and avoids the
    # Python-2-only has_key() method
    return tgt_select.get(srv)
# Shorthand for the option-kind constants used in the table below.
FLAG = Lustre.Options.FLAG
PARAM = Lustre.Options.PARAM
INTPARAM = Lustre.Options.INTPARAM
PARAMLIST = Lustre.Options.PARAMLIST

# Command-line option table consumed by Lustre.Options in main().
# Each entry is (name[,shortflag], help-text[, kind[, default]]).
# NOTE(review): reconstructed from a garbled source — some FLAG/PARAM
# kinds and defaults were inferred; confirm against the original file.
lconf_options = [
    ('verbose,v', "Print system commands as they are run"),
    ('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM),
    ('config', "Cluster config name used for LDAP query", PARAM),
    ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
    ('node', "Load config for <nodename>", PARAM),
    ('cleanup,d', "Cleans up config. (Shutdown)"),
    ('force,f', "Forced unmounting and/or obd detach during cleanup",
     FLAG, 0),
    ('single_socket', "socknal option: only use one socket instead of bundle",
     FLAG, 0),
    ('failover',"""Used to shut down without saving state.
                   This will allow this node to "give up" a service to a
                   another node for failover purposes. This will not
                   be a clean shutdown.""",
     FLAG, 0),
    ('gdb', """Prints message after creating gdb module script
                    and sleeps for 5 seconds."""),
    ('noexec,n', """Prints the commands and steps that will be run for a
                    config without executing them. This can used to check if a
                    config file is doing what it should be doing"""),
    ('nomod', "Skip load/unload module step."),
    ('nosetup', "Skip device setup/cleanup step."),
    ('reformat', "Reformat all devices (without question)"),
    ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
    ('mountfsoptions', "Additional options for mount fs command line", PARAM),
    ('clientoptions', "Additional options for Lustre", PARAM),
    ('dump', "Dump the kernel debug log to file before portals is unloaded",
     PARAM),
    ('write_conf', "Save all the client config information on mds."),
    ('record', "Write config information on mds."),
    ('record_log', "Name of config record log.", PARAM),
    ('record_device', "MDS device name that will record the config commands",
     PARAM),
    ('minlevel', "Minimum level of services to configure/cleanup",
     INTPARAM, 0),
    ('maxlevel', """Maximum level of services to configure/cleanup
                    Levels are aproximatly like:
                            10 - netwrk
                            20 - device, ldlm
                            30 - osd, mdd
                            40 - mds, ost
                            70 - mountpoint, echo_client, osc, mdc, lov""",
     INTPARAM, 100),
    ('lustre', """Base directory of lustre sources. This parameter will
                  cause lconf to load modules from a source tree.""", PARAM),
    ('portals', """Portals source directory.  If this is a relative path,
                   then it is assumed to be relative to lustre. """, PARAM),
    ('timeout', "Set recovery timeout", INTPARAM),
    ('upcall', "Set both portals and lustre upcall script", PARAM),
    ('lustre_upcall', "Set lustre upcall script", PARAM),
    ('portals_upcall', "Set portals upcall script", PARAM),
    ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
    ('ptldebug', "Set the portals debug level",  PARAM),
    ('subsystem', "Set the portals debug subsystem",  PARAM),
    ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
    ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
    # Client recovery options
    ('recover', "Recover a device"),
    ('group', "The group of devices to configure or cleanup", PARAM),
    ('tgt_uuid', "The failed target (required for recovery)", PARAM),
    ('client_uuid', "The failed client (required for recovery)", PARAM),
    ('conn_uuid', "The failed connection (required for recovery)", PARAM),
    ('inactive', """The name of an inactive service, to be ignored during
                    mounting (currently OST-only).  Can be repeated.""",
     PARAMLIST),
    ]
2852 global lctl, config, toplustreDB, CONFIG_FILE
2854 # in the upcall this is set to SIG_IGN
2855 signal.signal(signal.SIGCHLD, signal.SIG_DFL)
2857 cl = Lustre.Options("lconf", "config.xml", lconf_options)
2859 config, args = cl.parse(sys.argv[1:])
2860 except Lustre.OptionError, e:
2864 setupModulePath(sys.argv[0])
2866 host = socket.gethostname()
2868 # the PRNG is normally seeded with time(), which is not so good for starting
2869 # time-synchronized clusters
2870 input = open('/dev/urandom', 'r')
2872 print 'Unable to open /dev/urandom!'
2874 seed = input.read(32)
2880 init_select(config.select)
2883 # allow config to be fetched via HTTP, but only with python2
2884 if sys.version[0] != '1' and args[0].startswith('http://'):
2887 config_file = urllib2.urlopen(args[0])
2888 except (urllib2.URLError, socket.error), err:
2889 if hasattr(err, 'args'):
2891 print "Could not access '%s': %s" %(args[0], err)
2893 elif not os.access(args[0], os.R_OK):
2894 print 'File not found or readable:', args[0]
2898 config_file = open(args[0], 'r')
2900 dom = xml.dom.minidom.parse(config_file)
2902 panic("%s does not appear to be a config file." % (args[0]))
2903 sys.exit(1) # make sure to die here, even in debug mode.
2905 CONFIG_FILE = args[0]
2906 lustreDB = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
2907 if not config.config:
2908 config.config = os.path.basename(args[0])# use full path?
2909 if config.config[-4:] == '.xml':
2910 config.config = config.config[:-4]
2911 elif config.ldapurl:
2912 if not config.config:
2913 panic("--ldapurl requires --config name")
2914 dn = "config=%s,fs=lustre" % (config.config)
2915 lustreDB = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
2916 elif config.ptldebug or config.subsystem:
2917 sys_set_ptldebug(None)
2918 sys_set_subsystem(None)
2921 print 'Missing config file or ldap URL.'
2922 print 'see lconf --help for command summary'
2925 toplustreDB = lustreDB
2927 ver = lustreDB.get_version()
2929 panic("No version found in config data, please recreate.")
2930 if ver != Lustre.CONFIG_VERSION:
2931 panic("Config version", ver, "does not match lconf version",
2932 Lustre.CONFIG_VERSION)
2936 node_list.append(config.node)
2939 node_list.append(host)
2940 node_list.append('localhost')
2942 debug("configuring for host: ", node_list)
2945 config.debug_path = config.debug_path + '-' + host
2946 config.gdb_script = config.gdb_script + '-' + host
2948 lctl = LCTLInterface('lctl')
2950 if config.lctl_dump:
2951 lctl.use_save_file(config.lctl_dump)
2954 if not (config.record_device and config.record_log):
2955 panic("When recording, both --record_log and --record_device must be specified.")
2956 lctl.clear_log(config.record_device, config.record_log)
2957 lctl.record(config.record_device, config.record_log)
2959 doHost(lustreDB, node_list)
2964 if __name__ == "__main__":
2967 except Lustre.LconfError, e:
2969 # traceback.print_exc(file=sys.stdout)
2971 except CommandError, e:
2975 if first_cleanup_error:
2976 sys.exit(first_cleanup_error)