3 # Copyright (C) 2002-2003 Cluster File Systems, Inc.
4 # Authors: Robert Read <rread@clusterfs.com>
5 # Mike Shaver <shaver@clusterfs.com>
6 # This file is part of Lustre, http://www.lustre.org.
8 # Lustre is free software; you can redistribute it and/or
9 # modify it under the terms of version 2 of the GNU General Public
10 # License as published by the Free Software Foundation.
12 # Lustre is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Lustre; if not, write to the Free Software
19 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 # lconf - lustre configuration tool
23 # lconf is the main driver script for starting and stopping
24 # lustre filesystem services.
26 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import sys, getopt, types
29 import string, os, stat, popen2, socket, time, random, fcntl, select
30 import re, exceptions, signal, traceback
31 import xml.dom.minidom
33 if sys.version[0] == '1':
34 from FCNTL import F_GETFL, F_SETFL
36 from fcntl import F_GETFL, F_SETFL
38 PYMOD_DIR = "/usr/lib/lustre/python"
40 def development_mode():
41 base = os.path.dirname(sys.argv[0])
42 if os.access(base+"/Makefile.am", os.R_OK):
46 if not development_mode():
47 sys.path.append(PYMOD_DIR)
53 DEFAULT_TCPBUF = 8388608
56 # Maximum number of devices to search for.
57 # (the /dev/loop* nodes need to be created beforehand)
58 MAX_LOOP_DEVICES = 256
59 PORTALS_DIR = 'portals'
61 # Needed to call lconf --record
64 # Please keep these in sync with the values in portals/kp30.h
76 "warning" : (1 << 10),
80 "portals" : (1 << 14),
82 "dlmtrace" : (1 << 16),
86 "rpctrace" : (1 << 20),
87 "vfstrace" : (1 << 21),
92 "undefined" : (1 << 0),
102 "portals" : (1 << 10),
103 "socknal" : (1 << 11),
104 "qswnal" : (1 << 12),
105 "pinger" : (1 << 13),
106 "filter" : (1 << 14),
112 "ptlrouter" : (1 << 20),
first_cleanup_error = 0

def cleanup_error(rc):
    """Latch the first non-zero cleanup status; later errors are ignored."""
    global first_cleanup_error
    if first_cleanup_error:
        return
    first_cleanup_error = rc
124 # ============================================================
125 # debugging and error funcs
127 def fixme(msg = "this feature"):
128 raise Lustre.LconfError, msg + ' not implmemented yet.'
131 msg = string.join(map(str,args))
132 if not config.noexec:
133 raise Lustre.LconfError(msg)
138 msg = string.join(map(str,args))
143 print string.strip(s)
147 msg = string.join(map(str,args))
150 # ack, python's builtin int() does not support '0x123' syntax.
151 # eval can do it, although what a hack!
155 return eval(s, {}, {})
158 except SyntaxError, e:
159 raise ValueError("not a number")
161 raise ValueError("not a number")
163 # ============================================================
164 # locally defined exceptions
165 class CommandError (exceptions.Exception):
166 def __init__(self, cmd_name, cmd_err, rc=None):
167 self.cmd_name = cmd_name
168 self.cmd_err = cmd_err
173 if type(self.cmd_err) == types.StringType:
175 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
177 print "! %s: %s" % (self.cmd_name, self.cmd_err)
178 elif type(self.cmd_err) == types.ListType:
180 print "! %s (error %d):" % (self.cmd_name, self.rc)
182 print "! %s:" % (self.cmd_name)
183 for s in self.cmd_err:
184 print "> %s" %(string.strip(s))
189 # ============================================================
190 # handle daemons, like the acceptor
192 """ Manage starting and stopping a daemon. Assumes daemon manages
193 its own pid file. """
195 def __init__(self, cmd):
201 log(self.command, "already running.")
203 self.path = find_prog(self.command)
205 panic(self.command, "not found.")
206 ret, out = runcmd(self.path +' '+ self.command_line())
208 raise CommandError(self.path, out, ret)
212 pid = self.read_pidfile()
214 log ("killing process", pid)
216 #time.sleep(1) # let daemon die
218 log("unable to kill", self.command, e)
220 log("unable to kill", self.command)
223 pid = self.read_pidfile()
233 def read_pidfile(self):
235 fp = open(self.pidfile(), 'r')
242 def clean_pidfile(self):
243 """ Remove a stale pidfile """
244 log("removing stale pidfile:", self.pidfile())
246 os.unlink(self.pidfile())
248 log(self.pidfile(), e)
250 class AcceptorHandler(DaemonHandler):
251 def __init__(self, port, net_type, send_mem, recv_mem, irq_aff):
252 DaemonHandler.__init__(self, "acceptor")
255 self.send_mem = send_mem
256 self.recv_mem = recv_mem
259 self.flags = self.flags + ' -i'
262 return "/var/run/%s-%d.pid" % (self.command, self.port)
def command_line(self):
    """Assemble the acceptor daemon's command-line argument string."""
    args = ('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port)
    return string.join(map(str, args))
269 # start the acceptors
271 if config.lctl_dump or config.record:
273 for port in acceptors.keys():
274 daemon = acceptors[port]
275 if not daemon.running():
278 def run_one_acceptor(port):
279 if config.lctl_dump or config.record:
281 if acceptors.has_key(port):
282 daemon = acceptors[port]
283 if not daemon.running():
286 panic("run_one_acceptor: No acceptor defined for port:", port)
288 def stop_acceptor(port):
289 if acceptors.has_key(port):
290 daemon = acceptors[port]
295 # ============================================================
296 # handle lctl interface
299 Manage communication with lctl
302 def __init__(self, cmd):
304 Initialize close by finding the lctl binary.
306 self.lctl = find_prog(cmd)
308 self.record_device = ''
311 debug('! lctl not found')
314 raise CommandError('lctl', "unable to find lctl binary.")
def use_save_file(self, file):
    """Arrange for subsequent lctl command streams to be dumped to 'file'."""
    self.save_file = file
def record(self, dev_name, logname):
    """Start recording the config log 'logname' on device 'dev_name'."""
    self.record_device = dev_name
    self.record_log = logname
    log("Recording log", logname, "on", dev_name)
def end_record(self):
    """Finish the current record session and forget the record target."""
    dev, logname = self.record_device, self.record_log
    log("End recording log", logname, "on", dev)
    self.record_device = None
    self.record_log = None
def set_nonblock(self, fd):
    """Switch 'fd' into non-blocking mode, preserving its other flags."""
    flags = fcntl.fcntl(fd, F_GETFL)
    fcntl.fcntl(fd, F_SETFL, flags | os.O_NDELAY)
336 the cmds are written to stdin of lctl
337 lctl doesn't return errors when run in script mode, so
339 should modify command line to accept multiple commands, or
340 create complex command line options
344 cmds = '\n dump ' + self.save_file + '\n' + cmds
345 elif self.record_device:
349 %s""" % (self.record_device, self.record_log, cmds)
351 debug("+", cmd_line, cmds)
352 if config.noexec: return (0, [])
354 child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
355 child.tochild.write(cmds + "\n")
356 child.tochild.close()
357 # print "LCTL:", cmds
359 # From "Python Cookbook" from O'Reilly
360 outfile = child.fromchild
361 outfd = outfile.fileno()
362 self.set_nonblock(outfd)
363 errfile = child.childerr
364 errfd = errfile.fileno()
365 self.set_nonblock(errfd)
367 outdata = errdata = ''
370 ready = select.select([outfd,errfd],[],[]) # Wait for input
371 if outfd in ready[0]:
372 outchunk = outfile.read()
373 if outchunk == '': outeof = 1
374 outdata = outdata + outchunk
375 if errfd in ready[0]:
376 errchunk = errfile.read()
377 if errchunk == '': erreof = 1
378 errdata = errdata + errchunk
379 if outeof and erreof: break
380 # end of "borrowed" code
383 if os.WIFEXITED(ret):
384 rc = os.WEXITSTATUS(ret)
387 if rc or len(errdata):
388 raise CommandError(self.lctl, errdata, rc)
391 def runcmd(self, *args):
393 run lctl using the command line
395 cmd = string.join(map(str,args))
396 debug("+", self.lctl, cmd)
397 rc, out = run(self.lctl, cmd)
399 raise CommandError(self.lctl, out, rc)
403 def clear_log(self, dev, log):
404 """ clear an existing log """
409 quit """ % (dev, log)
412 def network(self, net, nid):
417 quit """ % (net, nid)
420 # create a new connection
421 def add_uuid(self, net_type, uuid, nid):
422 cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type)
425 def add_autoconn(self, net_type, send_mem, recv_mem, nid, hostaddr,
427 if net_type in ('tcp',) and not config.lctl_dump:
432 add_autoconn %s %s %d %s
436 nid, hostaddr, port, flags )
439 def connect(self, srv):
440 self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
441 if srv.net_type in ('tcp',) and not config.lctl_dump:
445 self.add_autoconn(srv.net_type, srv.send_mem, srv.recv_mem,
446 srv.nid, srv.hostaddr, srv.port, flags)
449 def recover(self, dev_name, new_conn):
452 recover %s""" %(dev_name, new_conn)
455 # add a route to a range
456 def add_route(self, net, gw, lo, hi):
464 except CommandError, e:
468 def del_route(self, net, gw, lo, hi):
473 quit """ % (net, gw, lo, hi)
476 # add a route to a host
477 def add_route_host(self, net, uuid, gw, tgt):
478 self.add_uuid(net, uuid, tgt)
486 except CommandError, e:
490 # add a route to a range
491 def del_route_host(self, net, uuid, gw, tgt):
497 quit """ % (net, gw, tgt)
501 def del_autoconn(self, net_type, nid, hostaddr):
502 if net_type in ('tcp',) and not config.lctl_dump:
511 # disconnect one connection
512 def disconnect(self, srv):
513 self.del_uuid(srv.nid_uuid)
514 if srv.net_type in ('tcp',) and not config.lctl_dump:
515 self.del_autoconn(srv.net_type, srv.nid, srv.hostaddr)
517 def del_uuid(self, uuid):
525 def disconnectAll(self, net):
533 def attach(self, type, name, uuid):
536 quit""" % (type, name, uuid)
539 def setup(self, name, setup = ""):
543 quit""" % (name, setup)
547 # create a new device with lctl
548 def newdev(self, type, name, uuid, setup = ""):
549 self.attach(type, name, uuid);
551 self.setup(name, setup)
552 except CommandError, e:
553 self.cleanup(name, uuid, 0)
558 def cleanup(self, name, uuid, force, failover = 0):
559 if failover: force = 1
565 quit""" % (name, ('', 'force')[force],
566 ('', 'failover')[failover])
570 def lov_setup(self, name, uuid, desc_uuid, mdsuuid, stripe_cnt,
571 stripe_sz, stripe_off,
575 lov_setup %s %d %d %d %s %s
576 quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off,
580 def lmv_setup(self, name, uuid, desc_uuid, devlist):
584 quit""" % (name, uuid, desc_uuid, devlist)
587 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off,
591 lov_setconfig %s %d %d %d %s %s
592 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
596 def dump(self, dump_file):
599 quit""" % (dump_file)
602 # get list of devices
603 def device_list(self):
604 devices = '/proc/fs/lustre/devices'
606 if os.access(devices, os.R_OK):
608 fp = open(devices, 'r')
616 def lustre_version(self):
617 rc, out = self.runcmd('version')
621 def mount_option(self, profile, osc, mdc):
623 mount_option %s %s %s
624 quit""" % (profile, osc, mdc)
627 # delete mount options
628 def del_mount_option(self, profile):
634 def set_timeout(self, timeout):
640 # delete mount options
641 def set_lustre_upcall(self, upcall):
646 # ============================================================
647 # Various system-level functions
648 # (ideally moved to their own module)
650 # Run a command and return the output and status.
651 # stderr is sent to /dev/null, could use popen3 to
652 # save it if necessary
655 if config.noexec: return (0, [])
656 f = os.popen(cmd + ' 2>&1')
666 cmd = string.join(map(str,args))
669 # Run a command in the background.
670 def run_daemon(*args):
671 cmd = string.join(map(str,args))
673 if config.noexec: return 0
674 f = os.popen(cmd + ' 2>&1')
682 # Determine full path to use for an external command
683 # searches dirname(argv[0]) first, then PATH
685 syspath = string.split(os.environ['PATH'], ':')
686 cmdpath = os.path.dirname(sys.argv[0])
687 syspath.insert(0, cmdpath);
689 syspath.insert(0, os.path.join(config.portals, 'utils/'))
691 prog = os.path.join(d,cmd)
692 if os.access(prog, os.X_OK):
696 # Recursively look for file starting at base dir
697 def do_find_file(base, mod):
698 fullname = os.path.join(base, mod)
699 if os.access(fullname, os.R_OK):
701 for d in os.listdir(base):
702 dir = os.path.join(base,d)
703 if os.path.isdir(dir):
704 module = do_find_file(dir, mod)
708 def find_module(src_dir, dev_dir, modname):
709 mod = '%s.o' % (modname)
710 module = src_dir +'/'+ dev_dir +'/'+ mod
712 if os.access(module, os.R_OK):
718 # is the path a block device?
725 return stat.S_ISBLK(s[stat.ST_MODE])
727 # build fs according to type
729 def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
735 panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
737 # devsize is in 1k, and fs block count is in 4k
738 block_cnt = devsize/4
740 if fstype in ('ext3', 'extN'):
741 # ext3 journal size is in megabytes
744 if not is_block(dev):
745 ret, out = runcmd("ls -l %s" %dev)
746 devsize = int(string.split(out[0])[4]) / 1024
748 ret, out = runcmd("sfdisk -s %s" %dev)
749 devsize = int(out[0])
750 if devsize > 1024 * 1024:
751 jsize = ((devsize / 102400) * 4)
754 if jsize: jopt = "-J size=%d" %(jsize,)
755 if isize: iopt = "-I %d" %(isize,)
756 mkfs = 'mkfs.ext2 -j -b 4096 '
757 if not isblock or config.force:
759 elif fstype == 'reiserfs':
760 # reiserfs journal size is in blocks
761 if jsize: jopt = "--journal_size %d" %(jsize,)
762 mkfs = 'mkreiserfs -ff'
764 panic('unsupported fs type: ', fstype)
766 if config.mkfsoptions != None:
767 mkfs = mkfs + ' ' + config.mkfsoptions
768 if mkfsoptions != None:
769 mkfs = mkfs + ' ' + mkfsoptions
770 (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt)
772 panic("Unable to build fs:", dev, string.join(out))
773 # enable hash tree indexing on fsswe
774 if fstype in ('ext3', 'extN'):
775 htree = 'echo "feature FEATURE_C5" | debugfs -w'
776 (ret, out) = run (htree, dev)
778 panic("Unable to enable htree:", dev)
780 # some systems use /dev/loopN, some /dev/loop/N
784 if not os.access(loop + str(0), os.R_OK):
786 if not os.access(loop + str(0), os.R_OK):
787 panic ("can't access loop devices")
790 # find loop device assigned to thefile
793 for n in xrange(0, MAX_LOOP_DEVICES):
795 if os.access(dev, os.R_OK):
796 (stat, out) = run('losetup', dev)
797 if out and stat == 0:
798 m = re.search(r'\((.*)\)', out[0])
799 if m and file == m.group(1):
805 # create file if necessary and assign the first free loop device
806 def init_loop(file, size, fstype, journal_size, inode_size, mkfsoptions, reformat):
807 dev = find_loop(file)
809 print 'WARNING file:', file, 'already mapped to', dev
811 if reformat or not os.access(file, os.R_OK | os.W_OK):
813 panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (file,size))
814 (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size,
817 panic("Unable to create backing store:", file)
818 mkfs(file, size, fstype, journal_size, inode_size, mkfsoptions, isblock=0)
821 # find next free loop
822 for n in xrange(0, MAX_LOOP_DEVICES):
824 if os.access(dev, os.R_OK):
825 (stat, out) = run('losetup', dev)
827 run('losetup', dev, file)
830 print "out of loop devices"
832 print "out of loop devices"
835 # undo loop assignment
836 def clean_loop(file):
837 dev = find_loop(file)
839 ret, out = run('losetup -d', dev)
841 log('unable to clean loop device:', dev, 'for file:', file)
844 # determine if dev is formatted as a <fstype> filesystem
845 def need_format(fstype, dev):
846 # FIXME don't know how to implement this
849 # initialize a block device if needed
850 def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
851 inode_size, mkfsoptions):
852 if config.noexec: return dev
853 if not is_block(dev):
854 dev = init_loop(dev, size, fstype, journal_size, inode_size,
855 mkfsoptions, reformat)
856 elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
857 mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
860 # panic("device:", dev,
861 # "not prepared, and autoformat is not set.\n",
862 # "Rerun with --reformat option to format ALL filesystems")
867 """lookup IP address for an interface"""
868 rc, out = run("/sbin/ifconfig", iface)
871 addr = string.split(out[1])[1]
872 ip = string.split(addr, ':')[1]
875 def sys_get_elan_position_file():
876 procfiles = ["/proc/elan/device0/position",
877 "/proc/qsnet/elan4/device0/position",
878 "/proc/qsnet/elan3/device0/position"]
880 if os.access(p, os.R_OK):
884 def sys_get_local_nid(net_type, wildcard, cluster_id):
885 """Return the local nid."""
887 if sys_get_elan_position_file():
888 local = sys_get_local_address('elan', '*', cluster_id)
890 local = sys_get_local_address(net_type, wildcard, cluster_id)
893 def sys_get_local_address(net_type, wildcard, cluster_id):
894 """Return the local address for the network type."""
896 if net_type in ('tcp',):
898 iface, star = string.split(wildcard, ':')
899 local = if2addr(iface)
901 panic ("unable to determine ip for:", wildcard)
903 host = socket.gethostname()
904 local = socket.gethostbyname(host)
905 elif net_type == 'elan':
906 # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()'
907 f = sys_get_elan_position_file()
909 panic ("unable to determine local Elan ID")
912 lines = fp.readlines()
920 nid = my_int(cluster_id) + my_int(elan_id)
922 except ValueError, e:
926 elif net_type == 'gm':
927 fixme("automatic local address for GM")
928 elif net_type == 'scimac':
929 scinode="/opt/scali/sbin/scinode"
930 if os.path.exists(scinode):
931 (rc,local) = run(scinode)
933 panic (scinode, " not found on node with scimac networking")
935 panic (scinode, " failed")
936 local=string.rstrip(local[0])
940 def mod_loaded(modname):
941 """Check if a module is already loaded. Look in /proc/modules for it."""
943 fp = open('/proc/modules')
944 lines = fp.readlines()
946 # please forgive my tired fingers for this one
947 ret = filter(lambda word, mod=modname: word == mod,
948 map(lambda line: string.split(line)[0], lines))
953 # XXX: instead of device_list, ask for $name and see what we get
954 def is_prepared(name):
955 """Return true if a device exists for the name"""
958 if (config.noexec or config.record) and config.cleanup:
961 # expect this format:
962 # 1 UP ldlm ldlm ldlm_UUID 2
963 out = lctl.device_list()
965 if name == string.split(s)[3]:
967 except CommandError, e:
971 def is_network_prepared():
972 """If the any device exists, then assume that all networking
973 has been configured"""
974 out = lctl.device_list()
977 def fs_is_mounted(path):
978 """Return true if path is a mounted lustre filesystem"""
980 fp = open('/proc/mounts')
981 lines = fp.readlines()
985 if a[1] == path and a[2] == 'lustre_lite':
993 """Manage kernel modules"""
def __init__(self, lustre_dir, portals_dir):
    """Remember the lustre/portals source roots; start with no modules queued."""
    self.lustre_dir = lustre_dir
    self.portals_dir = portals_dir
    self.kmodule_list = []
def add_portals_module(self, dev_dir, modname):
    """Queue a portals-tree kernel module for loading."""
    entry = (self.portals_dir, dev_dir, modname)
    self.kmodule_list.append(entry)
def add_lustre_module(self, dev_dir, modname):
    """Queue a lustre-tree kernel module for loading."""
    entry = (self.lustre_dir, dev_dir, modname)
    self.kmodule_list.append(entry)
1007 def load_module(self):
1008 """Load all the modules in the list in the order they appear."""
1009 for src_dir, dev_dir, mod in self.kmodule_list:
1010 if mod_loaded(mod) and not config.noexec:
1012 log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
1014 module = find_module(src_dir, dev_dir, mod)
1016 panic('module not found:', mod)
1017 (rc, out) = run('/sbin/insmod', module)
1019 raise CommandError('insmod', out, rc)
1021 (rc, out) = run('/sbin/modprobe', mod)
1023 raise CommandError('modprobe', out, rc)
1025 def cleanup_module(self):
1026 """Unload the modules in the list in reverse order."""
1027 rev = self.kmodule_list
1029 for src_dir, dev_dir, mod in rev:
1030 if not mod_loaded(mod) and not config.noexec:
1033 if mod == 'portals' and config.dump:
1034 lctl.dump(config.dump)
1035 log('unloading module:', mod)
1036 (rc, out) = run('/sbin/rmmod', mod)
1038 log('! unable to unload module:', mod)
1041 # ============================================================
1042 # Classes to prepare and cleanup the various objects
1045 """ Base class for the rest of the modules. The default cleanup method is
1046 defined here, as well as some utilitiy funcs.
1048 def __init__(self, module_name, db):
1050 self.module_name = module_name
1051 self.name = self.db.getName()
1052 self.uuid = self.db.getUUID()
1055 self.kmod = kmod(config.lustre, config.portals)
1057 def info(self, *args):
1058 msg = string.join(map(str,args))
1059 print self.module_name + ":", self.name, self.uuid, msg
1062 """ default cleanup, used for most modules """
1065 lctl.cleanup(self.name, self.uuid, config.force)
1066 except CommandError, e:
1067 log(self.module_name, "cleanup failed: ", self.name)
def add_portals_module(self, dev_dir, modname):
    """Forward a portals module registration to the kmod helper."""
    self.kmod.add_portals_module(dev_dir, modname)
def add_lustre_module(self, dev_dir, modname):
    """Forward a lustre module registration to the kmod helper."""
    self.kmod.add_lustre_module(dev_dir, modname)
def load_module(self):
    """Delegate loading of all queued modules to the kmod helper."""
    self.kmod.load_module()
def cleanup_module(self):
    """Unload this module's kernel modules, but only when cleanup is safe."""
    if not self.safe_to_clean():
        return
    self.kmod.cleanup_module()
1088 def safe_to_clean(self):
def safe_to_clean_modules(self):
    """By default, module-unload safety mirrors device-cleanup safety."""
    return self.safe_to_clean()
1094 class Network(Module):
1095 def __init__(self,db):
1096 Module.__init__(self, 'NETWORK', db)
1097 self.net_type = self.db.get_val('nettype')
1098 self.nid = self.db.get_val('nid', '*')
1099 self.cluster_id = self.db.get_val('clusterid', "0")
1100 self.port = self.db.get_val_int('port', 0)
1101 self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
1102 self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
1103 self.irq_affinity = self.db.get_val_int('irqaffinity', 0)
1106 self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
1108 panic("unable to set nid for", self.net_type, self.nid, cluster_id)
1109 self.generic_nid = 1
1110 debug("nid:", self.nid)
1112 self.generic_nid = 0
1114 self.nid_uuid = self.nid_to_uuid(self.nid)
1116 self.hostaddr = self.db.get_val('hostaddr', self.nid)
1117 if '*' in self.hostaddr:
1118 self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id)
1119 if not self.hostaddr:
1120 panic("unable to set hostaddr for", self.net_type, self.hostaddr, self.cluster_id)
1121 debug("hostaddr:", self.hostaddr)
1123 self.add_portals_module("libcfs", 'libcfs')
1124 self.add_portals_module("portals", 'portals')
1125 if node_needs_router():
1126 self.add_portals_module("router", 'kptlrouter')
1127 if self.net_type == 'tcp':
1128 self.add_portals_module("knals/socknal", 'ksocknal')
1129 if self.net_type == 'elan':
1130 self.add_portals_module("knals/qswnal", 'kqswnal')
1131 if self.net_type == 'gm':
1132 self.add_portals_module("knals/gmnal", 'kgmnal')
1133 if self.net_type == 'scimac':
1134 self.add_portals_module("knals/scimacnal", 'kscimacnal')
def nid_to_uuid(self, nid):
    """Derive the canonical UUID string used for a network id."""
    return "NID_%s_UUID" % (nid,)
1140 if is_network_prepared():
1142 self.info(self.net_type, self.nid, self.port)
1143 if not (config.record and self.generic_nid):
1144 lctl.network(self.net_type, self.nid)
1145 if self.net_type == 'tcp':
1147 if self.net_type == 'elan':
1149 if self.port and node_is_router():
1150 run_one_acceptor(self.port)
1151 self.connect_peer_gateways()
1153 def connect_peer_gateways(self):
1154 for router in self.db.lookup_class('node'):
1155 if router.get_val_int('router', 0):
1156 for netuuid in router.get_networks():
1157 net = self.db.lookup(netuuid)
1159 if (gw.cluster_id == self.cluster_id and
1160 gw.net_type == self.net_type):
1161 if gw.nid != self.nid:
1164 def disconnect_peer_gateways(self):
1165 for router in self.db.lookup_class('node'):
1166 if router.get_val_int('router', 0):
1167 for netuuid in router.get_networks():
1168 net = self.db.lookup(netuuid)
1170 if (gw.cluster_id == self.cluster_id and
1171 gw.net_type == self.net_type):
1172 if gw.nid != self.nid:
1175 except CommandError, e:
1176 print "disconnect failed: ", self.name
def safe_to_clean(self):
    """The network may be torn down only when no lustre devices remain."""
    return not is_network_prepared()
1184 self.info(self.net_type, self.nid, self.port)
1186 stop_acceptor(self.port)
1187 if node_is_router():
1188 self.disconnect_peer_gateways()
1190 def correct_level(self, level, op=None):
1193 class RouteTable(Module):
1194 def __init__(self,db):
1195 Module.__init__(self, 'ROUTES', db)
1197 def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
1199 # only setup connections for tcp NALs
1201 if not net_type in ('tcp',):
1204 # connect to target if route is to single node and this node is the gw
1205 if lo == hi and local_interface(net_type, gw_cluster_id, gw):
1206 if not local_cluster(net_type, tgt_cluster_id):
1207 panic("target", lo, " not on the local cluster")
1208 srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
1209 # connect to gateway if this node is not the gw
1210 elif (local_cluster(net_type, gw_cluster_id)
1211 and not local_interface(net_type, gw_cluster_id, gw)):
1212 srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
1217 panic("no server for nid", lo)
1220 return Network(srvdb)
1223 if is_network_prepared():
1226 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1227 lctl.add_route(net_type, gw, lo, hi)
1228 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
def safe_to_clean(self):
    """Routes may be removed only when no lustre devices remain."""
    return not is_network_prepared()
1236 if is_network_prepared():
1237 # the network is still being used, don't clean it up
1239 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1240 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1243 lctl.disconnect(srv)
1244 except CommandError, e:
1245 print "disconnect failed: ", self.name
1250 lctl.del_route(net_type, gw, lo, hi)
1251 except CommandError, e:
1252 print "del_route failed: ", self.name
1256 class Management(Module):
def __init__(self, db):
    """Register the management service and queue its prerequisite modules."""
    Module.__init__(self, 'MGMT', db)
    # Load order matters: lvfs and obdclass before ptlrpc, service last.
    for subdir, mod in (('lvfs', 'lvfs'),
                        ('obdclass', 'obdclass'),
                        ('ptlrpc', 'ptlrpc'),
                        ('mgmt', 'mgmt_svc')):
        self.add_lustre_module(subdir, mod)
1265 if is_prepared(self.name):
1268 lctl.newdev("mgmt", self.name, self.uuid)
1270 def safe_to_clean(self):
1274 if is_prepared(self.name):
1275 Module.cleanup(self)
1277 def correct_level(self, level, op=None):
1280 # This is only needed to load the modules; the LDLM device
1281 # is now created automatically.
def __init__(self, db):
    """Queue the core lustre modules; the LDLM device itself is created
    automatically, so this class only exists to get the modules loaded."""
    Module.__init__(self, 'LDLM', db)
    for subdir, mod in (('lvfs', 'lvfs'),
                        ('obdclass', 'obdclass'),
                        ('ptlrpc', 'ptlrpc')):
        self.add_lustre_module(subdir, mod)
1295 def correct_level(self, level, op=None):
1300 def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
1301 Module.__init__(self, 'LOV', db)
1302 if name_override != None:
1303 self.name = "lov_%s" % name_override
1304 self.add_lustre_module('lov', 'lov')
1305 self.mds_uuid = self.db.get_first_ref('mds')
1306 self.stripe_sz = self.db.get_val_int('stripesize', 65536)
1307 self.stripe_off = self.db.get_val_int('stripeoffset', 0)
1308 self.pattern = self.db.get_val_int('stripepattern', 0)
1309 self.devlist = self.db.get_refs('obd')
1310 self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
1312 self.desc_uuid = self.uuid
1313 self.uuid = generate_client_uuid(self.name)
1314 self.fs_name = fs_name
1316 self.config_only = 1
1318 self.config_only = None
1319 mds= self.db.lookup(self.mds_uuid)
1320 self.mds_name = mds.getName()
1321 for obd_uuid in self.devlist:
1322 obd = self.db.lookup(obd_uuid)
1323 osc = get_osc(obd, self.uuid, fs_name)
1325 self.osclist.append(osc)
1327 panic('osc not found:', obd_uuid)
1330 if is_prepared(self.name):
1332 if self.config_only:
1333 panic("Can't prepare config_only LOV ", self.name)
1335 for osc in self.osclist:
1337 # Only ignore connect failures with --force, which
1338 # isn't implemented here yet.
1339 osc.prepare(ignore_connect_failure=0)
1340 except CommandError, e:
1341 print "Error preparing OSC %s\n" % osc.uuid
1343 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
1344 self.stripe_off, self.pattern, self.devlist, self.mds_name)
1345 lctl.lov_setup(self.name, self.uuid,
1346 self.desc_uuid, self.mds_name, self.stripe_cnt,
1347 self.stripe_sz, self.stripe_off, self.pattern,
1348 string.join(self.devlist))
1351 if is_prepared(self.name):
1352 Module.cleanup(self)
1353 if self.config_only:
1354 panic("Can't clean up config_only LOV ", self.name)
1355 for osc in self.osclist:
1358 def load_module(self):
1359 if self.config_only:
1360 panic("Can't load modules for config_only LOV ", self.name)
1361 for osc in self.osclist:
1364 Module.load_module(self)
1366 def cleanup_module(self):
1367 if self.config_only:
1368 panic("Can't cleanup modules for config_only LOV ", self.name)
1369 Module.cleanup_module(self)
1370 for osc in self.osclist:
1371 osc.cleanup_module()
1374 def correct_level(self, level, op=None):
1378 def __init__(self, db, uuid, fs_name, name_override = None):
1379 Module.__init__(self, 'LMV', db)
1380 if name_override != None:
1381 self.name = "lmv_%s" % name_override
1382 self.add_lustre_module('lmv', 'lmv')
1383 self.mds_uuid = self.db.get_first_ref('mds')
1384 mds = self.db.lookup(self.mds_uuid)
1385 self.lmv_name = mds.getName()
1386 self.devlist = self.db.get_refs('mds')
1388 self.desc_uuid = self.uuid
1390 self.fs_name = fs_name
1391 for mds_uuid in self.devlist:
1392 mds = self.db.lookup(mds_uuid)
1394 panic("MDS not found!")
1395 mdc = MDC(mds, self.uuid, fs_name)
1397 self.mdclist.append(mdc)
1399 panic('mdc not found:', mds_uuid)
1402 if is_prepared(self.name):
1404 for mdc in self.mdclist:
1406 # Only ignore connect failures with --force, which
1407 # isn't implemented here yet.
1408 mdc.prepare(ignore_connect_failure=0)
1409 except CommandError, e:
1410 print "Error preparing LMV %s\n" % mdc.uuid
1412 self.info(self.mds_uuid)
1413 lctl.lmv_setup(self.name, self.uuid, self.desc_uuid,
1414 string.join(self.devlist))
1417 for mdc in self.mdclist:
1419 if is_prepared(self.name):
1420 Module.cleanup(self)
1422 def load_module(self):
1423 for mdc in self.mdclist:
1426 Module.load_module(self)
1428 def cleanup_module(self):
1429 Module.cleanup_module(self)
1430 for mds in self.mdclist:
1431 mdc.cleanup_module()
1434 def correct_level(self, level, op=None):
1437 class MDSDEV(Module):
1438 def __init__(self,db):
1439 Module.__init__(self, 'MDSDEV', db)
1440 self.devpath = self.db.get_val('devpath','')
1441 self.size = self.db.get_val_int('devsize', 0)
1442 self.journal_size = self.db.get_val_int('journalsize', 0)
1443 self.fstype = self.db.get_val('fstype', '')
1444 self.nspath = self.db.get_val('nspath', '')
1445 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1446 # overwrite the orignal MDSDEV name and uuid with the MDS name and uuid
1447 target_uuid = self.db.get_first_ref('target')
1448 mds = self.db.lookup(target_uuid)
1449 self.name = mds.getName()
1450 self.filesystem_uuids = mds.get_refs('filesystem')
1452 self.master_mds = ""
1453 if not self.filesystem_uuids:
1454 self.lmv_uuid = self.db.get_first_ref('lmv')
1455 if not self.lmv_uuid:
1456 panic("ALERT: can't find lvm uuid")
1458 self.lmv = self.db.lookup(self.lmv_uuid)
1460 self.filesystem_uuids = self.lmv.get_refs('filesystem')
1461 self.master_mds = self.lmv_uuid
1462 # FIXME: if fstype not set, then determine based on kernel version
1463 self.format = self.db.get_val('autoformat', "no")
1464 if mds.get_val('failover', 0):
1465 self.failover_mds = 'f'
1467 self.failover_mds = 'n'
1468 active_uuid = get_active_target(mds)
1470 panic("No target device found:", target_uuid)
1471 if active_uuid == self.uuid:
1475 if self.active and config.group and config.group != mds.get_val('group'):
1479 self.inode_size = self.db.get_val_int('inodesize', 0)
1480 if self.inode_size == 0:
1481 # find the LOV for this MDS
1482 lovconfig_uuid = mds.get_first_ref('lovconfig')
1483 if not lovconfig_uuid:
1484 if not self.lmv_uuid:
1485 panic("No LOV found for lovconfig ", lovconfig.name)
1486 lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
1487 lovconfig = self.lmv.lookup(lovconfig_uuid)
1488 lov_uuid = lovconfig.get_first_ref('lov')
1490 panic("No LOV found for lovconfig ", lovconfig.name)
1492 lovconfig = mds.lookup(lovconfig_uuid)
1493 lov_uuid = lovconfig.get_first_ref('lov')
1495 panic("No LOV found for lovconfig ", lovconfig.name)
1496 lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
1497 lovconfig = self.lmv.lookup(lovconfig_uuid)
1498 lov_uuid = lovconfig.get_first_ref('lov')
1500 lov = LOV(self.db.lookup(lov_uuid), lov_uuid, 'FS_name', config_only = 1)
1502 # default stripe count controls default inode_size
1503 stripe_count = lov.stripe_cnt
1504 if stripe_count > 77:
1505 self.inode_size = 4096
1506 elif stripe_count > 35:
1507 self.inode_size = 2048
1508 elif stripe_count > 13:
1509 self.inode_size = 1024
1510 elif stripe_count > 3:
1511 self.inode_size = 512
1513 self.inode_size = 256
1515 self.target_dev_uuid = self.uuid
1516 self.uuid = target_uuid
1519 client_uuid = generate_client_uuid(self.name)
1520 self.master = LMV(self.db.lookup(self.lmv_uuid), client_uuid, self.name, self.name)
1521 self.master_mds = self.master.name
1523 self.add_lustre_module('mdc', 'mdc')
1524 self.add_lustre_module('osc', 'osc')
1525 self.add_lustre_module('lov', 'lov')
1526 self.add_lustre_module('lmv', 'lmv')
1527 self.add_lustre_module('mds', 'mds')
1529 self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
1531 def load_module(self):
1533 Module.load_module(self)
1536 if is_prepared(self.name):
1539 debug(self.uuid, "not active")
1542 # run write_conf automatically, if --reformat used
1544 self.info(self.devpath, self.fstype, self.size, self.format)
1548 self.master.prepare()
1549 # never reformat here
1550 blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
1551 self.format, self.journal_size, self.inode_size,
1553 if not is_prepared('MDT'):
1554 lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
1556 lctl.newdev("mds", self.name, self.uuid,
1557 setup ="%s %s %s %s"
1558 %(blkdev, self.fstype, self.name, self.master_mds))
1559 except CommandError, e:
1561 panic("MDS is missing the config log. Need to run " +
1562 "lconf --write_conf.")
1566 def write_conf(self):
1567 if is_prepared(self.name):
1569 self.info(self.devpath, self.fstype, self.format)
1570 blkdev = block_dev(self.devpath, self.size, self.fstype,
1571 config.reformat, self.format, self.journal_size,
1572 self.inode_size, self.mkfsoptions)
1573 lctl.newdev("mds", self.name, self.uuid,
1574 setup ="%s %s" %(blkdev, self.fstype))
1576 # record logs for the MDS lov
1577 for uuid in self.filesystem_uuids:
1578 log("recording clients for filesystem:", uuid)
1579 fs = self.db.lookup(uuid)
1580 obd_uuid = fs.get_first_ref('obd')
1581 client_uuid = generate_client_uuid(self.name)
1582 client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
1585 lctl.clear_log(self.name, self.name)
1586 lctl.record(self.name, self.name)
1588 lctl.mount_option(self.name, client.get_name(), "")
1592 lctl.clear_log(self.name, self.name + '-clean')
1593 lctl.record(self.name, self.name + '-clean')
1595 lctl.del_mount_option(self.name)
1600 # record logs for each client
1602 config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
1604 config_options = CONFIG_FILE
1606 for node_db in self.db.lookup_class('node'):
1607 client_name = node_db.getName()
1608 for prof_uuid in node_db.get_refs('profile'):
1609 prof_db = node_db.lookup(prof_uuid)
1610 # refactor this into a funtion to test "clientness"
1612 for ref_class, ref_uuid in prof_db.get_all_refs():
1613 if ref_class in ('mountpoint','echoclient'):
1614 debug("recording", client_name)
1615 old_noexec = config.noexec
1617 noexec_opt = ('', '-n')
1618 ret, out = run (sys.argv[0],
1619 noexec_opt[old_noexec == 1],
1620 " -v --record --nomod",
1621 "--record_log", client_name,
1622 "--record_device", self.name,
1623 "--node", client_name,
1626 for s in out: log("record> ", string.strip(s))
1627 ret, out = run (sys.argv[0],
1628 noexec_opt[old_noexec == 1],
1629 "--cleanup -v --record --nomod",
1630 "--record_log", client_name + "-clean",
1631 "--record_device", self.name,
1632 "--node", client_name,
1635 for s in out: log("record> ", string.strip(s))
1636 config.noexec = old_noexec
1638 lctl.cleanup(self.name, self.uuid, 0, 0)
1639 except CommandError, e:
1640 log(self.module_name, "cleanup failed: ", self.name)
1643 Module.cleanup(self)
1644 clean_loop(self.devpath)
1646 def msd_remaining(self):
1647 out = lctl.device_list()
1649 if string.split(s)[2] in ('mds',):
1652 def safe_to_clean(self):
1655 def safe_to_clean_modules(self):
1656 return not self.msd_remaining()
1660 debug(self.uuid, "not active")
1663 if is_prepared(self.name):
1665 lctl.cleanup(self.name, self.uuid, config.force,
1667 except CommandError, e:
1668 log(self.module_name, "cleanup failed: ", self.name)
1671 Module.cleanup(self)
1674 self.master.cleanup()
1675 if not self.msd_remaining() and is_prepared('MDT'):
1677 lctl.cleanup("MDT", "MDT_UUID", config.force,
1679 except CommandError, e:
1680 print "cleanup failed: ", self.name
1683 clean_loop(self.devpath)
1685 def correct_level(self, level, op=None):
1686 #if self.master_mds:
1691 def __init__(self, db):
1692 Module.__init__(self, 'OSD', db)
1693 self.osdtype = self.db.get_val('osdtype')
1694 self.devpath = self.db.get_val('devpath', '')
1695 self.size = self.db.get_val_int('devsize', 0)
1696 self.journal_size = self.db.get_val_int('journalsize', 0)
1697 self.inode_size = self.db.get_val_int('inodesize', 0)
1698 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1699 self.fstype = self.db.get_val('fstype', '')
1700 self.nspath = self.db.get_val('nspath', '')
1701 target_uuid = self.db.get_first_ref('target')
1702 ost = self.db.lookup(target_uuid)
1703 self.name = ost.getName()
1704 self.format = self.db.get_val('autoformat', 'yes')
1705 if ost.get_val('failover', 0):
1706 self.failover_ost = 'f'
1708 self.failover_ost = 'n'
1710 active_uuid = get_active_target(ost)
1712 panic("No target device found:", target_uuid)
1713 if active_uuid == self.uuid:
1717 if self.active and config.group and config.group != ost.get_val('group'):
1720 self.target_dev_uuid = self.uuid
1721 self.uuid = target_uuid
1723 self.add_lustre_module('ost', 'ost')
1724 # FIXME: should we default to ext3 here?
1726 self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
1727 self.add_lustre_module(self.osdtype, self.osdtype)
1729 def load_module(self):
1731 Module.load_module(self)
1733 # need to check /proc/mounts and /etc/mtab before
1734 # formatting anything.
1735 # FIXME: check if device is already formatted.
1737 if is_prepared(self.name):
1740 debug(self.uuid, "not active")
1742 self.info(self.osdtype, self.devpath, self.size, self.fstype,
1743 self.format, self.journal_size, self.inode_size)
1745 if self.osdtype == 'obdecho':
1748 blkdev = block_dev(self.devpath, self.size, self.fstype,
1749 config.reformat, self.format, self.journal_size,
1750 self.inode_size, self.mkfsoptions)
1751 lctl.newdev(self.osdtype, self.name, self.uuid,
1752 setup ="%s %s %s" %(blkdev, self.fstype,
1754 if not is_prepared('OSS'):
1755 lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
1757 def osd_remaining(self):
1758 out = lctl.device_list()
1760 if string.split(s)[2] in ('obdfilter', 'obdecho'):
1763 def safe_to_clean(self):
1766 def safe_to_clean_modules(self):
1767 return not self.osd_remaining()
1771 debug(self.uuid, "not active")
1773 if is_prepared(self.name):
1776 lctl.cleanup(self.name, self.uuid, config.force,
1778 except CommandError, e:
1779 log(self.module_name, "cleanup failed: ", self.name)
1782 if not self.osd_remaining() and is_prepared('OSS'):
1784 lctl.cleanup("OSS", "OSS_UUID", config.force,
1786 except CommandError, e:
1787 print "cleanup failed: ", self.name
1790 if not self.osdtype == 'obdecho':
1791 clean_loop(self.devpath)
1793 def correct_level(self, level, op=None):
1796 def mgmt_uuid_for_fs(mtpt_name):
1799 mtpt_db = toplevel.lookup_name(mtpt_name)
1800 fs_uuid = mtpt_db.get_first_ref('filesystem')
1801 fs = toplevel.lookup(fs_uuid)
1804 return fs.get_first_ref('mgmt')
1806 # Generic client module, used by OSC and MDC
1807 class Client(Module):
1808 def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
1810 self.target_name = tgtdb.getName()
1811 self.target_uuid = tgtdb.getUUID()
1814 self.tgt_dev_uuid = get_active_target(tgtdb)
1815 if not self.tgt_dev_uuid:
1816 panic("No target device found for target(1):", self.target_name)
1818 self.kmod = kmod(config.lustre, config.portals)
1822 self.module = module
1823 self.module_name = string.upper(module)
1825 self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
1826 self.target_name, fs_name)
1828 self.name = self_name
1830 self.lookup_server(self.tgt_dev_uuid)
1831 mgmt_uuid = mgmt_uuid_for_fs(fs_name)
1833 self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
1836 self.fs_name = fs_name
1839 self.add_lustre_module(module_dir, module)
1841 def lookup_server(self, srv_uuid):
1842 """ Lookup a server's network information """
1843 self._server_nets = get_ost_net(self.db, srv_uuid)
1844 if len(self._server_nets) == 0:
1845 panic ("Unable to find a server for:", srv_uuid)
1847 def get_servers(self):
1848 return self._server_nets
1850 def prepare(self, ignore_connect_failure = 0):
1851 self.info(self.target_uuid)
1852 if is_prepared(self.name):
1855 srv = choose_local_server(self.get_servers())
1859 routes = find_route(self.get_servers())
1860 if len(routes) == 0:
1861 panic ("no route to", self.target_uuid)
1862 for (srv, r) in routes:
1863 lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
1864 except CommandError, e:
1865 if not ignore_connect_failure:
1868 if self.target_uuid in config.inactive and self.permits_inactive():
1869 debug("%s inactive" % self.target_uuid)
1870 inactive_p = "inactive"
1872 debug("%s active" % self.target_uuid)
1874 lctl.newdev(self.module, self.name, self.uuid,
1875 setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
1876 inactive_p, self.mgmt_name))
1879 if is_prepared(self.name):
1880 Module.cleanup(self)
1882 srv = choose_local_server(self.get_servers())
1884 lctl.disconnect(srv)
1886 for (srv, r) in find_route(self.get_servers()):
1887 lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
1888 except CommandError, e:
1889 log(self.module_name, "cleanup failed: ", self.name)
1893 def correct_level(self, level, op=None):
1898 def __init__(self, db, uuid, fs_name):
1899 Client.__init__(self, db, uuid, 'mdc', fs_name)
1901 def permits_inactive(self):
1905 def __init__(self, db, uuid, fs_name):
1906 Client.__init__(self, db, uuid, 'osc', fs_name)
1908 def permits_inactive(self):
def mgmtcli_name_for_uuid(uuid):
    """Return the canonical management-client device name for *uuid*."""
    return 'MGMTCLI_%s' % (uuid,)
class ManagementClient(Client):
    """Client bound to the management service via the mgmt_cli module.

    Its device name is derived deterministically from the target's UUID
    so it can be looked up again with mgmtcli_name_for_uuid().
    """
    def __init__(self, db, uuid):
        name = mgmtcli_name_for_uuid(db.getUUID())
        Client.__init__(self, db, uuid, 'mgmt_cli', '',
                        self_name = name,
                        module_dir = 'mgmt')
1921 def __init__(self, db):
1922 Module.__init__(self, 'COBD', db)
1923 self.real_uuid = self.db.get_first_ref('realobd')
1924 self.cache_uuid = self.db.get_first_ref('cacheobd')
1925 self.add_lustre_module('cobd' , 'cobd')
1927 # need to check /proc/mounts and /etc/mtab before
1928 # formatting anything.
1929 # FIXME: check if device is already formatted.
1931 if is_prepared(self.name):
1933 self.info(self.real_uuid, self.cache_uuid)
1934 lctl.newdev("cobd", self.name, self.uuid,
1935 setup ="%s %s" %(self.real_uuid, self.cache_uuid))
1937 def correct_level(self, level, op=None):
1940 # virtual interface for OSC and LOV
1942 def __init__(self, db, uuid, fs_name, name_override = None):
1943 Module.__init__(self, 'VOSC', db)
1944 if db.get_class() == 'lov':
1945 self.osc = LOV(db, uuid, fs_name, name_override)
1947 self.osc = get_osc(db, uuid, fs_name)
1949 return self.osc.uuid
1951 return self.osc.name
1956 def load_module(self):
1957 self.osc.load_module()
1958 def cleanup_module(self):
1959 self.osc.cleanup_module()
1960 def correct_level(self, level, op=None):
1961 return self.osc.correct_level(level, op)
1963 # virtual interface for MDC and LMV
1965 def __init__(self, db, uuid, fs_name, name_override = None):
1966 Module.__init__(self, 'VMDC', db)
1967 if db.get_class() == 'lmv':
1968 self.mdc = LMV(db, uuid, fs_name)
1970 self.mdc = MDC(db, uuid, fs_name)
1972 return self.mdc.uuid
1974 return self.mdc.name
1979 def load_module(self):
1980 self.mdc.load_module()
1981 def cleanup_module(self):
1982 self.mdc.cleanup_module()
1983 def correct_level(self, level, op=None):
1984 return self.osc.correct_level(level, op)
1987 class ECHO_CLIENT(Module):
1988 def __init__(self,db):
1989 Module.__init__(self, 'ECHO_CLIENT', db)
1990 self.add_lustre_module('obdecho', 'obdecho')
1991 self.obd_uuid = self.db.get_first_ref('obd')
1992 obd = self.db.lookup(self.obd_uuid)
1993 self.uuid = generate_client_uuid(self.name)
1994 self.osc = VOSC(obd, self.uuid, self.name)
1997 if is_prepared(self.name):
2000 self.osc.prepare() # XXX This is so cheating. -p
2001 self.info(self.obd_uuid)
2003 lctl.newdev("echo_client", self.name, self.uuid,
2004 setup = self.osc.get_name())
2007 if is_prepared(self.name):
2008 Module.cleanup(self)
2011 def load_module(self):
2012 self.osc.load_module()
2013 Module.load_module(self)
2015 def cleanup_module(self):
2016 Module.cleanup_module(self)
2017 self.osc.cleanup_module()
2019 def correct_level(self, level, op=None):
2022 def generate_client_uuid(name):
2023 client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
2025 int(random.random() * 1048576),
2026 int(random.random() * 1048576))
2027 return client_uuid[:36]
2030 class Mountpoint(Module):
2031 def __init__(self,db):
2032 Module.__init__(self, 'MTPT', db)
2033 self.path = self.db.get_val('path')
2034 self.fs_uuid = self.db.get_first_ref('filesystem')
2035 fs = self.db.lookup(self.fs_uuid)
2036 self.mds_uuid = fs.get_first_ref('lmv')
2037 if not self.mds_uuid:
2038 self.mds_uuid = fs.get_first_ref('mds')
2039 self.obd_uuid = fs.get_first_ref('obd')
2040 self.mgmt_uuid = fs.get_first_ref('mgmt')
2041 obd = self.db.lookup(self.obd_uuid)
2042 client_uuid = generate_client_uuid(self.name)
2043 self.vosc = VOSC(obd, client_uuid, self.name)
2044 self.mds = self.db.lookup(self.mds_uuid)
2046 panic("no mds: ", self.mds_uuid)
2047 self.add_lustre_module('mdc', 'mdc')
2048 self.add_lustre_module('lmv', 'lmv')
2049 self.vmdc = VMDC(self.mds, client_uuid, self.name, self.mds_uuid)
2050 self.mdc = self.vmdc.mdc
2051 self.add_lustre_module('llite', 'llite')
2053 self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
2059 if fs_is_mounted(self.path):
2060 log(self.path, "already mounted.")
2064 self.mgmtcli.prepare()
2067 mdc_name = self.mdc.name
2069 self.info(self.path, self.mds_uuid, self.obd_uuid)
2070 if config.record or config.lctl_dump:
2071 lctl.mount_option(local_node_name, self.vosc.get_name(), mdc_name)
2073 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s %s %s" % \
2074 (self.vosc.get_name(), mdc_name, config.config, self.path)
2075 run("mkdir", self.path)
2080 panic("mount failed:", self.path, ":", string.join(val))
2083 self.info(self.path, self.mds_uuid,self.obd_uuid)
2085 if config.record or config.lctl_dump:
2086 lctl.del_mount_option(local_node_name)
2088 if fs_is_mounted(self.path):
2090 (rc, out) = run("umount", "-f", self.path)
2092 (rc, out) = run("umount", self.path)
2094 raise CommandError('umount', out, rc)
2096 if fs_is_mounted(self.path):
2097 panic("fs is still mounted:", self.path)
2102 self.mgmtcli.cleanup()
2104 def load_module(self):
2106 self.mgmtcli.load_module()
2107 self.vosc.load_module()
2108 Module.load_module(self)
2110 def cleanup_module(self):
2111 Module.cleanup_module(self)
2112 self.vosc.cleanup_module()
2114 self.mgmtcli.cleanup_module()
2116 def correct_level(self, level, op=None):
2119 # ============================================================
2120 # misc query functions
2122 def get_ost_net(self, osd_uuid):
2126 osd = self.lookup(osd_uuid)
2127 node_uuid = osd.get_first_ref('node')
2128 node = self.lookup(node_uuid)
2130 panic("unable to find node for osd_uuid:", osd_uuid,
2131 " node_ref:", node_uuid_)
2132 for net_uuid in node.get_networks():
2133 db = node.lookup(net_uuid)
2134 srv_list.append(Network(db))
2138 # the order of iniitailization is based on level.
2139 def getServiceLevel(self):
2140 type = self.get_class()
2142 if type in ('network',):
2144 elif type in ('routetbl',):
2146 elif type in ('ldlm',):
2148 elif type in ('mgmt',):
2150 elif type in ('osd', 'cobd'):
2152 elif type in ('mdsdev',):
2154 elif type in ('lmv',):
2156 elif type in ('mountpoint', 'echoclient'):
2159 panic("Unknown type: ", type)
2161 if ret < config.minlevel or ret > config.maxlevel:
2166 # return list of services in a profile. list is a list of tuples
2167 # [(level, db_object),]
2168 def getServices(self):
2170 for ref_class, ref_uuid in self.get_all_refs():
2171 servdb = self.lookup(ref_uuid)
2173 level = getServiceLevel(servdb)
2175 list.append((level, servdb))
2177 panic('service not found: ' + ref_uuid)
2183 ############################################################
2185 # FIXME: clean this mess up!
2187 # OSC is no longer in the xml, so we have to fake it.
2188 # this is getting ugly and begging for another refactoring
2189 def get_osc(ost_db, uuid, fs_name):
2190 osc = OSC(ost_db, uuid, fs_name)
2193 def get_mdc(db, uuid, fs_name, mds_uuid):
2194 mds_db = db.lookup(mds_uuid);
2196 error("no mds:", mds_uuid)
2197 mdc = MDC(mds_db, mds_uuid, fs_name)
2200 ############################################################
2201 # routing ("rooting")
2202 # list of (nettype, cluster_id, nid)
2205 def find_local_clusters(node_db):
2206 global local_clusters
2207 for netuuid in node_db.get_networks():
2208 net = node_db.lookup(netuuid)
2210 debug("add_local", netuuid)
2211 local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
2213 if acceptors.has_key(srv.port):
2214 panic("duplicate port:", srv.port)
2215 acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
2216 srv.send_mem, srv.recv_mem,
2219 # This node is a gateway.
2221 def node_is_router():
2224 # If there are any routers found in the config, then this will be true
2225 # and all nodes will load kptlrouter.
def node_needs_router():
    # True when some node in the config routes (needs_router) or this
    # node itself is a router (is_router).
    if needs_router:
        return needs_router
    return is_router
2230 # list of (nettype, gw, tgt_cluster_id, lo, hi)
2231 # Currently, these local routes are only added to kptlrouter route
2232 # table if they are needed to connect to a specific server. This
2233 # should be changed so all available routes are loaded, and the
2234 # ptlrouter can make all the decisions.
2237 def find_local_routes(lustre):
2238 """ Scan the lustre config looking for routers . Build list of
2240 global local_routes, needs_router
2242 list = lustre.lookup_class('node')
2244 if router.get_val_int('router', 0):
2246 for (local_type, local_cluster_id, local_nid) in local_clusters:
2248 for netuuid in router.get_networks():
2249 db = router.lookup(netuuid)
2250 if (local_type == db.get_val('nettype') and
2251 local_cluster_id == db.get_val('clusterid')):
2252 gw = db.get_val('nid')
2255 debug("find_local_routes: gw is", gw)
2256 for route in router.get_local_routes(local_type, gw):
2257 local_routes.append(route)
2258 debug("find_local_routes:", local_routes)
2261 def choose_local_server(srv_list):
2262 for srv in srv_list:
2263 if local_cluster(srv.net_type, srv.cluster_id):
2266 def local_cluster(net_type, cluster_id):
2267 for cluster in local_clusters:
2268 if net_type == cluster[0] and cluster_id == cluster[1]:
2272 def local_interface(net_type, cluster_id, nid):
2273 for cluster in local_clusters:
2274 if (net_type == cluster[0] and cluster_id == cluster[1]
2275 and nid == cluster[2]):
2279 def find_route(srv_list):
2281 frm_type = local_clusters[0][0]
2282 for srv in srv_list:
2283 debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
2284 to_type = srv.net_type
2286 cluster_id = srv.cluster_id
2287 debug ('looking for route to', to_type, to)
2288 for r in local_routes:
2289 debug("find_route: ", r)
2290 if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
2291 result.append((srv, r))
2294 def get_active_target(db):
2295 target_uuid = db.getUUID()
2296 target_name = db.getName()
2297 node_name = get_select(target_name)
2299 tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
2301 tgt_dev_uuid = db.get_first_ref('active')
2304 def get_server_by_nid_uuid(db, nid_uuid):
2305 for n in db.lookup_class("network"):
2307 if net.nid_uuid == nid_uuid:
2311 ############################################################
2315 type = db.get_class()
2316 debug('Service:', type, db.getName(), db.getUUID())
2321 n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
2322 elif type == 'network':
2324 elif type == 'routetbl':
2328 elif type == 'cobd':
2330 elif type == 'mdsdev':
2332 elif type == 'mountpoint':
2334 elif type == 'echoclient':
2336 elif type == 'mgmt':
2341 panic ("unknown service type:", type)
2345 # Prepare the system to run lustre using a particular profile
2346 # in a the configuration.
2347 # * load & the modules
2348 # * setup networking for the current node
2349 # * make sure partitions are in place and prepared
2350 # * initialize devices with lctl
2351 # Levels is important, and needs to be enforced.
2352 def for_each_profile(db, prof_list, operation):
2353 for prof_uuid in prof_list:
2354 prof_db = db.lookup(prof_uuid)
2356 panic("profile:", profile, "not found.")
2357 services = getServices(prof_db)
2360 def doWriteconf(services):
2364 if s[1].get_class() == 'mdsdev':
2365 n = newService(s[1])
2368 def doSetup(services):
2373 n = newService(s[1])
2375 slist.append((n.level, n))
2378 nl = n[1].correct_level(n[0])
2379 nlist.append((nl, n[1]))
2384 def doModules(services):
2388 n = newService(s[1])
2391 def doCleanup(services):
2396 n = newService(s[1])
2398 slist.append((n.level, n))
2401 nl = n[1].correct_level(n[0])
2402 nlist.append((nl, n[1]))
2406 if n[1].safe_to_clean():
2409 def doUnloadModules(services):
2414 n = newService(s[1])
2415 if n.safe_to_clean_modules():
2420 def doHost(lustreDB, hosts):
2421 global is_router, local_node_name
2424 node_db = lustreDB.lookup_name(h, 'node')
2428 panic('No host entry found.')
2430 local_node_name = node_db.get_val('name', 0)
2431 is_router = node_db.get_val_int('router', 0)
2432 lustre_upcall = node_db.get_val('lustreUpcall', '')
2433 portals_upcall = node_db.get_val('portalsUpcall', '')
2434 timeout = node_db.get_val_int('timeout', 0)
2435 ptldebug = node_db.get_val('ptldebug', '')
2436 subsystem = node_db.get_val('subsystem', '')
2438 find_local_clusters(node_db)
2440 find_local_routes(lustreDB)
2442 # Two step process: (1) load modules, (2) setup lustre
2443 # if not cleaning, load modules first.
2444 prof_list = node_db.get_refs('profile')
2446 if config.write_conf:
2447 for_each_profile(node_db, prof_list, doModules)
2449 for_each_profile(node_db, prof_list, doWriteconf)
2450 for_each_profile(node_db, prof_list, doUnloadModules)
2452 elif config.recover:
2453 if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
2454 raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
2455 "--client_uuid <UUID> --conn_uuid <UUID>")
2456 doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
2458 elif config.cleanup:
2460 # the command line can override this value
2462 # ugly hack, only need to run lctl commands for --dump
2463 if config.lctl_dump or config.record:
2464 for_each_profile(node_db, prof_list, doCleanup)
2467 sys_set_timeout(timeout)
2468 sys_set_ptldebug(ptldebug)
2469 sys_set_subsystem(subsystem)
2470 sys_set_lustre_upcall(lustre_upcall)
2471 sys_set_portals_upcall(portals_upcall)
2473 for_each_profile(node_db, prof_list, doCleanup)
2474 for_each_profile(node_db, prof_list, doUnloadModules)
2477 # ugly hack, only need to run lctl commands for --dump
2478 if config.lctl_dump or config.record:
2479 sys_set_timeout(timeout)
2480 sys_set_lustre_upcall(lustre_upcall)
2481 for_each_profile(node_db, prof_list, doSetup)
2485 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
2486 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
2488 for_each_profile(node_db, prof_list, doModules)
2490 sys_set_debug_path()
2491 sys_set_ptldebug(ptldebug)
2492 sys_set_subsystem(subsystem)
2493 script = config.gdb_script
2494 run(lctl.lctl, ' modules >', script)
2496 log ("The GDB module script is in", script)
2497 # pause, so user has time to break and
2500 sys_set_timeout(timeout)
2501 sys_set_lustre_upcall(lustre_upcall)
2502 sys_set_portals_upcall(portals_upcall)
2504 for_each_profile(node_db, prof_list, doSetup)
2506 def doRecovery(db, lctl, tgt_uuid, client_uuid, nid_uuid):
2507 tgt = db.lookup(tgt_uuid)
2509 raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
2510 new_uuid = get_active_target(tgt)
2512 raise Lustre.LconfError("doRecovery: no active target found for: " +
2514 net = choose_local_server(get_ost_net(db, new_uuid))
2516 raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
2518 log("Reconnecting", tgt_uuid, " to ", net.nid_uuid);
2520 oldnet = get_server_by_nid_uuid(db, nid_uuid)
2522 lctl.disconnect(oldnet)
2523 except CommandError, e:
2524 log("recover: disconnect", nid_uuid, "failed: ")
2529 except CommandError, e:
2530 log("recover: connect failed")
2533 lctl.recover(client_uuid, net.nid_uuid)
2536 def setupModulePath(cmd, portals_dir = PORTALS_DIR):
2537 base = os.path.dirname(cmd)
2538 if development_mode():
2539 if not config.lustre:
2540 config.lustre = (os.path.join(base, ".."))
2541 # normalize the portals dir, using command line arg if set
2543 portals_dir = config.portals
2544 dir = os.path.join(config.lustre, portals_dir)
2545 config.portals = dir
2546 debug('config.portals', config.portals)
2547 elif config.lustre and config.portals:
2549 # if --lustre and --portals, normalize portals
2550 # can ignore POTRALS_DIR here, since it is probly useless here
2551 config.portals = os.path.join(config.lustre, config.portals)
2552 debug('config.portals B', config.portals)
2554 def sysctl(path, val):
2555 debug("+ sysctl", path, val)
2559 fp = open(os.path.join('/proc/sys', path), 'w')
def sys_set_debug_path():
    """Point the portals debug-log dump at the configured path."""
    target = config.debug_path
    sysctl('portals/debug_path', target)
2569 def sys_set_lustre_upcall(upcall):
2570 # the command overrides the value in the node config
2571 if config.lustre_upcall:
2572 upcall = config.lustre_upcall
2574 upcall = config.upcall
2576 lctl.set_lustre_upcall(upcall)
2578 def sys_set_portals_upcall(upcall):
2579 # the command overrides the value in the node config
2580 if config.portals_upcall:
2581 upcall = config.portals_upcall
2583 upcall = config.upcall
2585 sysctl('portals/upcall', upcall)
def sys_set_timeout(timeout):
    """Apply the recovery timeout, letting the command line win.

    A positive --timeout overrides the node-config value; a
    non-positive or missing result leaves the timeout untouched.
    """
    override = config.timeout
    if override and override > 0:
        timeout = override
    if timeout is not None and timeout > 0:
        lctl.set_timeout(timeout)
def sys_tweak_socknal():
    # --single_socket: turn off typed sockets so the socknal uses a
    # single socket instead of a bundle.
    if not config.single_socket:
        return
    sysctl("socknal/typed", 0)
2598 def sys_optimize_elan ():
2599 procfiles = ["/proc/elan/config/eventint_punt_loops",
2600 "/proc/qsnet/elan3/config/eventint_punt_loops",
2601 "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]
2603 if os.access(p, os.R_OK):
2604 run ("echo 0 > " + p)
2606 def sys_set_ptldebug(ptldebug):
2608 ptldebug = config.ptldebug
2611 val = eval(ptldebug, ptldebug_names)
2612 val = "0x%x" % (val)
2613 sysctl('portals/debug', val)
2614 except NameError, e:
2617 def sys_set_subsystem(subsystem):
2618 if config.subsystem:
2619 subsystem = config.subsystem
2622 val = eval(subsystem, subsystem_names)
2623 val = "0x%x" % (val)
2624 sysctl('portals/subsystem_debug', val)
2625 except NameError, e:
2628 def sys_set_netmem_max(path, max):
2629 debug("setting", path, "to at least", max)
2637 fp = open(path, 'w')
2638 fp.write('%d\n' %(max))
def sys_make_devices():
    """Create the portals and obd character devices when missing."""
    for dev, minor in (('/dev/portals', 240), ('/dev/obd', 241)):
        if not os.access(dev, os.R_OK):
            run('mknod %s c 10 %d' % (dev, minor))
2649 # Add dir to the global PATH, if not already there.
2650 def add_to_path(new_dir):
2651 syspath = string.split(os.environ['PATH'], ':')
2652 if new_dir in syspath:
2654 os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
2656 def default_debug_path():
2657 path = '/tmp/lustre-log'
2658 if os.path.isdir('/r'):
2663 def default_gdb_script():
2664 script = '/tmp/ogdb'
2665 if os.path.isdir('/r'):
2666 return '/r' + script
2671 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
2672 # ensure basic elements are in the system path
2673 def sanitise_path():
2674 for dir in DEFAULT_PATH:
2677 # global hack for the --select handling
2679 def init_select(args):
2680 # args = [service=nodeA,service2=nodeB service3=nodeC]
2683 list = string.split(arg, ',')
2685 srv, node = string.split(entry, '=')
2686 tgt_select[srv] = node
2688 def get_select(srv):
2689 if tgt_select.has_key(srv):
2690 return tgt_select[srv]
2694 FLAG = Lustre.Options.FLAG
2695 PARAM = Lustre.Options.PARAM
2696 INTPARAM = Lustre.Options.INTPARAM
2697 PARAMLIST = Lustre.Options.PARAMLIST
2699 ('verbose,v', "Print system commands as they are run"),
2700 ('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM),
2701 ('config', "Cluster config name used for LDAP query", PARAM),
2702 ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
2703 ('node', "Load config for <nodename>", PARAM),
2704 ('cleanup,d', "Cleans up config. (Shutdown)"),
2705 ('force,f', "Forced unmounting and/or obd detach during cleanup",
2707 ('single_socket', "socknal option: only use one socket instead of bundle",
2709 ('failover',"""Used to shut down without saving state.
2710 This will allow this node to "give up" a service to a
2711 another node for failover purposes. This will not
2712 be a clean shutdown.""",
2714 ('gdb', """Prints message after creating gdb module script
2715 and sleeps for 5 seconds."""),
2716 ('noexec,n', """Prints the commands and steps that will be run for a
2717 config without executing them. This can used to check if a
2718 config file is doing what it should be doing"""),
2719 ('nomod', "Skip load/unload module step."),
2720 ('nosetup', "Skip device setup/cleanup step."),
2721 ('reformat', "Reformat all devices (without question)"),
2722 ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
2723 ('dump', "Dump the kernel debug log to file before portals is unloaded",
2725 ('write_conf', "Save all the client config information on mds."),
2726 ('record', "Write config information on mds."),
2727 ('record_log', "Name of config record log.", PARAM),
2728 ('record_device', "MDS device name that will record the config commands",
2730 ('minlevel', "Minimum level of services to configure/cleanup",
2732 ('maxlevel', """Maximum level of services to configure/cleanup
2733 Levels are aproximatly like:
2738 70 - mountpoint, echo_client, osc, mdc, lov""",
2740 ('lustre', """Base directory of lustre sources. This parameter will
2741 cause lconf to load modules from a source tree.""", PARAM),
2742 ('portals', """Portals source directory. If this is a relative path,
2743 then it is assumed to be relative to lustre. """, PARAM),
2744 ('timeout', "Set recovery timeout", INTPARAM),
2745 ('upcall', "Set both portals and lustre upcall script", PARAM),
2746 ('lustre_upcall', "Set lustre upcall script", PARAM),
2747 ('portals_upcall', "Set portals upcall script", PARAM),
2748 ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
2749 ('ptldebug', "Set the portals debug level", PARAM),
2750 ('subsystem', "Set the portals debug subsystem", PARAM),
2751 ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
2752 ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
2753 # Client recovery options
2754 ('recover', "Recover a device"),
2755 ('group', "The group of devices to configure or cleanup", PARAM),
2756 ('tgt_uuid', "The failed target (required for recovery)", PARAM),
2757 ('client_uuid', "The failed client (required for recovery)", PARAM),
2758 ('conn_uuid', "The failed connection (required for recovery)", PARAM),
2760 ('inactive', """The name of an inactive service, to be ignored during
2761 mounting (currently OST-only). Can be repeated.""",
2766 global lctl, config, toplevel, CONFIG_FILE
2768 # in the upcall this is set to SIG_IGN
2769 signal.signal(signal.SIGCHLD, signal.SIG_DFL)
2771 cl = Lustre.Options("lconf", "config.xml", lconf_options)
2773 config, args = cl.parse(sys.argv[1:])
2774 except Lustre.OptionError, e:
2778 setupModulePath(sys.argv[0])
2780 host = socket.gethostname()
2782 # the PRNG is normally seeded with time(), which is not so good for starting
2783 # time-synchronized clusters
2784 input = open('/dev/urandom', 'r')
2786 print 'Unable to open /dev/urandom!'
2788 seed = input.read(32)
2794 init_select(config.select)
2797 if not os.access(args[0], os.R_OK):
2798 print 'File not found or readable:', args[0]
2801 dom = xml.dom.minidom.parse(args[0])
2803 panic("%s does not appear to be a config file." % (args[0]))
2804 sys.exit(1) # make sure to die here, even in debug mode.
2805 CONFIG_FILE = args[0]
2806 db = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
2807 if not config.config:
2808 config.config = os.path.basename(args[0])# use full path?
2809 if config.config[-4:] == '.xml':
2810 config.config = config.config[:-4]
2811 elif config.ldapurl:
2812 if not config.config:
2813 panic("--ldapurl requires --config name")
2814 dn = "config=%s,fs=lustre" % (config.config)
2815 db = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
2816 elif config.ptldebug or config.subsystem:
2817 sys_set_ptldebug(None)
2818 sys_set_subsystem(None)
2821 print 'Missing config file or ldap URL.'
2822 print 'see lconf --help for command summary'
2827 ver = db.get_version()
2829 panic("No version found in config data, please recreate.")
2830 if ver != Lustre.CONFIG_VERSION:
2831 panic("Config version", ver, "does not match lconf version",
2832 Lustre.CONFIG_VERSION)
2836 node_list.append(config.node)
2839 node_list.append(host)
2840 node_list.append('localhost')
2842 debug("configuring for host: ", node_list)
2845 config.debug_path = config.debug_path + '-' + host
2846 config.gdb_script = config.gdb_script + '-' + host
2848 lctl = LCTLInterface('lctl')
2850 if config.lctl_dump:
2851 lctl.use_save_file(config.lctl_dump)
2854 if not (config.record_device and config.record_log):
2855 panic("When recording, both --record_log and --record_device must be specified.")
2856 lctl.clear_log(config.record_device, config.record_log)
2857 lctl.record(config.record_device, config.record_log)
2859 doHost(db, node_list)
2864 if __name__ == "__main__":
2867 except Lustre.LconfError, e:
2869 # traceback.print_exc(file=sys.stdout)
2871 except CommandError, e:
2875 if first_cleanup_error:
2876 sys.exit(first_cleanup_error)