# Copyright (C) 2002-2003 Cluster File Systems, Inc.
# Authors: Robert Read <rread@clusterfs.com>
#          Mike Shaver <shaver@clusterfs.com>
#
# This file is part of Lustre, http://www.lustre.org.
#
# Lustre is free software; you can redistribute it and/or
# modify it under the terms of version 2 of the GNU General Public
# License as published by the Free Software Foundation.
#
# Lustre is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Lustre; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# lconf - lustre configuration tool
#
# lconf is the main driver script for starting and stopping
# lustre filesystem services.
#
# Based in part on the XML obdctl modifications done by Brian Behlendorf
import sys, getopt, types
import string, os, stat, popen2, socket, time, random, fcntl, select
import re, exceptions, signal, traceback
import xml.dom.minidom

if sys.version[0] == '1':
    from FCNTL import F_GETFL, F_SETFL
else:
    from fcntl import F_GETFL, F_SETFL

PYMOD_DIR = "/usr/lib/lustre/python"

def development_mode():
    base = os.path.dirname(sys.argv[0])
    if os.access(base+"/Makefile", os.R_OK):
        return 1
    return 0

if development_mode():
    sys.path.append('../utils')
else:
    sys.path.append(PYMOD_DIR)

import Lustre
MAXTCPBUF = 16777216
DEFAULT_TCPBUF = 8388608

# Maximum number of devices to search for.
# (the /dev/loop* nodes need to be created beforehand)
MAX_LOOP_DEVICES = 256
PORTALS_DIR = 'portals'

# Needed to call lconf --record
CONFIG_FILE = ""

# Please keep these in sync with the values in portals/kp30.h
ptldebug_names = {
    "warning"   : (1 << 10),
    "portals"   : (1 << 14),
    "dlmtrace"  : (1 << 16),
    "rpctrace"  : (1 << 20),
    "vfstrace"  : (1 << 21),
    }

subsystem_names = {
    "undefined" : (1 << 0),
    "portals"   : (1 << 10),
    "socknal"   : (1 << 11),
    "qswnal"    : (1 << 12),
    "pinger"    : (1 << 13),
    "filter"    : (1 << 14),
    "ptlrouter" : (1 << 20),
    }
first_cleanup_error = 0
def cleanup_error(rc):
    global first_cleanup_error
    if not first_cleanup_error:
        first_cleanup_error = rc
# ============================================================
# debugging and error funcs

def fixme(msg = "this feature"):
    raise Lustre.LconfError, msg + ' not implemented yet.'

def panic(*args):
    msg = string.join(map(str,args))
    if not config.noexec:
        raise Lustre.LconfError(msg)
    else:
        print "! " + msg

def log(*args):
    msg = string.join(map(str,args))
    print msg

def logall(msgs):
    for s in msgs:
        print string.strip(s)

def debug(*args):
    if config.verbose:
        msg = string.join(map(str,args))
        print msg
# ack, python's builtin int() does not support '0x123' syntax.
# eval can do it, although what a hack!
def my_int(s):
    try:
        if s[0:2] == '0x':
            return eval(s, {}, {})
        else:
            return int(s)
    except SyntaxError, e:
        raise ValueError("not a number")
    except NameError, e:
        raise ValueError("not a number")
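# Illustrative behaviour (not part of the original tool):
#   my_int('123')  -> 123
#   my_int('0x7b') -> 123  (via the eval() hack above)
#   my_int('0xzz') -> raises ValueError("not a number")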
# ============================================================
# locally defined exceptions
class CommandError (exceptions.Exception):
    def __init__(self, cmd_name, cmd_err, rc=None):
        self.cmd_name = cmd_name
        self.cmd_err = cmd_err
        self.rc = rc

    def dump(self):
        if type(self.cmd_err) == types.StringType:
            if self.rc:
                print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
            else:
                print "! %s: %s" % (self.cmd_name, self.cmd_err)
        elif type(self.cmd_err) == types.ListType:
            if self.rc:
                print "! %s (error %d):" % (self.cmd_name, self.rc)
            else:
                print "! %s:" % (self.cmd_name)
            for s in self.cmd_err:
                print "> %s" %(string.strip(s))
        else:
            print self.cmd_err
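# Illustrative usage (assumed, not from the original file): callers wrap
# failed shell/lctl invocations in CommandError and print them at top
# level, e.g.
#   try:
#       raise CommandError('lctl', ['line one', 'line two'], 1)
#   except CommandError, e:
#       e.dump()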
# ============================================================
# handle daemons, like the acceptor
class DaemonHandler:
    """ Manage starting and stopping a daemon. Assumes the daemon manages
    its own pid file. """
    def __init__(self, cmd):
        self.command = cmd
        self.path = ""

    def start(self):
        if self.running():
            log(self.command, "already running.")
        self.path = find_prog(self.command)
        if not self.path:
            panic(self.command, "not found.")
        ret, out = runcmd(self.path +' '+ self.command_line())
        if ret:
            raise CommandError(self.path, out, ret)

    def stop(self):
        if self.running():
            pid = self.read_pidfile()
            try:
                log("killing process", pid)
                os.kill(pid, 15)
                #time.sleep(1) # let daemon die
            except OSError, e:
                log("unable to kill", self.command, e)
            if self.running():
                log("unable to kill", self.command)

    def running(self):
        pid = self.read_pidfile()
        if pid:
            try:
                os.kill(pid, 0)
            except OSError:
                self.clean_pidfile()
            else:
                return 1
        return 0

    def read_pidfile(self):
        try:
            fp = open(self.pidfile(), 'r')
            pid = int(fp.read())
            fp.close()
            return pid
        except IOError:
            return 0

    def clean_pidfile(self):
        """ Remove a stale pidfile """
        log("removing stale pidfile:", self.pidfile())
        try:
            os.unlink(self.pidfile())
        except OSError, e:
            log(self.pidfile(), e)
class AcceptorHandler(DaemonHandler):
    def __init__(self, port, net_type, send_mem, recv_mem, irq_aff):
        DaemonHandler.__init__(self, "acceptor")
        self.port = port
        self.flags = ''
        self.send_mem = send_mem
        self.recv_mem = recv_mem

        if irq_aff:
            self.flags = self.flags + ' -i'

    def pidfile(self):
        return "/var/run/%s-%d.pid" % (self.command, self.port)

    def command_line(self):
        return string.join(map(str,('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port)))
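# Illustrative sketch (assumed values, not from the original file): an
# acceptor on tcp port 988 with irq_aff set uses the pidfile
# "/var/run/acceptor-988.pid" and a command line of roughly
# "-s 8388608 -r 8388608  -i 988" (DEFAULT_TCPBUF buffers, -i for affinity).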
acceptors = {}

# start the acceptors
def run_acceptors():
    if config.lctl_dump or config.record:
        return
    for port in acceptors.keys():
        daemon = acceptors[port]
        if not daemon.running():
            daemon.start()

def run_one_acceptor(port):
    if config.lctl_dump or config.record:
        return
    if acceptors.has_key(port):
        daemon = acceptors[port]
        if not daemon.running():
            daemon.start()
    else:
        panic("run_one_acceptor: No acceptor defined for port:", port)

def stop_acceptor(port):
    if acceptors.has_key(port):
        daemon = acceptors[port]
        daemon.stop()

# ============================================================
# handle lctl interface
class LCTLInterface:
    """
    Manage communication with lctl
    """

    def __init__(self, cmd):
        """
        Initialize by locating the lctl binary.
        """
        self.lctl = find_prog(cmd)
        self.save_file = ''
        self.record_device = ''
        if not self.lctl:
            if config.noexec:
                debug('! lctl not found')
                self.lctl = 'lctl'
            else:
                raise CommandError('lctl', "unable to find lctl binary.")

    def use_save_file(self, file):
        self.save_file = file

    def record(self, dev_name, logname):
        log("Recording log", logname, "on", dev_name)
        self.record_device = dev_name
        self.record_log = logname

    def end_record(self):
        log("End recording log", self.record_log, "on", self.record_device)
        self.record_device = None
        self.record_log = None

    def set_nonblock(self, fd):
        fl = fcntl.fcntl(fd, F_GETFL)
        fcntl.fcntl(fd, F_SETFL, fl | os.O_NDELAY)
    def run(self, cmds):
        """
        run lctl
        the cmds are written to stdin of lctl
        lctl doesn't return errors when run in script mode, so
        stderr is checked
        should modify command line to accept multiple commands, or
        create complex command line options
        """
        cmd_line = self.lctl
        if self.save_file:
            cmds = '\n  dump ' + self.save_file + '\n' + cmds
        elif self.record_device:
            cmds = """
    device $%s
    record %s
    %s""" % (self.record_device, self.record_log, cmds)

        debug("+", cmd_line, cmds)
        if config.noexec: return (0, [])

        child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
        child.tochild.write(cmds + "\n")
        child.tochild.close()

        # From "Python Cookbook" from O'Reilly
        outfile = child.fromchild
        outfd = outfile.fileno()
        self.set_nonblock(outfd)
        errfile = child.childerr
        errfd = errfile.fileno()
        self.set_nonblock(errfd)

        outdata = errdata = ''
        outeof = erreof = 0
        while 1:
            ready = select.select([outfd,errfd],[],[]) # Wait for input
            if outfd in ready[0]:
                outchunk = outfile.read()
                if outchunk == '': outeof = 1
                outdata = outdata + outchunk
            if errfd in ready[0]:
                errchunk = errfile.read()
                if errchunk == '': erreof = 1
                errdata = errdata + errchunk
            if outeof and erreof: break
        # end of "borrowed" code

        ret = child.wait()
        if os.WIFEXITED(ret):
            rc = os.WEXITSTATUS(ret)
        else:
            rc = 0
        if rc or len(errdata):
            raise CommandError(self.lctl, errdata, rc)
        return rc, outdata
    def runcmd(self, *args):
        """
        run lctl using the command line
        """
        cmd = string.join(map(str,args))
        debug("+", self.lctl, cmd)
        rc, out = run(self.lctl, cmd)
        if rc:
            raise CommandError(self.lctl, out, rc)
        return rc, out

    def clear_log(self, dev, log):
        """ clear an existing log """
        cmds = """
  device $%s
  probe
  clear_log %s
  quit """ % (dev, log)
        self.run(cmds)
    def network(self, net, nid):
        """ set mynid """
        cmds = """
  network %s
  mynid %s
  quit """ % (net, nid)
        self.run(cmds)
    # create a new connection
    def add_uuid(self, net_type, uuid, nid):
        cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type)
        self.run(cmds)

    def add_autoconn(self, net_type, send_mem, recv_mem, nid, hostaddr,
                     port, flags):
        if net_type in ('tcp',) and not config.lctl_dump:
            cmds = """
  network %s
  send_mem %d
  recv_mem %d
  add_autoconn %s %s %d %s
  quit""" % (net_type, send_mem, recv_mem,
             nid, hostaddr, port, flags )
            self.run(cmds)

    def connect(self, srv):
        self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
        if srv.net_type in ('tcp',) and not config.lctl_dump:
            flags = 's'
            if srv.irq_affinity:
                flags = flags + 'i'
            self.add_autoconn(srv.net_type, srv.send_mem, srv.recv_mem,
                              srv.nid, srv.hostaddr, srv.port, flags)
    # recover a device
    def recover(self, dev_name, new_conn):
        cmds = """
    device $%s
    recover %s""" %(dev_name, new_conn)
        self.run(cmds)

    # add a route to a range
    def add_route(self, net, gw, lo, hi):
        cmds = """
  network %s
  add_route %s %s %s
  quit """ % (net, gw, lo, hi)
        try:
            self.run(cmds)
        except CommandError, e:
            log("ignore: ")
            e.dump()

    def del_route(self, net, gw, lo, hi):
        cmds = """
  ignore_errors
  network %s
  del_route %s %s %s
  quit """ % (net, gw, lo, hi)
        self.run(cmds)

    # add a route to a host
    def add_route_host(self, net, uuid, gw, tgt):
        self.add_uuid(net, uuid, tgt)
        cmds = """
  network %s
  add_route %s %s
  quit """ % (net, gw, tgt)
        try:
            self.run(cmds)
        except CommandError, e:
            log("ignore: ")
            e.dump()

    # delete a route to a host
    def del_route_host(self, net, uuid, gw, tgt):
        self.del_uuid(uuid)
        cmds = """
  ignore_errors
  network %s
  del_route %s %s
  quit """ % (net, gw, tgt)
        self.run(cmds)
    def del_autoconn(self, net_type, nid, hostaddr):
        if net_type in ('tcp',) and not config.lctl_dump:
            cmds = """
  ignore_errors
  network %s
  del_autoconn %s %s
  quit""" % (net_type, nid, hostaddr)
            self.run(cmds)

    # disconnect one connection
    def disconnect(self, srv):
        self.del_uuid(srv.nid_uuid)
        if srv.net_type in ('tcp',) and not config.lctl_dump:
            self.del_autoconn(srv.net_type, srv.nid, srv.hostaddr)
    def del_uuid(self, uuid):
        cmds = """
  ignore_errors
  del_uuid %s
  quit""" % (uuid,)
        self.run(cmds)

    def disconnectAll(self, net):
        cmds = """
  ignore_errors
  network %s
  disconnect_all
  quit""" % (net,)
        self.run(cmds)
    def attach(self, type, name, uuid):
        cmds = """
  attach %s %s %s
  quit""" % (type, name, uuid)
        self.run(cmds)

    def setup(self, name, setup = ""):
        cmds = """
  cfg_device %s
  setup %s
  quit""" % (name, setup)
        self.run(cmds)

    # create a new device with lctl
    def newdev(self, type, name, uuid, setup = ""):
        self.attach(type, name, uuid);
        try:
            self.setup(name, setup)
        except CommandError, e:
            self.cleanup(name, uuid, 0)
            raise e

    # cleanup a device
    def cleanup(self, name, uuid, force, failover = 0):
        if failover: force = 1
        cmds = """
  ignore_errors
  cfg_device $%s
  cleanup %s %s
  detach
  quit""" % (name, ('', 'force')[force],
             ('', 'failover')[failover])
        self.run(cmds)
    # create an lov
    def lov_setup(self, name, uuid, desc_uuid, mdsuuid, stripe_cnt,
                  stripe_sz, stripe_off, pattern, devlist):
        cmds = """
  attach lov %s %s
  lov_setup %s %d %d %d %s %s
  quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off,
             pattern, devlist)
        self.run(cmds)

    # add an OBD to a LOV
    def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off,
                      pattern, devlist):
        cmds = """
  cfg_device $%s
  lov_setconfig %s %d %d %d %s %s
  quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
        self.run(cmds)
    # dump the log file
    def dump(self, dump_file):
        cmds = """
  debug_kernel %s 1
  quit""" % (dump_file)
        self.run(cmds)

    # get list of devices
    def device_list(self):
        devices = '/proc/fs/lustre/devices'
        ret = []
        if os.access(devices, os.R_OK):
            try:
                fp = open(devices, 'r')
                ret = fp.readlines()
                fp.close()
            except IOError, e:
                log(e)
        return ret
    # get lustre version
    def lustre_version(self):
        rc, out = self.runcmd('version')
        return out

    # dump mount options
    def mount_option(self, profile, osc, mdc):
        cmds = """
  mount_option %s %s %s
  quit""" % (profile, osc, mdc)
        self.run(cmds)

    # delete mount options
    def del_mount_option(self, profile):
        cmds = """
  del_mount_option %s
  quit""" % (profile,)
        self.run(cmds)

    def set_timeout(self, timeout):
        cmds = """
  set_timeout %s
  quit""" % (timeout,)
        self.run(cmds)

    # set lustre upcall
    def set_lustre_upcall(self, upcall):
        cmds = """
  set_lustre_upcall %s
  quit""" % (upcall,)
        self.run(cmds)
# ============================================================
# Various system-level functions
# (ideally moved to their own module)

# Run a command and return the output and status.
# stderr is sent to /dev/null, could use popen3 to
# save it if necessary
def runcmd(cmd):
    debug("+", cmd)
    if config.noexec: return (0, [])
    f = os.popen(cmd + ' 2>&1')
    out = f.readlines()
    ret = f.close()
    if ret:
        ret = ret >> 8
    else:
        ret = 0
    return (ret, out)

def run(*args):
    cmd = string.join(map(str,args))
    return runcmd(cmd)

# Run a command in the background.
def run_daemon(*args):
    cmd = string.join(map(str,args))
    debug("+", cmd)
    if config.noexec: return 0
    f = os.popen(cmd + ' 2>&1')
    ret = f.close()
    if ret:
        ret = ret >> 8
    else:
        ret = 0
    return ret
# Determine full path to use for an external command
# searches dirname(argv[0]) first, then PATH
def find_prog(cmd):
    syspath = string.split(os.environ['PATH'], ':')
    cmdpath = os.path.dirname(sys.argv[0])
    syspath.insert(0, cmdpath);
    if config.portals:
        syspath.insert(0, os.path.join(config.portals, 'utils/'))
    for d in syspath:
        prog = os.path.join(d,cmd)
        if os.access(prog, os.X_OK):
            return prog
    return ''
# Recursively look for file starting at base dir
def do_find_file(base, mod):
    fullname = os.path.join(base, mod)
    if os.access(fullname, os.R_OK):
        return fullname
    for d in os.listdir(base):
        dir = os.path.join(base,d)
        if os.path.isdir(dir):
            module = do_find_file(dir, mod)
            if module:
                return module

def find_module(src_dir, dev_dir, modname):
    modbase = src_dir +'/'+ dev_dir +'/'+ modname
    for modext in '.ko', '.o':
        module = modbase + modext
        try:
            if os.access(module, os.R_OK):
                return module
        except OSError:
            pass
    return None
# is the path a block device?
def is_block(path):
    s = ()
    try:
        s = os.stat(path)
    except OSError:
        return 0
    return stat.S_ISBLK(s[stat.ST_MODE])
# build fs according to type
def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
    block_cnt = ''
    jopt = ''
    iopt = ''
    if devsize:
        if devsize < 8000:
            panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
                  (dev, devsize))
        # devsize is in 1k, and fs block count is in 4k
        block_cnt = devsize/4

    if fstype in ('ext3', 'extN', 'ldiskfs'):
        # ext3 journal size is in megabytes
        if jsize == 0:
            if devsize == 0:
                if not is_block(dev):
                    ret, out = runcmd("ls -l %s" %dev)
                    devsize = int(string.split(out[0])[4]) / 1024
                else:
                    ret, out = runcmd("sfdisk -s %s" %dev)
                    devsize = int(out[0])
            if devsize > 1024 * 1024:
                jsize = ((devsize / 102400) * 4)
                if jsize > 400:
                    jsize = 400
        if jsize: jopt = "-J size=%d" %(jsize,)
        if isize: iopt = "-I %d" %(isize,)
        mkfs = 'mkfs.ext2 -j -b 4096 '
        if not isblock or config.force:
            mkfs = mkfs + ' -F '
    elif fstype == 'reiserfs':
        # reiserfs journal size is in blocks
        if jsize: jopt = "--journal_size %d" %(jsize,)
        mkfs = 'mkreiserfs -ff'
    else:
        panic('unsupported fs type: ', fstype)

    if config.mkfsoptions != None:
        mkfs = mkfs + ' ' + config.mkfsoptions
    if mkfsoptions != None:
        mkfs = mkfs + ' ' + mkfsoptions
    (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt)
    if ret:
        panic("Unable to build fs:", dev, string.join(out))
    # enable hash tree indexing on the filesystem
    if fstype in ('ext3', 'extN', 'ldiskfs'):
        htree = 'echo "feature FEATURE_C5" | debugfs -w'
        (ret, out) = run (htree, dev)
        if ret:
            panic("Unable to enable htree:", dev)
# some systems use /dev/loopN, some /dev/loop/N
def loop_base():
    loop = '/dev/loop'
    if not os.access(loop + str(0), os.R_OK):
        loop = loop + '/'
        if not os.access(loop + str(0), os.R_OK):
            panic ("can't access loop devices")
    return loop
# find loop device assigned to the file
def find_assigned_loop(file):
    loop = loop_base()
    for n in xrange(0, MAX_LOOP_DEVICES):
        dev = loop + str(n)
        if os.access(dev, os.R_OK):
            (stat, out) = run('losetup', dev)
            if out and stat == 0:
                m = re.search(r'\((.*)\)', out[0])
                if m and file == m.group(1):
                    return dev
    return ''
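# Illustrative parse (assumed losetup output format): given a line like
#   /dev/loop0: [0305]:12 (/tmp/ost1-data)
# re.search(r'\((.*)\)', ...) captures "/tmp/ost1-data", which is compared
# against the requested backing file.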
# create file if necessary and assign the first free loop device
def init_loop(file, size, fstype, journal_size, inode_size,
              mkfsoptions, reformat, backfstype, backfile):
    if fstype == 'smfs':
        realfile = backfile
        realfstype = backfstype
    else:
        realfile = file
        realfstype = fstype

    dev = find_assigned_loop(realfile)
    if dev:
        print 'WARNING file:', realfile, 'already mapped to', dev
        return dev

    if reformat or not os.access(realfile, os.R_OK | os.W_OK):
        if size < 8000:
            panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (realfile, size))
        (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, realfile))
        if ret:
            panic("Unable to create backing store:", realfile)
        mkfs(realfile, size, realfstype, journal_size, inode_size,
             mkfsoptions, isblock=0)

    loop = loop_base()
    # find next free loop
    for n in xrange(0, MAX_LOOP_DEVICES):
        dev = loop + str(n)
        if os.access(dev, os.R_OK):
            (stat, out) = run('losetup', dev)
            if stat:
                run('losetup', dev, realfile)
                return dev
        else:
            print "out of loop devices"
            return ''
    print "out of loop devices"
    return ''
# undo loop assignment
def clean_loop(file):
    dev = find_assigned_loop(file)
    if dev:
        ret, out = run('losetup -d', dev)
        if ret:
            log('unable to clean loop device:', dev, 'for file:', file)
            logall(out)

# determine if dev is formatted as a <fstype> filesystem
def need_format(fstype, dev):
    # FIXME don't know how to implement this
    return 0
# initialize a block device if needed
def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
              inode_size, mkfsoptions, backfstype, backdev):
    if config.noexec:
        return dev
    if fstype == 'smfs' or not is_block(dev):
        dev = init_loop(dev, size, fstype, journal_size, inode_size,
                        mkfsoptions, reformat, backfstype, backdev)
    elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
        mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
             isblock=1)
#    else:
#        panic("device:", dev,
#              "not prepared, and autoformat is not set.\n",
#              "Rerun with --reformat option to format ALL filesystems")
    return dev
875 """lookup IP address for an interface"""
876 rc, out = run("/sbin/ifconfig", iface)
879 addr = string.split(out[1])[1]
880 ip = string.split(addr, ':')[1]
def def_mount_options(fstype, target):
    """returns default mount options for passed fstype and target (mds, ost)"""
    if fstype == 'ext3' or fstype == 'ldiskfs':
        mountfsoptions = "errors=remount-ro"
        if target == 'ost' and sys_get_branch() == '2.4':
            mountfsoptions = "%s,asyncdel" % (mountfsoptions)
        return mountfsoptions
    return ""
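# Illustrative usage (hedged; depends on the running kernel): on a 2.4
# kernel def_mount_options('ext3', 'ost') returns
# "errors=remount-ro,asyncdel", while an MDS target or a 2.6 kernel gets
# plain "errors=remount-ro".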
def sys_get_elan_position_file():
    procfiles = ["/proc/elan/device0/position",
                 "/proc/qsnet/elan4/device0/position",
                 "/proc/qsnet/elan3/device0/position"]
    for p in procfiles:
        if os.access(p, os.R_OK):
            return p
    return ""
def sys_get_local_nid(net_type, wildcard, cluster_id):
    """Return the local nid."""
    if sys_get_elan_position_file():
        local = sys_get_local_address('elan', '*', cluster_id)
    else:
        local = sys_get_local_address(net_type, wildcard, cluster_id)
    return local
def sys_get_local_address(net_type, wildcard, cluster_id):
    """Return the local address for the network type."""
    local = ""
    if net_type in ('tcp',):
        if ':' in wildcard:
            iface, star = string.split(wildcard, ':')
            local = if2addr(iface)
            if not local:
                panic ("unable to determine ip for:", wildcard)
        else:
            host = socket.gethostname()
            local = socket.gethostbyname(host)
    elif net_type == 'elan':
        # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()'
        f = sys_get_elan_position_file()
        if not f:
            panic ("unable to determine local Elan ID")
        try:
            fp = open(f, 'r')
            lines = fp.readlines()
            fp.close()
            for l in lines:
                a = string.split(l)
                if a[0] == 'NodeId':
                    elan_id = a[1]
                    break
            try:
                nid = my_int(cluster_id) + my_int(elan_id)
                local = "%d" % (nid)
            except ValueError, e:
                local = elan_id
        except IOError, e:
            log(e)
    elif net_type == 'gm':
        fixme("automatic local address for GM")
    return local
def sys_get_branch():
    """Returns kernel release"""
    try:
        fp = open('/proc/sys/kernel/osrelease')
        lines = fp.readlines()
        fp.close()
        for l in lines:
            version = string.split(l)
            a = string.split(version[0], '.')
            return a[0] + '.' + a[1]
    except IOError, e:
        log(e)
    return ""
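# Example (illustrative): with /proc/sys/kernel/osrelease containing
# "2.4.20-30.9", version[0] is "2.4.20-30.9", the '.'-split gives
# ['2', '4', '20-30', '9'], and sys_get_branch() returns "2.4".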
def mod_loaded(modname):
    """Check if a module is already loaded. Look in /proc/modules for it."""
    try:
        fp = open('/proc/modules')
        lines = fp.readlines()
        fp.close()
        # please forgive my tired fingers for this one
        ret = filter(lambda word, mod=modname: word == mod,
                     map(lambda line: string.split(line)[0], lines))
        return ret
    except Exception, e:
        return 0
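# Reading the idiom above (illustrative): map() extracts the first column
# (the module name) from each /proc/modules line and filter() keeps exact
# matches, so a non-empty result means 'modname' is loaded. An equivalent
# membership test would be:
#   modname in map(lambda line: string.split(line)[0], lines)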
# XXX: instead of device_list, ask for $name and see what we get
def is_prepared(name):
    """Return true if a device exists for the name"""
    if config.lctl_dump:
        return 0
    if (config.noexec or config.record) and config.cleanup:
        return 1
    try:
        # expect this format:
        # 1 UP ldlm ldlm ldlm_UUID 2
        out = lctl.device_list()
        for s in out:
            if name == string.split(s)[3]:
                return 1
    except CommandError, e:
        e.dump()
    return 0
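# Illustrative parse (format from the comment above): for the line
# "1 UP ldlm ldlm ldlm_UUID 2", string.split(s) yields
# ['1', 'UP', 'ldlm', 'ldlm', 'ldlm_UUID', '2'], so field [3] is the
# device name compared against 'name'.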
def is_network_prepared():
    """If any device exists, then assume that all networking
       has been configured"""
    out = lctl.device_list()
    return len(out) > 0
def fs_is_mounted(path):
    """Return true if path is a mounted lustre filesystem"""
    try:
        fp = open('/proc/mounts')
        lines = fp.readlines()
        fp.close()
        for l in lines:
            a = string.split(l)
            if a[1] == path and a[2] == 'lustre_lite':
                return 1
    except IOError, e:
        log(e)
    return 0
1017 """Manage kernel modules"""
1018 def __init__(self, lustre_dir, portals_dir):
1019 self.lustre_dir = lustre_dir
1020 self.portals_dir = portals_dir
1021 self.kmodule_list = []
1023 def add_portals_module(self, dev_dir, modname):
1024 """Append a module to list of modules to load."""
1025 self.kmodule_list.append((self.portals_dir, dev_dir, modname))
1027 def add_lustre_module(self, dev_dir, modname):
1028 """Append a module to list of modules to load."""
1029 self.kmodule_list.append((self.lustre_dir, dev_dir, modname))
1031 def load_module(self):
1032 """Load all the modules in the list in the order they appear."""
1033 for src_dir, dev_dir, mod in self.kmodule_list:
1034 if mod_loaded(mod) and not config.noexec:
1036 log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
1038 module = find_module(src_dir, dev_dir, mod)
1040 panic('module not found:', mod)
1041 (rc, out) = run('/sbin/insmod', module)
1043 raise CommandError('insmod', out, rc)
1045 (rc, out) = run('/sbin/modprobe', mod)
1047 raise CommandError('modprobe', out, rc)
1049 def cleanup_module(self):
1050 """Unload the modules in the list in reverse order."""
1051 rev = self.kmodule_list
1053 for src_dir, dev_dir, mod in rev:
1054 if not mod_loaded(mod) and not config.noexec:
1057 if mod == 'portals' and config.dump:
1058 lctl.dump(config.dump)
1059 log('unloading module:', mod)
1060 (rc, out) = run('/sbin/rmmod', mod)
1062 log('! unable to unload module:', mod)
# ============================================================
# Classes to prepare and cleanup the various objects

class Module:
    """ Base class for the rest of the modules. The default cleanup method is
    defined here, as well as some utility funcs.
    """
    def __init__(self, module_name, db):
        self.db = db
        self.module_name = module_name
        self.name = self.db.getName()
        self.uuid = self.db.getUUID()
        self.kmod = kmod(config.lustre, config.portals)

    def info(self, *args):
        msg = string.join(map(str,args))
        print self.module_name + ":", self.name, self.uuid, msg

    def cleanup(self):
        """ default cleanup, used for most modules """
        self.info()
        try:
            lctl.cleanup(self.name, self.uuid, config.force)
        except CommandError, e:
            log(self.module_name, "cleanup failed: ", self.name)
            e.dump()
            cleanup_error(e.rc)

    def add_portals_module(self, dev_dir, modname):
        """Append a module to list of modules to load."""
        self.kmod.add_portals_module(dev_dir, modname)

    def add_lustre_module(self, dev_dir, modname):
        """Append a module to list of modules to load."""
        self.kmod.add_lustre_module(dev_dir, modname)

    def load_module(self):
        """Load all the modules in the list in the order they appear."""
        self.kmod.load_module()

    def cleanup_module(self):
        """Unload the modules in the list in reverse order."""
        if self.safe_to_clean():
            self.kmod.cleanup_module()

    def safe_to_clean(self):
        return 1

    def safe_to_clean_modules(self):
        return self.safe_to_clean()
class Network(Module):
    def __init__(self,db):
        Module.__init__(self, 'NETWORK', db)
        self.net_type = self.db.get_val('nettype')
        self.nid = self.db.get_val('nid', '*')
        self.cluster_id = self.db.get_val('clusterid', "0")
        self.port = self.db.get_val_int('port', 0)
        self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
        self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
        self.irq_affinity = self.db.get_val_int('irqaffinity', 0)

        if '*' in self.nid:
            self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
            if not self.nid:
                panic("unable to set nid for", self.net_type, self.nid, self.cluster_id)
            self.generic_nid = 1
            debug("nid:", self.nid)
        else:
            self.generic_nid = 0

        self.nid_uuid = self.nid_to_uuid(self.nid)

        self.hostaddr = self.db.get_val('hostaddr', self.nid)
        if '*' in self.hostaddr:
            self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id)
            if not self.hostaddr:
                panic("unable to set hostaddr for", self.net_type, self.hostaddr, self.cluster_id)
            debug("hostaddr:", self.hostaddr)

        self.add_portals_module("libcfs", 'libcfs')
        self.add_portals_module("portals", 'portals')
        if node_needs_router():
            self.add_portals_module("router", 'kptlrouter')
        if self.net_type == 'tcp':
            self.add_portals_module("knals/socknal", 'ksocknal')
        if self.net_type == 'elan':
            self.add_portals_module("knals/qswnal", 'kqswnal')
        if self.net_type == 'gm':
            self.add_portals_module("knals/gmnal", 'kgmnal')
    def nid_to_uuid(self, nid):
        return "NID_%s_UUID" %(nid,)

    def prepare(self):
        if is_network_prepared():
            return
        self.info(self.net_type, self.nid, self.port)
        if not (config.record and self.generic_nid):
            lctl.network(self.net_type, self.nid)
        if self.net_type == 'tcp':
            sys_tweak_socknal()
        if self.net_type == 'elan':
            sys_optimize_elan()
        if self.port and node_is_router():
            run_one_acceptor(self.port)
            self.connect_peer_gateways()
    def connect_peer_gateways(self):
        for router in self.db.lookup_class('node'):
            if router.get_val_int('router', 0):
                for netuuid in router.get_networks():
                    net = self.db.lookup(netuuid)
                    gw = Network(net)
                    if (gw.cluster_id == self.cluster_id and
                        gw.net_type == self.net_type):
                        if gw.nid != self.nid:
                            lctl.connect(gw)

    def disconnect_peer_gateways(self):
        for router in self.db.lookup_class('node'):
            if router.get_val_int('router', 0):
                for netuuid in router.get_networks():
                    net = self.db.lookup(netuuid)
                    gw = Network(net)
                    if (gw.cluster_id == self.cluster_id and
                        gw.net_type == self.net_type):
                        if gw.nid != self.nid:
                            try:
                                lctl.disconnect(gw)
                            except CommandError, e:
                                print "disconnect failed: ", self.name
                                e.dump()
                                cleanup_error(e.rc)

    def safe_to_clean(self):
        return not is_network_prepared()

    def cleanup(self):
        self.info(self.net_type, self.nid, self.port)
        if self.port:
            stop_acceptor(self.port)
        if node_is_router():
            self.disconnect_peer_gateways()
class RouteTable(Module):
    def __init__(self,db):
        Module.__init__(self, 'ROUTES', db)

    def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
                         lo, hi):
        # only setup connections for tcp NALs
        srvdb = None
        if not net_type in ('tcp',):
            return None

        # connect to target if route is to single node and this node is the gw
        if lo == hi and local_interface(net_type, gw_cluster_id, gw):
            if not local_cluster(net_type, tgt_cluster_id):
                panic("target", lo, " not on the local cluster")
            srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
        # connect to gateway if this node is not the gw
        elif (local_cluster(net_type, gw_cluster_id)
              and not local_interface(net_type, gw_cluster_id, gw)):
            srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
        else:
            return None

        if not srvdb:
            panic("no server for nid", lo)
            return None

        return Network(srvdb)

    def prepare(self):
        if is_network_prepared():
            return
        self.info()
        for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
            lctl.add_route(net_type, gw, lo, hi)
            srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
            if srv:
                lctl.connect(srv)

    def safe_to_clean(self):
        return not is_network_prepared()

    def cleanup(self):
        if is_network_prepared():
            # the network is still being used, don't clean it up
            return
        for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
            srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
            if srv:
                try:
                    lctl.disconnect(srv)
                except CommandError, e:
                    print "disconnect failed: ", self.name
                    e.dump()
                    cleanup_error(e.rc)
            try:
                lctl.del_route(net_type, gw, lo, hi)
            except CommandError, e:
                print "del_route failed: ", self.name
                e.dump()
                cleanup_error(e.rc)
class Management(Module):
    def __init__(self, db):
        Module.__init__(self, 'MGMT', db)
        self.add_lustre_module('lvfs', 'lvfs')
        self.add_lustre_module('obdclass', 'obdclass')
        self.add_lustre_module('ptlrpc', 'ptlrpc')
        self.add_lustre_module('mgmt', 'mgmt_svc')

    def prepare(self):
        if is_prepared(self.name):
            return
        self.info()
        lctl.newdev("mgmt", self.name, self.uuid)

    def safe_to_clean(self):
        return 1

    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
# This is only needed to load the modules; the LDLM device
# is now created automatically.
class LDLM(Module):
    def __init__(self,db):
        Module.__init__(self, 'LDLM', db)
        self.add_lustre_module('lvfs', 'lvfs')
        self.add_lustre_module('obdclass', 'obdclass')
        self.add_lustre_module('ptlrpc', 'ptlrpc')

    def prepare(self):
        return

    def cleanup(self):
        return
class LOV(Module):
    def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
        Module.__init__(self, 'LOV', db)
        if name_override != None:
            self.name = "lov_%s" % name_override
        self.add_lustre_module('lov', 'lov')
        self.mds_uuid = self.db.get_first_ref('mds')
        self.stripe_sz = self.db.get_val_int('stripesize', 1048576)
        self.stripe_off = self.db.get_val_int('stripeoffset', 0)
        self.pattern = self.db.get_val_int('stripepattern', 0)
        self.devlist = self.db.get_refs('obd')
        self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
        self.osclist = []
        self.desc_uuid = self.uuid
        self.uuid = generate_client_uuid(self.name)
        self.fs_name = fs_name
        if config_only:
            self.config_only = 1
            return
        self.config_only = None
        mds = self.db.lookup(self.mds_uuid)
        self.mds_name = mds.getName()
        for obd_uuid in self.devlist:
            obd = self.db.lookup(obd_uuid)
            osc = get_osc(obd, self.uuid, fs_name)
            if osc:
                self.osclist.append(osc)
            else:
                panic('osc not found:', obd_uuid)
    def prepare(self):
        if is_prepared(self.name):
            return
        if self.config_only:
            panic("Can't prepare config_only LOV ", self.name)
        for osc in self.osclist:
            try:
                # Only ignore connect failures with --force, which
                # isn't implemented here yet.
                osc.prepare(ignore_connect_failure=0)
            except CommandError, e:
                print "Error preparing OSC %s\n" % osc.uuid
                raise e
        self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
                  self.stripe_off, self.pattern, self.devlist, self.mds_name)
        lctl.lov_setup(self.name, self.uuid,
                       self.desc_uuid, self.mds_name, self.stripe_cnt,
                       self.stripe_sz, self.stripe_off, self.pattern,
                       string.join(self.devlist))

    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
        if self.config_only:
            panic("Can't clean up config_only LOV ", self.name)
        for osc in self.osclist:
            osc.cleanup()

    def load_module(self):
        if self.config_only:
            panic("Can't load modules for config_only LOV ", self.name)
        for osc in self.osclist:
            osc.load_module()
            break
        Module.load_module(self)

    def cleanup_module(self):
        if self.config_only:
            panic("Can't cleanup modules for config_only LOV ", self.name)
        Module.cleanup_module(self)
        for osc in self.osclist:
            osc.cleanup_module()
            break
class MDSDEV(Module):
    def __init__(self,db):
        Module.__init__(self, 'MDSDEV', db)
        self.devpath = self.db.get_val('devpath','')
        self.backdevpath = self.db.get_val('backdevpath','')
        self.size = self.db.get_val_int('devsize', 0)
        self.journal_size = self.db.get_val_int('journalsize', 0)
        self.fstype = self.db.get_val('fstype', '')
        self.backfstype = self.db.get_val('backfstype', '')
        self.nspath = self.db.get_val('nspath', '')
        self.mkfsoptions = self.db.get_val('mkfsoptions', '')
        self.mountfsoptions = self.db.get_val('mountfsoptions', '')
        # overwrite the original MDSDEV name and uuid with the MDS name and uuid
        target_uuid = self.db.get_first_ref('target')
        mds = self.db.lookup(target_uuid)
        self.name = mds.getName()
        self.filesystem_uuids = mds.get_refs('filesystem')
        # FIXME: if fstype not set, then determine based on kernel version
        self.format = self.db.get_val('autoformat', "no")
        if mds.get_val('failover', 0):
            self.failover_mds = 'f'
        else:
            self.failover_mds = 'n'
        active_uuid = get_active_target(mds)
        if not active_uuid:
            panic("No target device found:", target_uuid)
        if active_uuid == self.uuid:
            self.active = 1
        else:
            self.active = 0
        if self.active and config.group and config.group != mds.get_val('group'):
            self.active = 0

        self.inode_size = self.db.get_val_int('inodesize', 0)
        if self.inode_size == 0:
            # find the LOV for this MDS
            lovconfig_uuid = mds.get_first_ref('lovconfig')
            if not lovconfig_uuid:
                panic("No LOV config found for MDS ", mds.name)
            lovconfig = mds.lookup(lovconfig_uuid)
            lov_uuid = lovconfig.get_first_ref('lov')
            if not lov_uuid:
                panic("No LOV found for lovconfig ", lovconfig.name)
            lov = LOV(self.db.lookup(lov_uuid), lov_uuid, 'FS_name', config_only = 1)

            # default stripe count controls default inode_size
            stripe_count = lov.stripe_cnt
            if stripe_count > 77:
                self.inode_size = 4096
            elif stripe_count > 35:
                self.inode_size = 2048
            elif stripe_count > 13:
                self.inode_size = 1024
            elif stripe_count > 3:
                self.inode_size = 512
            else:
                self.inode_size = 256

        self.target_dev_uuid = self.uuid
        self.uuid = target_uuid

        # modules
        self.add_lustre_module('mdc', 'mdc')
        self.add_lustre_module('osc', 'osc')
        self.add_lustre_module('lov', 'lov')
        self.add_lustre_module('mds', 'mds')

        if self.fstype == 'smfs':
            self.add_lustre_module('smfs', 'smfs')

        if self.fstype == 'ldiskfs':
            self.add_lustre_module('ldiskfs', 'ldiskfs')

        if self.fstype:
            self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))

        # if fstype is smfs, then we should also take care about backing
        # store fs.
        if self.fstype == 'smfs':
            self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.backfstype))
    def load_module(self):
        if self.active:
            Module.load_module(self)
    def prepare(self):
        if is_prepared(self.name):
            return
        if not self.active:
            debug(self.uuid, "not active")
            return
        if config.reformat:
            # run write_conf automatically, if --reformat used
            self.write_conf()
        self.info(self.devpath, self.fstype, self.size, self.format)
        run_acceptors()
        # never reformat here
        blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
                           self.format, self.journal_size, self.inode_size,
                           self.mkfsoptions, self.backfstype, self.backdevpath)

        if not is_prepared('MDT'):
            lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")

        mountfsoptions = def_mount_options(self.fstype, 'mds')

        if config.mountfsoptions:
            if mountfsoptions:
                mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
            else:
                mountfsoptions = config.mountfsoptions
            if self.mountfsoptions:
                mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
        else:
            if self.mountfsoptions:
                if mountfsoptions:
                    mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
                else:
                    mountfsoptions = self.mountfsoptions

        if self.fstype == 'smfs':
            realdev = self.fstype
            if mountfsoptions:
                mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
                                                        self.backfstype,
                                                        blkdev)
            else:
                mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
                                                     blkdev)
        else:
            realdev = blkdev

        print 'MDS mount options: ' + mountfsoptions

        try:
            lctl.newdev("mds", self.name, self.uuid,
                        setup ="%s %s %s %s" %(realdev, self.fstype,
                                               self.name, mountfsoptions))
        except CommandError, e:
            if e.rc == 2:
                panic("MDS is missing the config log. Need to run " +
                      "lconf --write_conf.")
            else:
                raise e
    def write_conf(self):
        if is_prepared(self.name):
            return
        self.info(self.devpath, self.fstype, self.format)
        blkdev = block_dev(self.devpath, self.size, self.fstype,
                           config.reformat, self.format, self.journal_size,
                           self.inode_size, self.mkfsoptions, self.backfstype,
                           self.backdevpath)

        # Even for writing logs we mount mds with supplied mount options
        # because it will not mount smfs (if used) otherwise.

        mountfsoptions = def_mount_options(self.fstype, 'mds')

        if config.mountfsoptions:
            if mountfsoptions:
                mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
            else:
                mountfsoptions = config.mountfsoptions
            if self.mountfsoptions:
                mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
        else:
            if self.mountfsoptions:
                if mountfsoptions:
                    mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
                else:
                    mountfsoptions = self.mountfsoptions

        if self.fstype == 'smfs':
            realdev = self.fstype
            if mountfsoptions:
                mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
                                                        self.backfstype,
                                                        blkdev)
            else:
                mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
                                                     blkdev)
        else:
            realdev = blkdev

        print 'MDS mount options: ' + mountfsoptions

        # As mount options are passed by 4th param to config tool, we need
        # to pass something in 3rd param. But we do not want this 3rd param
        # be counted as a profile name for reading log on MDS setup, thus,
        # we pass there some predefined sign like 'dumb', which will be
        # checked in MDS code and skipped. Probably there is more nice way
        # like pass empty string and check it in config tool and pass null
        # as 4th param.
        lctl.newdev("mds", self.name, self.uuid,
                    setup ="%s %s %s %s" %(realdev, self.fstype,
                                           'dumb', mountfsoptions))
        # record logs for the MDS lov
        for uuid in self.filesystem_uuids:
            log("recording clients for filesystem:", uuid)
            fs = self.db.lookup(uuid)
            obd_uuid = fs.get_first_ref('obd')
            client_uuid = generate_client_uuid(self.name)
            client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
                          self.name)
            config.record = 1
            lctl.clear_log(self.name, self.name)
            lctl.record(self.name, self.name)
            client.prepare()
            lctl.mount_option(self.name, client.get_name(), "")
            lctl.end_record()

            config.cleanup = 1
            lctl.clear_log(self.name, self.name + '-clean')
            lctl.record(self.name, self.name + '-clean')
            client.cleanup()
            lctl.del_mount_option(self.name)
            lctl.end_record()
            config.cleanup = 0
            config.record = 0
        # record logs for each client
        if config.ldapurl:
            config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
        else:
            config_options = CONFIG_FILE

        for node_db in self.db.lookup_class('node'):
            client_name = node_db.getName()
            for prof_uuid in node_db.get_refs('profile'):
                prof_db = node_db.lookup(prof_uuid)
                # refactor this into a function to test "clientness"
                # of a node.
                for ref_class, ref_uuid in prof_db.get_all_refs():
                    if ref_class in ('mountpoint','echoclient'):
                        debug("recording", client_name)
                        old_noexec = config.noexec
                        config.noexec = 0
                        noexec_opt = ('', '-n')
                        ret, out = run (sys.argv[0],
                                        noexec_opt[old_noexec == 1],
                                        " -v --record --nomod",
                                        "--record_log", client_name,
                                        "--record_device", self.name,
                                        "--node", client_name,
                                        config_options)
                        if config.verbose:
                            for s in out: log("record> ", string.strip(s))
                        ret, out = run (sys.argv[0],
                                        noexec_opt[old_noexec == 1],
                                        "--cleanup -v --record --nomod",
                                        "--record_log", client_name + "-clean",
                                        "--record_device", self.name,
                                        "--node", client_name,
                                        config_options)
                        if config.verbose:
                            for s in out: log("record> ", string.strip(s))
                        config.noexec = old_noexec
        try:
            lctl.cleanup(self.name, self.uuid, 0, 0)
        except CommandError, e:
            log(self.module_name, "cleanup failed: ", self.name)
            e.dump()
            cleanup_error(e.rc)
        Module.cleanup(self)
        if self.fstype == 'smfs':
            clean_loop(self.backdevpath)
        else:
            clean_loop(self.devpath)

    def msd_remaining(self):
        out = lctl.device_list()
        for s in out:
            if string.split(s)[2] in ('mds',):
                return 1

    def safe_to_clean(self):
        return self.active

    def safe_to_clean_modules(self):
        return not self.msd_remaining()

    def cleanup(self):
        if not self.active:
            debug(self.uuid, "not active")
            return
        self.info()
        if is_prepared(self.name):
            try:
                lctl.cleanup(self.name, self.uuid, config.force,
                             config.failover)
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
                e.dump()
                cleanup_error(e.rc)
                Module.cleanup(self)
        if not self.msd_remaining() and is_prepared('MDT'):
            try:
                lctl.cleanup("MDT", "MDT_UUID", config.force,
                             config.failover)
            except CommandError, e:
                print "cleanup failed: ", self.name
                e.dump()
                cleanup_error(e.rc)
        if self.fstype == 'smfs':
            clean_loop(self.backdevpath)
        else:
            clean_loop(self.devpath)
class OSD(Module):
    def __init__(self, db):
        Module.__init__(self, 'OSD', db)
        self.osdtype = self.db.get_val('osdtype')
        self.devpath = self.db.get_val('devpath', '')
        self.backdevpath = self.db.get_val('backdevpath', '')
        self.size = self.db.get_val_int('devsize', 0)
        self.journal_size = self.db.get_val_int('journalsize', 0)
        self.inode_size = self.db.get_val_int('inodesize', 0)
        self.mkfsoptions = self.db.get_val('mkfsoptions', '')
        self.mountfsoptions = self.db.get_val('mountfsoptions', '')
        self.fstype = self.db.get_val('fstype', '')
        self.backfstype = self.db.get_val('backfstype', '')
        self.nspath = self.db.get_val('nspath', '')
        target_uuid = self.db.get_first_ref('target')
        ost = self.db.lookup(target_uuid)
        self.name = ost.getName()
        self.format = self.db.get_val('autoformat', 'yes')
        if ost.get_val('failover', 0):
            self.failover_ost = 'f'
        else:
            self.failover_ost = 'n'

        active_uuid = get_active_target(ost)
        if not active_uuid:
            panic("No target device found:", target_uuid)
        if active_uuid == self.uuid:
            self.active = 1
        else:
            self.active = 0
        if self.active and config.group and config.group != ost.get_val('group'):
            self.active = 0

        self.target_dev_uuid = self.uuid
        self.uuid = target_uuid
        # modules
        self.add_lustre_module('ost', 'ost')
        if self.fstype == 'smfs':
            self.add_lustre_module('smfs', 'smfs')
        # FIXME: should we default to ext3 here?
        if self.fstype == 'ldiskfs':
            self.add_lustre_module('ldiskfs', 'ldiskfs')
        if self.fstype:
            self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
        if self.fstype == 'smfs':
            self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.backfstype))

        self.add_lustre_module(self.osdtype, self.osdtype)
    def load_module(self):
        if self.active:
            Module.load_module(self)
    # need to check /proc/mounts and /etc/mtab before
    # formatting anything.
    # FIXME: check if device is already formatted.
    def prepare(self):
        if is_prepared(self.name):
            return
        if not self.active:
            debug(self.uuid, "not active")
            return
        self.info(self.osdtype, self.devpath, self.size, self.fstype,
                  self.format, self.journal_size, self.inode_size)
        run_acceptors()
        if self.osdtype == 'obdecho':
            blkdev = ''
        else:
            blkdev = block_dev(self.devpath, self.size, self.fstype,
                               config.reformat, self.format, self.journal_size,
                               self.inode_size, self.mkfsoptions, self.backfstype,
                               self.backdevpath)

        mountfsoptions = def_mount_options(self.fstype, 'ost')

        if config.mountfsoptions:
            if mountfsoptions:
                mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
            else:
                mountfsoptions = config.mountfsoptions
            if self.mountfsoptions:
                mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
        else:
            if self.mountfsoptions:
                if mountfsoptions:
                    mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
                else:
                    mountfsoptions = self.mountfsoptions

        if self.fstype == 'smfs':
            realdev = self.fstype
            if mountfsoptions:
                mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
                                                        self.backfstype,
                                                        blkdev)
            else:
                mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
                                                     blkdev)
        else:
            realdev = blkdev

        print 'OSD mount options: ' + mountfsoptions

        lctl.newdev(self.osdtype, self.name, self.uuid,
                    setup ="%s %s %s %s" %(realdev, self.fstype,
                                           self.failover_ost, mountfsoptions))
        if not is_prepared('OSS'):
            lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
    def osd_remaining(self):
        out = lctl.device_list()
        for s in out:
            if string.split(s)[2] in ('obdfilter', 'obdecho'):
                return 1

    def safe_to_clean(self):
        return self.active

    def safe_to_clean_modules(self):
        return not self.osd_remaining()

    def cleanup(self):
        if not self.active:
            debug(self.uuid, "not active")
            return
        if is_prepared(self.name):
            self.info()
            try:
                lctl.cleanup(self.name, self.uuid, config.force,
                             config.failover)
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
                e.dump()
                cleanup_error(e.rc)
        if not self.osd_remaining() and is_prepared('OSS'):
            try:
                lctl.cleanup("OSS", "OSS_UUID", config.force,
                             config.failover)
            except CommandError, e:
                print "cleanup failed: ", self.name
                e.dump()
                cleanup_error(e.rc)
        if not self.osdtype == 'obdecho':
            if self.fstype == 'smfs':
                clean_loop(self.backdevpath)
            else:
                clean_loop(self.devpath)
def mgmt_uuid_for_fs(mtpt_name):
    if not mtpt_name:
        return ''
    mtpt_db = toplustreDB.lookup_name(mtpt_name)
    fs_uuid = mtpt_db.get_first_ref('filesystem')
    fs = toplustreDB.lookup(fs_uuid)
    if not fs:
        return ''
    return fs.get_first_ref('mgmt')
# Generic client module, used by OSC and MDC
class Client(Module):
    def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
                 module_dir=None):
        self.target_name = tgtdb.getName()
        self.target_uuid = tgtdb.getUUID()
        self.db = tgtdb

        self.tgt_dev_uuid = get_active_target(tgtdb)
        if not self.tgt_dev_uuid:
            panic("No target device found for target:", self.target_name)

        self.kmod = kmod(config.lustre, config.portals)

        self.module = module
        self.module_name = string.upper(module)
        if not self_name:
            self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
                                         self.target_name, fs_name)
        else:
            self.name = self_name
        self.uuid = uuid
        self.lookup_server(self.tgt_dev_uuid)
        mgmt_uuid = mgmt_uuid_for_fs(fs_name)
        if mgmt_uuid:
            self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
        else:
            self.mgmt_name = ''
        self.fs_name = fs_name
        if not module_dir:
            module_dir = module
        self.add_lustre_module(module_dir, module)
    def lookup_server(self, srv_uuid):
        """ Lookup a server's network information """
        self._server_nets = get_ost_net(self.db, srv_uuid)
        if len(self._server_nets) == 0:
            panic ("Unable to find a server for:", srv_uuid)

    def get_servers(self):
        return self._server_nets
    def prepare(self, ignore_connect_failure = 0):
        self.info(self.target_uuid)
        if is_prepared(self.name):
            return
        try:
            srv = choose_local_server(self.get_servers())
            if srv:
                lctl.connect(srv)
            else:
                routes = find_route(self.get_servers())
                if len(routes) == 0:
                    panic ("no route to", self.target_uuid)
                for (srv, r) in routes:
                    lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
        except CommandError, e:
            if not ignore_connect_failure:
                raise e
        if srv:
            if self.target_uuid in config.inactive and self.permits_inactive():
                debug("%s inactive" % self.target_uuid)
                inactive_p = "inactive"
            else:
                debug("%s active" % self.target_uuid)
                inactive_p = ""
            lctl.newdev(self.module, self.name, self.uuid,
                        setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
                                                inactive_p, self.mgmt_name))
    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
            try:
                srv = choose_local_server(self.get_servers())
                if srv:
                    lctl.disconnect(srv)
                for (srv, r) in find_route(self.get_servers()):
                    lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
                e.dump()
                cleanup_error(e.rc)
class MDC(Client):
    def __init__(self, db, uuid, fs_name):
        Client.__init__(self, db, uuid, 'mdc', fs_name)

    def permits_inactive(self):
        return 0

class OSC(Client):
    def __init__(self, db, uuid, fs_name):
        Client.__init__(self, db, uuid, 'osc', fs_name)

    def permits_inactive(self):
        return 1

def mgmtcli_name_for_uuid(uuid):
    return 'MGMTCLI_%s' % uuid
class ManagementClient(Client):
    def __init__(self, db, uuid):
        Client.__init__(self, db, uuid, 'mgmt_cli', '',
                        self_name = mgmtcli_name_for_uuid(db.getUUID()),
                        module_dir = 'mgmt')
class COBD(Module):
    def __init__(self, db):
        Module.__init__(self, 'COBD', db)
        self.real_uuid = self.db.get_first_ref('realobd')
        self.cache_uuid = self.db.get_first_ref('cacheobd')
        self.add_lustre_module('cobd' , 'cobd')

    # need to check /proc/mounts and /etc/mtab before
    # formatting anything.
    # FIXME: check if device is already formatted.
    def prepare(self):
        if is_prepared(self.name):
            return
        self.info(self.real_uuid, self.cache_uuid)
        lctl.newdev("cobd", self.name, self.uuid,
                    setup ="%s %s" %(self.real_uuid, self.cache_uuid))
# virtual interface for OSC and LOV
class VOSC(Module):
    def __init__(self, db, uuid, fs_name, name_override = None):
        Module.__init__(self, 'VOSC', db)
        if db.get_class() == 'lov':
            self.osc = LOV(db, uuid, fs_name, name_override)
        else:
            self.osc = get_osc(db, uuid, fs_name)

    def get_uuid(self):
        return self.osc.uuid

    def get_name(self):
        return self.osc.name

    def prepare(self):
        self.osc.prepare()

    def cleanup(self):
        self.osc.cleanup()

    def load_module(self):
        self.osc.load_module()

    def cleanup_module(self):
        self.osc.cleanup_module()
class ECHO_CLIENT(Module):
    def __init__(self,db):
        Module.__init__(self, 'ECHO_CLIENT', db)
        self.add_lustre_module('obdecho', 'obdecho')
        self.obd_uuid = self.db.get_first_ref('obd')
        obd = self.db.lookup(self.obd_uuid)
        self.uuid = generate_client_uuid(self.name)
        self.osc = VOSC(obd, self.uuid, self.name)

    def prepare(self):
        if is_prepared(self.name):
            return
        run_acceptors()
        self.osc.prepare() # XXX This is so cheating. -p
        self.info(self.obd_uuid)

        lctl.newdev("echo_client", self.name, self.uuid,
                    setup = self.osc.get_name())

    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
        self.osc.cleanup()

    def load_module(self):
        self.osc.load_module()
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        self.osc.cleanup_module()
def generate_client_uuid(name):
    client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
                                           name,
                                           int(random.random() * 1048576),
                                           int(random.random() * 1048576))
    return client_uuid[:36]
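# Illustrative output (random fields vary): for name "mtpt" the format
# '%05x_%.19s_%05x%05x' might yield "3a1f2_mtpt_0b45c9d210"; %.19s caps the
# name at 19 characters and the [:36] slice keeps the UUID within 36 chars.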
class Mountpoint(Module):
    def __init__(self,db):
        Module.__init__(self, 'MTPT', db)
        self.path = self.db.get_val('path')
        self.fs_uuid = self.db.get_first_ref('filesystem')
        fs = self.db.lookup(self.fs_uuid)
        self.mds_uuid = fs.get_first_ref('mds')
        self.obd_uuid = fs.get_first_ref('obd')
        self.mgmt_uuid = fs.get_first_ref('mgmt')
        obd = self.db.lookup(self.obd_uuid)
        client_uuid = generate_client_uuid(self.name)
        self.vosc = VOSC(obd, client_uuid, self.name)
        self.mdc = get_mdc(db, client_uuid, self.name, self.mds_uuid)

        self.add_lustre_module('mdc', 'mdc')
        self.add_lustre_module('llite', 'llite')
        if self.mgmt_uuid:
            self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
                                            client_uuid)
        else:
            self.mgmtcli = None
    def prepare(self):
        if fs_is_mounted(self.path):
            log(self.path, "already mounted.")
            return
        if self.mgmtcli:
            self.mgmtcli.prepare()
        self.vosc.prepare()
        self.mdc.prepare()
        mdc_name = self.mdc.name

        self.info(self.path, self.mds_uuid, self.obd_uuid)
        if config.record or config.lctl_dump:
            lctl.mount_option(local_node_name, self.vosc.get_name(), mdc_name)
            return
        cmd = "mount -t lustre_lite -o osc=%s,mdc=%s %s %s" % \
              (self.vosc.get_name(), mdc_name, config.config, self.path)
        run("mkdir", self.path)
        ret, val = run(cmd)
        if ret:
            self.mdc.cleanup()
            self.vosc.cleanup()
            panic("mount failed:", self.path, ":", string.join(val))

    def cleanup(self):
        self.info(self.path, self.mds_uuid,self.obd_uuid)

        if config.record or config.lctl_dump:
            lctl.del_mount_option(local_node_name)
        else:
            if fs_is_mounted(self.path):
                if config.force:
                    (rc, out) = run("umount", "-f", self.path)
                else:
                    (rc, out) = run("umount", self.path)
                if rc:
                    raise CommandError('umount', out, rc)

            if fs_is_mounted(self.path):
                panic("fs is still mounted:", self.path)

        self.mdc.cleanup()
        self.vosc.cleanup()
        if self.mgmtcli:
            self.mgmtcli.cleanup()
    def load_module(self):
        if self.mgmtcli:
            self.mgmtcli.load_module()
        self.vosc.load_module()
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        self.vosc.cleanup_module()
        if self.mgmtcli:
            self.mgmtcli.cleanup_module()
# ============================================================
# misc query functions

def get_ost_net(self, osd_uuid):
    srv_list = []
    if not osd_uuid:
        return srv_list
    osd = self.lookup(osd_uuid)
    node_uuid = osd.get_first_ref('node')
    node = self.lookup(node_uuid)
    if not node:
        panic("unable to find node for osd_uuid:", osd_uuid,
              " node_ref:", node_uuid)
    for net_uuid in node.get_networks():
        db = node.lookup(net_uuid)
        srv_list.append(Network(db))
    return srv_list
# the order of initialization is based on level.
def getServiceLevel(self):
    type = self.get_class()
    ret = 0
    if type in ('network',):
        ret = 5
    elif type in ('routetbl',):
        ret = 6
    elif type in ('ldlm',):
        ret = 20
    elif type in ('mgmt',):
        ret = 25
    elif type in ('osd', 'cobd'):
        ret = 30
    elif type in ('mdsdev',):
        ret = 40
    elif type in ('mountpoint', 'echoclient'):
        ret = 70
    else:
        panic("Unknown type: ", type)

    if ret < config.minlevel or ret > config.maxlevel:
        ret = 0
    return ret
# return list of services in a profile. list is a list of tuples
# [(level, db_object),]
def getServices(self):
    list = []
    for ref_class, ref_uuid in self.get_all_refs():
        servdb = self.lookup(ref_uuid)
        if servdb:
            level = getServiceLevel(servdb)
            if level > 0:
                list.append((level, servdb))
        else:
            panic('service not found: ' + ref_uuid)

    list.sort()
    return list
############################################################
#
# FIXME: clean this mess up!
#
# OSC is no longer in the xml, so we have to fake it.
# this is getting ugly and begging for another refactoring
def get_osc(ost_db, uuid, fs_name):
    osc = OSC(ost_db, uuid, fs_name)
    return osc

def get_mdc(db, uuid, fs_name, mds_uuid):
    mds_db = db.lookup(mds_uuid);
    if not mds_db:
        panic("no mds:", mds_uuid)
    mdc = MDC(mds_db, uuid, fs_name)
    return mdc
############################################################
# routing ("rooting")

# list of (nettype, cluster_id, nid)
local_clusters = []

def find_local_clusters(node_db):
    global local_clusters
    for netuuid in node_db.get_networks():
        net = node_db.lookup(netuuid)
        srv = Network(net)
        debug("add_local", netuuid)
        local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
        if srv.port > 0:
            if acceptors.has_key(srv.port):
                panic("duplicate port:", srv.port)
            acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
                                                  srv.send_mem, srv.recv_mem,
                                                  srv.irq_affinity)
# This node is a gateway.
is_router = 0
def node_is_router():
    return is_router

# If there are any routers found in the config, then this will be true
# and all nodes will load kptlrouter.
needs_router = 0
def node_needs_router():
    return needs_router or is_router

# list of (nettype, gw, tgt_cluster_id, lo, hi)
# Currently, these local routes are only added to kptlrouter route
# table if they are needed to connect to a specific server.  This
# should be changed so all available routes are loaded, and the
# ptlrouter can make all the decisions.
local_routes = []
def find_local_routes(lustre):
    """ Scan the lustre config looking for routers.  Build list of
    routes. """
    global local_routes, needs_router
    local_routes = []
    list = lustre.lookup_class('node')
    for router in list:
        if router.get_val_int('router', 0):
            needs_router = 1
            for (local_type, local_cluster_id, local_nid) in local_clusters:
                gw = None
                for netuuid in router.get_networks():
                    db = router.lookup(netuuid)
                    if (local_type == db.get_val('nettype') and
                        local_cluster_id == db.get_val('clusterid')):
                        gw = db.get_val('nid')
                        break
                if gw:
                    debug("find_local_routes: gw is", gw)
                    for route in router.get_local_routes(local_type, gw):
                        local_routes.append(route)
    debug("find_local_routes:", local_routes)
def choose_local_server(srv_list):
    for srv in srv_list:
        if local_cluster(srv.net_type, srv.cluster_id):
            return srv

def local_cluster(net_type, cluster_id):
    for cluster in local_clusters:
        if net_type == cluster[0] and cluster_id == cluster[1]:
            return 1
    return 0

def local_interface(net_type, cluster_id, nid):
    for cluster in local_clusters:
        if (net_type == cluster[0] and cluster_id == cluster[1]
            and nid == cluster[2]):
            return 1
    return 0

def find_route(srv_list):
    result = []
    frm_type = local_clusters[0][0]
    for srv in srv_list:
        debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
        to_type = srv.net_type
        to = srv.nid
        cluster_id = srv.cluster_id
        debug ('looking for route to', to_type, to)
        for r in local_routes:
            debug("find_route: ", r)
            if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
                result.append((srv, r))
    return result
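# Illustrative match (tuple layout from the comment above:
# (nettype, gw, tgt_cluster_id, lo, hi)): a server nid falling within
# [r[3], r[4]] on cluster r[2] is reachable via gateway r[1], so the pair
# (srv, r) is later handed to lctl.add_route_host() by Client.prepare().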
def get_active_target(db):
    target_uuid = db.getUUID()
    target_name = db.getName()
    node_name = get_select(target_name)
    if node_name:
        tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
    else:
        tgt_dev_uuid = db.get_first_ref('active')
    return tgt_dev_uuid

def get_server_by_nid_uuid(db, nid_uuid):
    for n in db.lookup_class("network"):
        net = Network(n)
        if net.nid_uuid == nid_uuid:
            return net
    return None
############################################################
def newService(db):
    type = db.get_class()
    debug('Service:', type, db.getName(), db.getUUID())
    n = None
    if type == 'ldlm':
        n = LDLM(db)
    elif type == 'lov':
        n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
    elif type == 'network':
        n = Network(db)
    elif type == 'routetbl':
        n = RouteTable(db)
    elif type == 'osd':
        n = OSD(db)
    elif type == 'cobd':
        n = COBD(db)
    elif type == 'mdsdev':
        n = MDSDEV(db)
    elif type == 'mountpoint':
        n = Mountpoint(db)
    elif type == 'echoclient':
        n = ECHO_CLIENT(db)
    elif type == 'mgmt':
        n = Management(db)
    else:
        panic ("unknown service type:", type)
    return n
#
# Prepare the system to run lustre using a particular profile
# in the configuration.
#  * load the modules
#  * setup networking for the current node
#  * make sure partitions are in place and prepared
#  * initialize devices with lctl
# Levels are important, and need to be enforced.
def for_each_profile(db, prof_list, operation):
    for prof_uuid in prof_list:
        prof_db = db.lookup(prof_uuid)
        if not prof_db:
            panic("profile:", prof_uuid, "not found.")
        services = getServices(prof_db)
        operation(services)
def doWriteconf(services):
    if config.nomod:
        return
    for s in services:
        if s[1].get_class() == 'mdsdev':
            n = newService(s[1])
            n.write_conf()

def doSetup(services):
    if config.nosetup:
        return
    for s in services:
        n = newService(s[1])
        n.prepare()

def doModules(services):
    if config.nomod:
        return
    for s in services:
        n = newService(s[1])
        n.load_module()

def doCleanup(services):
    if config.nosetup:
        return
    services.reverse()
    for s in services:
        n = newService(s[1])
        if n.safe_to_clean():
            n.cleanup()

def doUnloadModules(services):
    if config.nomod:
        return
    services.reverse()
    for s in services:
        n = newService(s[1])
        if n.safe_to_clean_modules():
            n.cleanup_module()
def doHost(lustreDB, hosts):
    global is_router, local_node_name
    node_db = None
    for h in hosts:
        node_db = lustreDB.lookup_name(h, 'node')
        if node_db:
            break
    if not node_db:
        panic('No host entry found.')

    local_node_name = node_db.get_val('name', 0)
    is_router = node_db.get_val_int('router', 0)
    lustre_upcall = node_db.get_val('lustreUpcall', '')
    portals_upcall = node_db.get_val('portalsUpcall', '')
    timeout = node_db.get_val_int('timeout', 0)
    ptldebug = node_db.get_val('ptldebug', '')
    subsystem = node_db.get_val('subsystem', '')

    find_local_clusters(node_db)
    if not is_router:
        find_local_routes(lustreDB)
    # Two step process: (1) load modules, (2) setup lustre
    # if not cleaning, load modules first.
    prof_list = node_db.get_refs('profile')

    if config.write_conf:
        for_each_profile(node_db, prof_list, doModules)
        for_each_profile(node_db, prof_list, doWriteconf)
        for_each_profile(node_db, prof_list, doUnloadModules)

    elif config.recover:
        if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
            raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
                                     "--client_uuid <UUID> --conn_uuid <UUID>")
        doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
                   config.conn_uuid)

    elif config.cleanup:
        if config.force:
            # the command line can override this value
            timeout = 5
        # ugly hack, only need to run lctl commands for --dump
        if config.lctl_dump or config.record:
            for_each_profile(node_db, prof_list, doCleanup)
            return

        sys_set_timeout(timeout)
        sys_set_ptldebug(ptldebug)
        sys_set_subsystem(subsystem)
        sys_set_lustre_upcall(lustre_upcall)
        sys_set_portals_upcall(portals_upcall)

        for_each_profile(node_db, prof_list, doCleanup)
        for_each_profile(node_db, prof_list, doUnloadModules)

    else:
        # ugly hack, only need to run lctl commands for --dump
        if config.lctl_dump or config.record:
            sys_set_timeout(timeout)
            sys_set_lustre_upcall(lustre_upcall)
            for_each_profile(node_db, prof_list, doSetup)
            return

        sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
        sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)

        for_each_profile(node_db, prof_list, doModules)

        sys_set_debug_path()
        sys_set_ptldebug(ptldebug)
        sys_set_subsystem(subsystem)
        script = config.gdb_script
        run(lctl.lctl, ' modules >', script)
        if config.gdb:
            log ("The GDB module script is in", script)
            # pause, so user has time to break and
            # load the script
            time.sleep(5)
        sys_set_timeout(timeout)
        sys_set_lustre_upcall(lustre_upcall)
        sys_set_portals_upcall(portals_upcall)

        for_each_profile(node_db, prof_list, doSetup)

def doRecovery(lustreDB, lctl, tgt_uuid, client_uuid, nid_uuid):
    tgt = lustreDB.lookup(tgt_uuid)
    if not tgt:
        raise Lustre.LconfError("doRecovery: " + tgt_uuid + " not found.")
    new_uuid = get_active_target(tgt)
    if not new_uuid:
        raise Lustre.LconfError("doRecovery: no active target found for: " +
                                tgt_uuid)
    net = choose_local_server(get_ost_net(lustreDB, new_uuid))
    if not net:
        raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)

    log("Reconnecting", tgt_uuid, " to ", net.nid_uuid)
    oldnet = get_server_by_nid_uuid(lustreDB, nid_uuid)
    if oldnet:
        try:
            lctl.disconnect(oldnet)
        except CommandError, e:
            log("recover: disconnect", nid_uuid, "failed: ")
            e.dump()

    try:
        lctl.connect(net)
    except CommandError, e:
        log("recover: connect failed")
        e.dump()

    lctl.recover(client_uuid, net.nid_uuid)
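
# Recovery sequence summary: look up the failed target, find its currently
# active replacement, tear down the stale connection (errors are only
# logged, since the old server may already be dead), connect to the new
# server, and finally tell the client to resend. A hypothetical invocation,
# with UUIDs as they would arrive from --tgt_uuid/--client_uuid/--conn_uuid:
#
#   doRecovery(lustreDB, lctl, 'ost1_UUID', 'client_UUID', 'NID_nodeA_UUID')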

def setupModulePath(cmd, portals_dir = PORTALS_DIR):
    base = os.path.dirname(cmd)
    if development_mode():
        if not config.lustre:
            debug('using objdir module paths')
            config.lustre = (os.path.join(base, ".."))
        # normalize the portals dir, using command line arg if set
        if config.portals:
            portals_dir = config.portals
        dir = os.path.join(config.lustre, portals_dir)
        config.portals = dir
        debug('config.portals', config.portals)
    elif config.lustre and config.portals:
        # production mode
        # if --lustre and --portals, normalize portals
        # can ignore PORTALS_DIR here, since it is probably useless here
        config.portals = os.path.join(config.lustre, config.portals)
        debug('config.portals B', config.portals)
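
# Example (hypothetical paths): running ./lconf from a source checkout with
# development_mode() true and neither --lustre nor --portals given yields
#   config.lustre  = "<checkout>/utils/.."
#   config.portals = "<checkout>/utils/../portals"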

def sysctl(path, val):
    debug("+ sysctl", path, val)
    if config.noexec:
        return
    try:
        fp = open(os.path.join('/proc/sys', path), 'w')
        fp.write(str(val))
        fp.close()
    except IOError, e:
        panic(str(e))
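
# Example: sysctl('portals/debug', '0x400') writes "0x400" to
# /proc/sys/portals/debug (skipped entirely under --noexec).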

def sys_set_debug_path():
    sysctl('portals/debug_path', config.debug_path)

def sys_set_lustre_upcall(upcall):
    # the command line overrides the value in the node config
    if config.lustre_upcall:
        upcall = config.lustre_upcall
    elif config.upcall:
        upcall = config.upcall
    if upcall:
        lctl.set_lustre_upcall(upcall)

def sys_set_portals_upcall(upcall):
    # the command line overrides the value in the node config
    if config.portals_upcall:
        upcall = config.portals_upcall
    elif config.upcall:
        upcall = config.upcall
    if upcall:
        sysctl('portals/upcall', upcall)

def sys_set_timeout(timeout):
    # the command line overrides the value in the node config
    if config.timeout and config.timeout > 0:
        timeout = config.timeout
    if timeout is not None and timeout > 0:
        lctl.set_timeout(timeout)

def sys_tweak_socknal():
    if config.single_socket:
        sysctl("socknal/typed", 0)

def sys_optimize_elan():
    procfiles = ["/proc/elan/config/eventint_punt_loops",
                 "/proc/qsnet/elan3/config/eventint_punt_loops",
                 "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]
    for p in procfiles:
        if os.access(p, os.R_OK):
            run("echo 0 > " + p)

def sys_set_ptldebug(ptldebug):
    if config.ptldebug:
        ptldebug = config.ptldebug
    if ptldebug:
        try:
            val = eval(ptldebug, ptldebug_names)
            val = "0x%x" % (val)
            sysctl('portals/debug', val)
        except NameError, e:
            panic(str(e))
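
# The eval() trick above treats a string like "warning|dlmtrace" as a Python
# expression whose names resolve through the ptldebug_names table, so '|'
# simply ORs the kp30.h bit values together, e.g.:
#
#   eval("warning|dlmtrace", ptldebug_names)   # (1 << 10) | (1 << 16)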

def sys_set_subsystem(subsystem):
    if config.subsystem:
        subsystem = config.subsystem
    if subsystem:
        try:
            val = eval(subsystem, subsystem_names)
            val = "0x%x" % (val)
            sysctl('portals/subsystem_debug', val)
        except NameError, e:
            panic(str(e))

def sys_set_netmem_max(path, max):
    debug("setting", path, "to at least", max)
    if config.noexec:
        return
    fp = open(path)
    cur = int(fp.readline())
    fp.close()
    if max > cur:
        fp = open(path, 'w')
        fp.write('%d\n' % (max))
        fp.close()
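
# Example: sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF) only
# ever grows the limit; if the current rmem_max already exceeds MAXTCPBUF,
# nothing is written.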

def sys_make_devices():
    if not os.access('/dev/portals', os.R_OK):
        run('mknod /dev/portals c 10 240')
    if not os.access('/dev/obd', os.R_OK):
        run('mknod /dev/obd c 10 241')

# Add dir to the global PATH, if not already there.
def add_to_path(new_dir):
    syspath = string.split(os.environ['PATH'], ':')
    if new_dir in syspath:
        return
    os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir

def default_debug_path():
    path = '/tmp/lustre-log'
    if os.path.isdir('/r'):
        return '/r' + path
    else:
        return path

def default_gdb_script():
    script = '/tmp/ogdb'
    if os.path.isdir('/r'):
        return '/r' + script
    else:
        return script

DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
# ensure basic elements are in the system path
def sanitise_path():
    for dir in DEFAULT_PATH:
        add_to_path(dir)

# global hack for the --select handling
tgt_select = {}
def init_select(args):
    # args = [service=nodeA,service2=nodeB service3=nodeC]
    global tgt_select
    for arg in args:
        list = string.split(arg, ',')
        for entry in list:
            srv, node = string.split(entry, '=')
            tgt_select[srv] = node

def get_select(srv):
    if tgt_select.has_key(srv):
        return tgt_select[srv]
    return None
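
# Example: "--select mds1=nodeA,ost1=nodeB" arrives here as the list
# ['mds1=nodeA,ost1=nodeB'] and produces
#   tgt_select = {'mds1': 'nodeA', 'ost1': 'nodeB'}
# (the service names here are hypothetical).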

FLAG = Lustre.Options.FLAG
PARAM = Lustre.Options.PARAM
INTPARAM = Lustre.Options.INTPARAM
PARAMLIST = Lustre.Options.PARAMLIST
lconf_options = [
    ('verbose,v', "Print system commands as they are run"),
    ('ldapurl', "LDAP server URL, e.g. ldap://localhost", PARAM),
    ('config', "Cluster config name used for LDAP query", PARAM),
    ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
    ('node', "Load config for <nodename>", PARAM),
    ('cleanup,d', "Cleans up config. (Shutdown)"),
    ('force,f', "Forced unmounting and/or obd detach during cleanup",
     FLAG, 0),
    ('single_socket', "socknal option: only use one socket instead of bundle",
     FLAG, 0),
    ('failover', """Used to shut down without saving state.
                    This will allow this node to "give up" a service to
                    another node for failover purposes. This will not
                    be a clean shutdown.""",
     FLAG, 0),
    ('gdb', """Prints message after creating gdb module script
               and sleeps for 5 seconds."""),
    ('noexec,n', """Prints the commands and steps that will be run for a
                    config without executing them. This can be used to check
                    if a config file is doing what it should be doing"""),
    ('nomod', "Skip load/unload module step."),
    ('nosetup', "Skip device setup/cleanup step."),
    ('reformat', "Reformat all devices (without question)"),
    ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
    ('mountfsoptions', "Additional options for mount fs command line", PARAM),
    ('dump', "Dump the kernel debug log to file before portals is unloaded",
     PARAM),
    ('write_conf', "Save all the client config information on mds."),
    ('record', "Write config information on mds."),
    ('record_log', "Name of config record log.", PARAM),
    ('record_device', "MDS device name that will record the config commands",
     PARAM),
    ('minlevel', "Minimum level of services to configure/cleanup",
     INTPARAM, 0),
    ('maxlevel', """Maximum level of services to configure/cleanup
                    Levels are approximately like:
                            10 - network
                            20 - device, ldlm
                            30 - osd, mdd
                            40 - mds, ost
                            50 - mdc, osc
                            60 - lov
                            70 - mountpoint, echo_client, osc, mdc, lov""",
     INTPARAM, 100),
    ('lustre', """Base directory of lustre sources. This parameter will
                  cause lconf to load modules from a source tree.""", PARAM),
    ('portals', """Portals source directory. If this is a relative path,
                   then it is assumed to be relative to lustre. """, PARAM),
    ('timeout', "Set recovery timeout", INTPARAM),
    ('upcall', "Set both portals and lustre upcall script", PARAM),
    ('lustre_upcall', "Set lustre upcall script", PARAM),
    ('portals_upcall', "Set portals upcall script", PARAM),
    ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
    ('ptldebug', "Set the portals debug level", PARAM),
    ('subsystem', "Set the portals debug subsystem", PARAM),
    ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
    ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
    # Client recovery options
    ('recover', "Recover a device"),
    ('group', "The group of devices to configure or cleanup", PARAM),
    ('tgt_uuid', "The failed target (required for recovery)", PARAM),
    ('client_uuid', "The failed client (required for recovery)", PARAM),
    ('conn_uuid', "The failed connection (required for recovery)", PARAM),
    ('inactive', """The name of an inactive service, to be ignored during
                    mounting (currently OST-only). Can be repeated.""",
     PARAMLIST),
    ]

def main():
    global lctl, config, toplustreDB, CONFIG_FILE

    # in the upcall this is set to SIG_IGN
    signal.signal(signal.SIGCHLD, signal.SIG_DFL)

    cl = Lustre.Options("lconf", "config.xml", lconf_options)
    try:
        config, args = cl.parse(sys.argv[1:])
    except Lustre.OptionError, e:
        print e
        sys.exit(1)

    setupModulePath(sys.argv[0])

    host = socket.gethostname()

    # the PRNG is normally seeded with time(), which is not so good for starting
    # time-synchronized clusters
    input = open('/dev/urandom', 'r')
    if not input:
        print 'Unable to open /dev/urandom!'
        sys.exit(1)
    seed = input.read(32)
    input.close()
    random.seed(seed)

    sanitise_path()

    init_select(config.select)

    if len(args) > 0:
        # allow config to be fetched via HTTP, but only with python2
        if sys.version[0] != '1' and args[0].startswith('http://'):
            import urllib2
            try:
                config_file = urllib2.urlopen(args[0])
            except (urllib2.URLError, socket.error), err:
                if hasattr(err, 'args'):
                    err = err.args[1]
                print "Could not access '%s': %s" % (args[0], err)
                sys.exit(1)
        elif not os.access(args[0], os.R_OK):
            print 'File not found or readable:', args[0]
            sys.exit(1)
        else:
            # regular file
            config_file = open(args[0], 'r')
        try:
            dom = xml.dom.minidom.parse(config_file)
        except Exception:
            panic("%s does not appear to be a config file." % (args[0]))
            sys.exit(1) # make sure to die here, even in debug mode.
        CONFIG_FILE = args[0]
        lustreDB = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
        if not config.config:
            config.config = os.path.basename(args[0]) # use full path?
            if config.config[-4:] == '.xml':
                config.config = config.config[:-4]
    elif config.ldapurl:
        if not config.config:
            panic("--ldapurl requires --config name")
        dn = "config=%s,fs=lustre" % (config.config)
        lustreDB = Lustre.LustreDB_LDAP('', {}, base=dn, url=config.ldapurl)
    elif config.ptldebug or config.subsystem:
        sys_set_ptldebug(None)
        sys_set_subsystem(None)
        sys.exit(0)
    else:
        print 'Missing config file or ldap URL.'
        print 'see lconf --help for command summary'
        sys.exit(1)

    toplustreDB = lustreDB

    ver = lustreDB.get_version()
    if not ver:
        panic("No version found in config data, please recreate.")
    if ver != Lustre.CONFIG_VERSION:
        panic("Config version", ver, "does not match lconf version",
              Lustre.CONFIG_VERSION)

    node_list = []
    if config.node:
        node_list.append(config.node)
    else:
        if len(host) > 0:
            node_list.append(host)
        node_list.append('localhost')

    debug("configuring for host: ", node_list)

    if len(host) > 0:
        config.debug_path = config.debug_path + '-' + host
        config.gdb_script = config.gdb_script + '-' + host

    lctl = LCTLInterface('lctl')

    if config.lctl_dump:
        lctl.use_save_file(config.lctl_dump)

    if config.record:
        if not (config.record_device and config.record_log):
            panic("When recording, both --record_log and --record_device must be specified.")
        lctl.clear_log(config.record_device, config.record_log)
        lctl.record(config.record_device, config.record_log)
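
    # Record mode example (hypothetical names): a replayable config log can
    # be captured with
    #   lconf --record --record_device mds1 --record_log client-log config.xml
    # which clears any existing log of that name before recording starts.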

    doHost(lustreDB, node_list)

    if config.record:
        lctl.end_record()

if __name__ == "__main__":
    try:
        main()
    except Lustre.LconfError, e:
        print e
        # traceback.print_exc(file=sys.stdout)
        sys.exit(1)
    except CommandError, e:
        e.dump()
        sys.exit(e.rc)

    if first_cleanup_error:
        sys.exit(first_cleanup_error)