# Copyright (C) 2002-2003 Cluster File Systems, Inc.
# Authors: Robert Read <rread@clusterfs.com>
#          Mike Shaver <shaver@clusterfs.com>
#
# This file is part of Lustre, http://www.lustre.org.
#
# Lustre is free software; you can redistribute it and/or
# modify it under the terms of version 2 of the GNU General Public
# License as published by the Free Software Foundation.
#
# Lustre is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Lustre; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# lconf - lustre configuration tool
#
# lconf is the main driver script for starting and stopping
# lustre filesystem services.
#
# Based in part on the XML obdctl modifications done by Brian Behlendorf
import sys, getopt, types
import string, os, stat, popen2, socket, time, random, fcntl, select
import re, exceptions, signal, traceback
import xml.dom.minidom

if sys.version[0] == '1':
    from FCNTL import F_GETFL, F_SETFL
else:
    from fcntl import F_GETFL, F_SETFL
PYMOD_DIR = "/usr/lib/lustre/python"

def development_mode():
    base = os.path.dirname(sys.argv[0])
    if os.access(base+"/Makefile", os.R_OK):
        return 1
    return 0

if development_mode():
    sys.path.append('../utils')
else:
    sys.path.append(PYMOD_DIR)

import Lustre
DEFAULT_TCPBUF = 8388608
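# Note: 8388608 bytes = 8 MB; this value seeds the 'sendmem'/'recvmem'
# socket buffer defaults used for TCP networks below.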
# Maximum number of devices to search for.
# (the /dev/loop* nodes need to be created beforehand)
MAX_LOOP_DEVICES = 256
PORTALS_DIR = 'portals'

# Needed to call lconf --record
CONFIG_FILE = ""
# Please keep these in sync with the values in portals/kp30.h
ptldebug_names = {
    "warning" :   (1 << 10),
    "portals" :   (1 << 14),
    "dlmtrace" :  (1 << 16),
    "rpctrace" :  (1 << 20),
    "vfstrace" :  (1 << 21),
    }

subsystem_names = {
    "undefined" : (1 << 0),
    "portals" :   (1 << 10),
    "socknal" :   (1 << 11),
    "qswnal" :    (1 << 12),
    "pinger" :    (1 << 13),
    "filter" :    (1 << 14),
    "ptlrouter" : (1 << 20),
    }
first_cleanup_error = 0
def cleanup_error(rc):
    global first_cleanup_error
    if not first_cleanup_error:
        first_cleanup_error = rc
# ============================================================
# debugging and error funcs

def fixme(msg = "this feature"):
    raise Lustre.LconfError, msg + ' not implemented yet.'

def panic(*args):
    msg = string.join(map(str,args))
    if not config.noexec:
        raise Lustre.LconfError(msg)
    else:
        print "! " + msg

def log(*args):
    msg = string.join(map(str,args))
    print msg

def logall(msgs):
    for s in msgs:
        print string.strip(s)

def debug(*args):
    if config.verbose:
        msg = string.join(map(str,args))
        print msg

# ack, python's builtin int() does not support '0x123' syntax.
# eval can do it, although what a hack!
def my_int(s):
    try:
        if s[0:2] == '0x':
            return eval(s, {}, {})
        else:
            return int(s)
    except SyntaxError, e:
        raise ValueError("not a number")
    except TypeError, e:
        raise ValueError("not a number")
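# A minimal alternative sketch that avoids eval(): int() with an explicit
# base of 0 auto-detects '0x' (hex) and leading-zero (octal) prefixes, so
# no code evaluation is needed. my_int_safe is a hypothetical helper, not
# part of the original tool.
def my_int_safe(s):
    try:
        return int(s, 0)
    except (ValueError, TypeError):
        raise ValueError("not a number")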
# ============================================================
# locally defined exceptions
class CommandError (exceptions.Exception):
    def __init__(self, cmd_name, cmd_err, rc=None):
        self.cmd_name = cmd_name
        self.cmd_err = cmd_err
        self.rc = rc

    def dump(self):
        if type(self.cmd_err) == types.StringType:
            if self.rc:
                print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
            else:
                print "! %s: %s" % (self.cmd_name, self.cmd_err)
        elif type(self.cmd_err) == types.ListType:
            if self.rc:
                print "! %s (error %d):" % (self.cmd_name, self.rc)
            else:
                print "! %s:" % (self.cmd_name)
            for s in self.cmd_err:
                print "> %s" %(string.strip(s))
        else:
            print self.cmd_err
# ============================================================
# handle daemons, like the acceptor
class DaemonHandler:
    """ Manage starting and stopping a daemon. Assumes the daemon manages
    its own pid file. """
    def __init__(self, cmd):
        self.command = cmd
        self.path = ""

    def start(self):
        if self.running():
            log(self.command, "already running.")
        if not self.path:
            self.path = find_prog(self.command)
            if not self.path:
                panic(self.command, "not found.")
        ret, out = runcmd(self.path +' '+ self.command_line())
        if ret:
            raise CommandError(self.path, out, ret)

    def stop(self):
        if self.running():
            pid = self.read_pidfile()
            try:
                log("killing process", pid)
                os.kill(pid, 15)
                #time.sleep(1) # let daemon die
            except OSError, e:
                log("unable to kill", self.command, e)
            if self.running():
                log("unable to kill", self.command)

    def running(self):
        pid = self.read_pidfile()
        if pid:
            try:
                os.kill(pid, 0)
            except OSError:
                self.clean_pidfile()
            else:
                return 1
        return 0

    def read_pidfile(self):
        try:
            fp = open(self.pidfile(), 'r')
            pid = int(fp.read())
            fp.close()
            return pid
        except IOError:
            return 0

    def clean_pidfile(self):
        """ Remove a stale pidfile """
        log("removing stale pidfile:", self.pidfile())
        try:
            os.unlink(self.pidfile())
        except OSError, e:
            log(self.pidfile(), e)
class AcceptorHandler(DaemonHandler):
    def __init__(self, port, net_type, send_mem, recv_mem, irq_aff):
        DaemonHandler.__init__(self, "acceptor")
        self.port = port
        self.flags = ''
        self.send_mem = send_mem
        self.recv_mem = recv_mem

        if irq_aff:
            self.flags = self.flags + ' -i'

    def pidfile(self):
        return "/var/run/%s-%d.pid" % (self.command, self.port)

    def command_line(self):
        return string.join(map(str,('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port)))
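    # For example, with the default 8 MB buffers, irq affinity enabled and
    # the usual acceptor port 988, start() ends up executing roughly:
    #   acceptor -s 8388608 -r 8388608 -i 988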
acceptors = {}

# start the acceptors
def run_acceptors():
    if config.lctl_dump or config.record:
        return
    for port in acceptors.keys():
        daemon = acceptors[port]
        if not daemon.running():
            daemon.start()

def run_one_acceptor(port):
    if config.lctl_dump or config.record:
        return
    if acceptors.has_key(port):
        daemon = acceptors[port]
        if not daemon.running():
            daemon.start()
    else:
        panic("run_one_acceptor: No acceptor defined for port:", port)

def stop_acceptor(port):
    if acceptors.has_key(port):
        daemon = acceptors[port]
        if daemon.running():
            daemon.stop()
# ============================================================
# handle lctl interface
class LCTLInterface:
    """
    Manage communication with lctl
    """

    def __init__(self, cmd):
        """
        Initialize class by finding the lctl binary.
        """
        self.lctl = find_prog(cmd)
        self.save_file = ''
        self.record_device = ''
        if not self.lctl:
            if config.noexec:
                debug('! lctl not found')
                self.lctl = 'lctl'
            else:
                raise CommandError('lctl', "unable to find lctl binary.")
    def use_save_file(self, file):
        self.save_file = file

    def record(self, dev_name, logname):
        log("Recording log", logname, "on", dev_name)
        self.record_device = dev_name
        self.record_log = logname

    def end_record(self):
        log("End recording log", self.record_log, "on", self.record_device)
        self.record_device = None
        self.record_log = None
    def set_nonblock(self, fd):
        fl = fcntl.fcntl(fd, F_GETFL)
        fcntl.fcntl(fd, F_SETFL, fl | os.O_NDELAY)
    def run(self, cmds):
        """
        run lctl
        the cmds are written to stdin of lctl
        lctl doesn't return errors when run in script mode, so
        stderr is checked instead
        should modify command line to accept multiple commands, or
        create complex command line options
        """
        cmd_line = self.lctl
        if self.save_file:
            cmds = '\n  dump ' + self.save_file + '\n' + cmds
        elif self.record_device:
            cmds = """
    device $%s
    record %s
    %s""" % (self.record_device, self.record_log, cmds)

        debug("+", cmd_line, cmds)
        if config.noexec: return (0, [])

        child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
        child.tochild.write(cmds + "\n")
        child.tochild.close()
        # print "LCTL:", cmds

        # From "Python Cookbook" from O'Reilly
        outfile = child.fromchild
        outfd = outfile.fileno()
        self.set_nonblock(outfd)
        errfile = child.childerr
        errfd = errfile.fileno()
        self.set_nonblock(errfd)

        outdata = errdata = ''
        outeof = erreof = 0
        while 1:
            ready = select.select([outfd,errfd],[],[]) # Wait for input
            if outfd in ready[0]:
                outchunk = outfile.read()
                if outchunk == '': outeof = 1
                outdata = outdata + outchunk
            if errfd in ready[0]:
                errchunk = errfile.read()
                if errchunk == '': erreof = 1
                errdata = errdata + errchunk
            if outeof and erreof: break
        # end of "borrowed" code

        ret = child.wait()
        if os.WIFEXITED(ret):
            rc = os.WEXITSTATUS(ret)
        else:
            rc = 0
        if rc or len(errdata):
            raise CommandError(self.lctl, errdata, rc)
        return rc, outdata
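    # The select() loop above drains stdout and stderr concurrently, so a
    # full pipe buffer on either stream cannot deadlock lctl. On newer
    # interpreters that ship the subprocess module, the same deadlock-free
    # capture could be done with communicate(); a rough sketch, not used
    # by this tool:
    #
    #   import subprocess
    #   child = subprocess.Popen([self.lctl], stdin=subprocess.PIPE,
    #                            stdout=subprocess.PIPE,
    #                            stderr=subprocess.PIPE)
    #   outdata, errdata = child.communicate(cmds + "\n")
    #   rc = child.returncode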
    def runcmd(self, *args):
        """
        run lctl using the command line
        """
        cmd = string.join(map(str,args))
        debug("+", self.lctl, cmd)
        rc, out = run(self.lctl, cmd)
        if rc:
            raise CommandError(self.lctl, out, rc)
        return rc, out
    def clear_log(self, dev, log):
        """ clear an existing log """
        cmds = """
  device $%s
  probe
  clear_log %s
  quit """ % (dev, log)
        self.run(cmds)

    def network(self, net, nid):
        """ set mynid """
        cmds = """
  network %s
  mynid %s
  quit """ % (net, nid)
        self.run(cmds)

    def root_squash(self, name, uid, nid):
        cmds = """
  device $%s
  root_squash %s %s
  quit""" % (name, uid, nid)
        self.run(cmds)
    # create a new connection
    def add_uuid(self, net_type, uuid, nid):
        cmds = "\n  add_uuid %s %s %s" %(uuid, nid, net_type)
        self.run(cmds)

    def add_autoconn(self, net_type, send_mem, recv_mem, nid, hostaddr,
                     port, flags):
        if net_type in ('tcp',) and not config.lctl_dump:
            cmds = """
  network %s
  send_mem %d
  recv_mem %d
  add_autoconn %s %s %d %s
  quit""" % (net_type,
             send_mem,
             recv_mem,
             nid, hostaddr, port, flags )
            self.run(cmds)

    def connect(self, srv):
        self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
        if srv.net_type in ('tcp',) and not config.lctl_dump:
            flags = 's'
            if srv.irq_affinity:
                flags = flags + 'i'
            self.add_autoconn(srv.net_type, srv.send_mem, srv.recv_mem,
                              srv.nid, srv.hostaddr, srv.port, flags)
    def recover(self, dev_name, new_conn):
        cmds = """
    device $%s
    recover %s""" %(dev_name, new_conn)
        self.run(cmds)
    # add a route to a range
    def add_route(self, net, gw, lo, hi):
        cmds = """
  network %s
  add_route %s %s %s
  quit """ % (net, gw, lo, hi)
        try:
            self.run(cmds)
        except CommandError, e:
            log("ignore: ")
            e.dump()

    def del_route(self, net, gw, lo, hi):
        cmds = """
  ignore_errors
  network %s
  del_route %s %s %s
  quit """ % (net, gw, lo, hi)
        self.run(cmds)

    # add a route to a host
    def add_route_host(self, net, uuid, gw, tgt):
        self.add_uuid(net, uuid, tgt)
        cmds = """
  network %s
  add_route %s %s
  quit """ % (net, gw, tgt)
        try:
            self.run(cmds)
        except CommandError, e:
            log("ignore: ")
            e.dump()

    # delete a route to a host
    def del_route_host(self, net, uuid, gw, tgt):
        self.del_uuid(uuid)
        cmds = """
  ignore_errors
  network %s
  del_route %s %s
  quit """ % (net, gw, tgt)
        self.run(cmds)
    def del_autoconn(self, net_type, nid, hostaddr):
        if net_type in ('tcp',) and not config.lctl_dump:
            cmds = """
  ignore_errors
  network %s
  del_autoconn %s %s s
  quit""" % (net_type, nid, hostaddr)
            self.run(cmds)

    # disconnect one connection
    def disconnect(self, srv):
        self.del_uuid(srv.nid_uuid)
        if srv.net_type in ('tcp',) and not config.lctl_dump:
            self.del_autoconn(srv.net_type, srv.nid, srv.hostaddr)
    def del_uuid(self, uuid):
        cmds = """
  ignore_errors
  del_uuid %s
  quit""" % (uuid,)
        self.run(cmds)

    # disconnect all connections on a network
    def disconnectAll(self, net):
        cmds = """
  ignore_errors
  network %s
  disconnect
  quit""" % (net)
        self.run(cmds)

    def attach(self, type, name, uuid):
        cmds = """
  attach %s %s %s
  quit""" % (type, name, uuid)
        self.run(cmds)

    def setup(self, name, setup = ""):
        cmds = """
  cfg_device %s
  setup %s
  quit""" % (name, setup)
        self.run(cmds)
    # create a new device with lctl
    def newdev(self, type, name, uuid, setup = ""):
        self.attach(type, name, uuid)
        try:
            self.setup(name, setup)
        except CommandError, e:
            self.cleanup(name, uuid, 0)
            raise e

    # cleanup a device
    def cleanup(self, name, uuid, force, failover = 0):
        if failover: force = 1
        cmds = """
  ignore_errors
  cfg_device $%s
  cleanup %s %s
  detach
  quit""" % (name, ('', 'force')[force],
             ('', 'failover')[failover])
        self.run(cmds)
    # create an lov
    def lov_setup(self, name, uuid, desc_uuid, stripe_cnt,
                  stripe_sz, stripe_off, pattern):
        cmds = """
  attach lov %s %s
  lov_setup %s %d %d %d %s
  quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off, pattern)
        self.run(cmds)

    # add an OBD to a LOV
    def lov_add_obd(self, name, uuid, obd_uuid, index, gen):
        cmds = """
  lov_modify_tgts add %s %s %s %s
  quit""" % (name, obd_uuid, index, gen)
        self.run(cmds)

    # create an lmv
    def lmv_setup(self, name, uuid, desc_uuid, devlist):
        cmds = """
  attach lmv %s %s
  lmv_setup %s %s
  quit""" % (name, uuid, desc_uuid, devlist)
        self.run(cmds)

    # delete an OBD from a LOV
    def lov_del_obd(self, name, uuid, obd_uuid, index, gen):
        cmds = """
  lov_modify_tgts del %s %s %s %s
  quit""" % (name, obd_uuid, index, gen)
        self.run(cmds)
    # deactivate an OBD
    def deactivate(self, name):
        cmds = """
  device $%s
  deactivate
  quit""" % (name)
        self.run(cmds)

    # dump the kernel debug log
    def dump(self, dump_file):
        cmds = """
  debug_kernel %s 1
  quit""" % (dump_file)
        self.run(cmds)
    # get list of devices
    def device_list(self):
        devices = '/proc/fs/lustre/devices'
        ret = []
        if os.access(devices, os.R_OK):
            try:
                fp = open(devices, 'r')
                ret = fp.readlines()
                fp.close()
            except IOError, e:
                log(e)
        return ret

    # get lustre version
    def lustre_version(self):
        rc, out = self.runcmd('version')
        return out
    # set mount options for a profile
    def mount_option(self, profile, osc, mdc):
        cmds = """
  mount_option %s %s %s
  quit""" % (profile, osc, mdc)
        self.run(cmds)

    # delete mount options
    def del_mount_option(self, profile):
        cmds = """
  del_mount_option %s
  quit""" % (profile,)
        self.run(cmds)

    def set_timeout(self, timeout):
        cmds = """
  set_timeout %s
  quit""" % (timeout,)
        self.run(cmds)

    def set_lustre_upcall(self, upcall):
        cmds = """
  set_lustre_upcall %s
  quit""" % (upcall,)
        self.run(cmds)
# ============================================================
# Various system-level functions
# (ideally moved to their own module)

# Run a command and return the output and status.
# stderr is merged into the output via '2>&1'; popen3 could be
# used to capture it separately if necessary.
def runcmd(cmd):
    debug("+", cmd)
    if config.noexec: return (0, [])
    f = os.popen(cmd + ' 2>&1')
    out = f.readlines()
    ret = f.close()
    if ret:
        ret = ret >> 8
    else:
        ret = 0
    return (ret, out)

def run(*args):
    cmd = string.join(map(str,args))
    return runcmd(cmd)

# Run a command in the background.
def run_daemon(*args):
    cmd = string.join(map(str,args))
    debug("+", cmd)
    if config.noexec: return 0
    f = os.popen(cmd + ' 2>&1')
    ret = f.close()
    if ret:
        ret = ret >> 8
    else:
        ret = 0
    return ret
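# Note: f.close() on an os.popen() pipe returns the raw wait() status
# (None on success), so the `ret >> 8` above extracts the child's exit
# code -- the pre-subprocess equivalent of os.WEXITSTATUS(ret).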
# Determine full path to use for an external command
# searches dirname(argv[0]) first, then PATH
def find_prog(cmd):
    syspath = string.split(os.environ['PATH'], ':')
    cmdpath = os.path.dirname(sys.argv[0])
    syspath.insert(0, cmdpath)
    if config.portals:
        syspath.insert(0, os.path.join(config.portals, 'utils/'))
    for d in syspath:
        prog = os.path.join(d,cmd)
        if os.access(prog, os.X_OK):
            return prog
    return ''
# Recursively look for file starting at base dir
def do_find_file(base, mod):
    fullname = os.path.join(base, mod)
    if os.access(fullname, os.R_OK):
        return fullname
    for d in os.listdir(base):
        dir = os.path.join(base,d)
        if os.path.isdir(dir):
            module = do_find_file(dir, mod)
            if module:
                return module
def find_module(src_dir, dev_dir, modname):
    modbase = src_dir +'/'+ dev_dir +'/'+ modname
    for modext in '.ko', '.o':
        module = modbase + modext
        try:
            if os.access(module, os.R_OK):
                return module
        except OSError:
            pass
    return None
# is the path a block device?
def is_block(path):
    s = ()
    try:
        s = os.stat(path)
    except OSError:
        return 0
    return stat.S_ISBLK(s[stat.ST_MODE])
# build fs according to type
def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
    block_cnt = ''
    jopt = ''
    iopt = ''
    if devsize:
        if devsize < 8000:
            panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
                  (dev, devsize))
        # devsize is in 1k, and fs block count is in 4k
        block_cnt = devsize/4

    if fstype in ('ext3', 'extN', 'ldiskfs'):
        # ext3 journal size is in megabytes
        if jsize == 0:
            if devsize == 0:
                if not is_block(dev):
                    ret, out = runcmd("ls -l %s" %dev)
                    devsize = int(string.split(out[0])[4]) / 1024
                else:
                    ret, out = runcmd("sfdisk -s %s" %dev)
                    devsize = int(out[0])
            if devsize > 1024 * 1024:
                jsize = ((devsize / 102400) * 4)
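            # Worked example: a 2GB device arrives here as devsize = 2097152
            # (1k blocks), so jsize = (2097152 / 102400) * 4 = 80 (MB); the
            # cap just below keeps the journal at or under 400MB.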
            if jsize > 400:
                jsize = 400
        if jsize:  jopt = "-J size=%d" %(jsize,)
        if isize:  iopt = "-I %d" %(isize,)
        mkfs = 'mkfs.ext2 -j -b 4096 '
        if not isblock or config.force:
            mkfs = mkfs + ' -F '
    elif fstype == 'reiserfs':
        # reiserfs journal size is in blocks
        if jsize:  jopt = "--journal_size %d" %(jsize,)
        mkfs = 'mkreiserfs -ff'
    else:
        panic('unsupported fs type: ', fstype)

    if config.mkfsoptions != None:
        mkfs = mkfs + ' ' + config.mkfsoptions
    if mkfsoptions != None:
        mkfs = mkfs + ' ' + mkfsoptions
    (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt)
    if ret:
        panic("Unable to build fs:", dev, string.join(out))
    # enable hash tree indexing on the fs
    if fstype in ('ext3', 'extN', 'ldiskfs'):
        htree = 'echo "feature FEATURE_C5" | debugfs -w'
        (ret, out) = run (htree, dev)
        if ret:
            panic("Unable to enable htree:", dev)
# some systems use /dev/loopN, some /dev/loop/N
def loop_base():
    loop = '/dev/loop'
    if not os.access(loop + str(0), os.R_OK):
        loop = loop + '/'
        if not os.access(loop + str(0), os.R_OK):
            panic ("can't access loop devices")
    return loop
# find loop device assigned to the file
def find_assigned_loop(file):
    loop = loop_base()
    for n in xrange(0, MAX_LOOP_DEVICES):
        dev = loop + str(n)
        if os.access(dev, os.R_OK):
            (stat, out) = run('losetup', dev)
            if out and stat == 0:
                m = re.search(r'\((.*)\)', out[0])
                if m and file == m.group(1):
                    return dev
    return ''
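# For reference, `losetup /dev/loopN` on a bound device prints a line of
# roughly this shape (details vary across util-linux versions):
#   /dev/loop0: [0305]:12 (/tmp/ost1-data)
# so the re.search(r'\((.*)\)') above pulls out the backing file path.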
# create file if necessary and assign the first free loop device
def init_loop(file, size, fstype, journal_size, inode_size,
              mkfsoptions, reformat, autoformat, backfstype, backfile):
    if fstype == 'smfs':
        realfile = backfile
        realfstype = backfstype
        if is_block(backfile):
            if reformat or (need_format(realfstype, backfile) and autoformat == 'yes'):
                mkfs(realfile, size, realfstype, journal_size, inode_size, mkfsoptions, isblock=0)
            return realfile
    else:
        realfile = file
        realfstype = fstype

    dev = find_assigned_loop(realfile)
    if dev:
        print 'WARNING file:', realfile, 'already mapped to', dev
        return dev

    if reformat or not os.access(realfile, os.R_OK | os.W_OK):
        if size < 8000:
            panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (realfile, size))
        (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, realfile))
        if ret:
            panic("Unable to create backing store:", realfile)
        mkfs(realfile, size, realfstype, journal_size, inode_size,
             mkfsoptions, isblock=0)

    loop = loop_base()
    # find next free loop
    for n in xrange(0, MAX_LOOP_DEVICES):
        dev = loop + str(n)
        if os.access(dev, os.R_OK):
            (stat, out) = run('losetup', dev)
            if stat:
                run('losetup', dev, realfile)
                return dev
        else:
            print "out of loop devices"
            return ''
    print "out of loop devices"
    return ''
# undo loop assignment
def clean_loop(file):
    dev = find_assigned_loop(file)
    if dev:
        ret, out = run('losetup -d', dev)
        if ret:
            log('unable to clean loop device:', dev, 'for file:', file)
            logall(out)
# determine if dev is formatted as a <fstype> filesystem
def need_format(fstype, dev):
    # FIXME don't know how to implement this
    return 0
# initialize a block device if needed
def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
              inode_size, mkfsoptions, backfstype, backdev):
    if config.noexec:
        return dev
    if fstype == 'smfs' or not is_block(dev):
        dev = init_loop(dev, size, fstype, journal_size, inode_size,
                        mkfsoptions, reformat, autoformat, backfstype, backdev)
    elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
        mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
             isblock=is_block(dev))
#    else:
#        panic("device:", dev,
#              "not prepared, and autoformat is not set.\n",
#              "Rerun with --reformat option to format ALL filesystems")
    return dev
907 """lookup IP address for an interface"""
908 rc, out = run("/sbin/ifconfig", iface)
911 addr = string.split(out[1])[1]
912 ip = string.split(addr, ':')[1]
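# With classic net-tools ifconfig, out[1] is the line
#   "inet addr:192.168.0.10  Bcast:192.168.0.255  Mask:255.255.255.0"
# so split()[1] is "addr:192.168.0.10" and the second split on ':'
# yields the bare address.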
def def_mount_options(fstype, target):
    """returns default mount options for passed fstype and target (mds, ost)"""
    if fstype == 'ext3' or fstype == 'ldiskfs':
        mountfsoptions = "errors=remount-ro"
        if target == 'ost' and sys_get_branch() == '2.4':
            mountfsoptions = "%s,asyncdel" % (mountfsoptions)
        return mountfsoptions
    return ""
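# e.g. def_mount_options('ext3', 'ost') yields "errors=remount-ro" on a
# 2.6 kernel and "errors=remount-ro,asyncdel" on a 2.4 kernel.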
def sys_get_elan_position_file():
    procfiles = ["/proc/elan/device0/position",
                 "/proc/qsnet/elan4/device0/position",
                 "/proc/qsnet/elan3/device0/position"]
    for p in procfiles:
        if os.access(p, os.R_OK):
            return p
    return ""
def sys_get_local_nid(net_type, wildcard, cluster_id):
    """Return the local nid."""
    local = ""
    if sys_get_elan_position_file():
        local = sys_get_local_address('elan', '*', cluster_id)
    else:
        local = sys_get_local_address(net_type, wildcard, cluster_id)
    return local
def sys_get_local_address(net_type, wildcard, cluster_id):
    """Return the local address for the network type."""
    local = ""
    if net_type in ('tcp',):
        if ':' in wildcard:
            iface, star = string.split(wildcard, ':')
            local = if2addr(iface)
            if not local:
                panic ("unable to determine ip for:", wildcard)
        else:
            host = socket.gethostname()
            local = socket.gethostbyname(host)
    elif net_type == 'elan':
        # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()'
        f = sys_get_elan_position_file()
        if not f:
            panic ("unable to determine local Elan ID")
        try:
            fp = open(f, 'r')
            lines = fp.readlines()
            fp.close()
            for l in lines:
                a = string.split(l)
                if a[0] == 'NodeId':
                    elan_id = a[1]
                    break
            try:
                nid = my_int(cluster_id) + my_int(elan_id)
                local = "%d" % (nid)
            except ValueError, e:
                local = elan_id
        except IOError, e:
            log(e)
    elif net_type == 'gm':
        fixme("automatic local address for GM")
    return local
def sys_get_branch():
    """Returns kernel release"""
    try:
        fp = open('/proc/sys/kernel/osrelease')
        lines = fp.readlines()
        fp.close()
        for l in lines:
            version = string.split(l)
            a = string.split(version[0], '.')
            return a[0] + '.' + a[1]
    except IOError, e:
        log(e)
    return ""
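# e.g. an osrelease of "2.4.21-40.EL" splits into ('2', '4', ...) and
# returns "2.4".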
def mod_loaded(modname):
    """Check if a module is already loaded. Look in /proc/modules for it."""
    try:
        fp = open('/proc/modules')
        lines = fp.readlines()
        fp.close()
        # please forgive my tired fingers for this one
        ret = filter(lambda word, mod=modname: word == mod,
                     map(lambda line: string.split(line)[0], lines))
        return ret
    except Exception, e:
        return 0
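# A plainer sketch of the same /proc/modules scan, for readability; it
# returns 1/0 where mod_loaded() returns a (possibly empty) list, but both
# are used only for their truth value. mod_loaded_plain is a hypothetical
# helper, not part of the original tool.
def mod_loaded_plain(modname):
    try:
        fp = open('/proc/modules')
    except IOError:
        return 0
    for line in fp.readlines():
        # the module name is the first whitespace-separated field
        if string.split(line)[0] == modname:
            fp.close()
            return 1
    fp.close()
    return 0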
# XXX: instead of device_list, ask for $name and see what we get
def is_prepared(name):
    """Return true if a device exists for the name"""
    if config.lctl_dump:
        return 0
    if (config.noexec or config.record) and config.cleanup:
        return 1
    try:
        # expect this format:
        # 1 UP ldlm ldlm ldlm_UUID 2
        out = lctl.device_list()
        for s in out:
            if name == string.split(s)[3]:
                return 1
    except CommandError, e:
        e.dump()
    return 0
def is_network_prepared():
    """If any device exists, then assume that all networking
       has been configured"""
    out = lctl.device_list()
    return len(out) > 0
def fs_is_mounted(path):
    """Return true if path is a mounted lustre filesystem"""
    try:
        fp = open('/proc/mounts')
        lines = fp.readlines()
        fp.close()
        for l in lines:
            a = string.split(l)
            if a[1] == path and a[2] == 'lustre_lite':
                return 1
    except IOError, e:
        log(e)
    return 0
1049 """Manage kernel modules"""
1050 def __init__(self, lustre_dir, portals_dir):
1051 self.lustre_dir = lustre_dir
1052 self.portals_dir = portals_dir
1053 self.kmodule_list = []
1055 def add_portals_module(self, dev_dir, modname):
1056 """Append a module to list of modules to load."""
1057 self.kmodule_list.append((self.portals_dir, dev_dir, modname))
1059 def add_lustre_module(self, dev_dir, modname):
1060 """Append a module to list of modules to load."""
1061 self.kmodule_list.append((self.lustre_dir, dev_dir, modname))
1063 def load_module(self):
1064 """Load all the modules in the list in the order they appear."""
1065 for src_dir, dev_dir, mod in self.kmodule_list:
1066 if mod_loaded(mod) and not config.noexec:
1068 log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
1070 module = find_module(src_dir, dev_dir, mod)
1072 panic('module not found:', mod)
1073 (rc, out) = run('/sbin/insmod', module)
1075 raise CommandError('insmod', out, rc)
1077 (rc, out) = run('/sbin/modprobe', mod)
1079 raise CommandError('modprobe', out, rc)
1081 def cleanup_module(self):
1082 """Unload the modules in the list in reverse order."""
1083 rev = self.kmodule_list
1085 for src_dir, dev_dir, mod in rev:
1086 if not mod_loaded(mod) and not config.noexec:
1089 if mod == 'portals' and config.dump:
1090 lctl.dump(config.dump)
1091 log('unloading module:', mod)
1092 (rc, out) = run('/sbin/rmmod', mod)
1094 log('! unable to unload module:', mod)
# ============================================================
# Classes to prepare and cleanup the various objects

class Module:
    """ Base class for the rest of the modules. The default cleanup method is
    defined here, as well as some utility funcs.
    """
    def __init__(self, module_name, db):
        self.db = db
        self.module_name = module_name
        self.name = self.db.getName()
        self.uuid = self.db.getUUID()
        self.kmod = kmod(config.lustre, config.portals)

    def info(self, *args):
        msg = string.join(map(str,args))
        print self.module_name + ":", self.name, self.uuid, msg

    def cleanup(self):
        """ default cleanup, used for most modules """
        self.info()
        try:
            lctl.cleanup(self.name, self.uuid, config.force)
        except CommandError, e:
            log(self.module_name, "cleanup failed: ", self.name)
            e.dump()
            cleanup_error(e.rc)

    def add_portals_module(self, dev_dir, modname):
        """Append a module to list of modules to load."""
        self.kmod.add_portals_module(dev_dir, modname)

    def add_lustre_module(self, dev_dir, modname):
        """Append a module to list of modules to load."""
        self.kmod.add_lustre_module(dev_dir, modname)

    def load_module(self):
        """Load all the modules in the list in the order they appear."""
        self.kmod.load_module()

    def cleanup_module(self):
        """Unload the modules in the list in reverse order."""
        if self.safe_to_clean():
            self.kmod.cleanup_module()

    def safe_to_clean(self):
        return 1

    def safe_to_clean_modules(self):
        return self.safe_to_clean()
class Network(Module):
    def __init__(self,db):
        Module.__init__(self, 'NETWORK', db)
        self.net_type = self.db.get_val('nettype')
        self.nid = self.db.get_val('nid', '*')
        self.cluster_id = self.db.get_val('clusterid', "0")
        self.port = self.db.get_val_int('port', 0)
        self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
        self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
        self.irq_affinity = self.db.get_val_int('irqaffinity', 0)

        if '*' in self.nid:
            self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
            if not self.nid:
                panic("unable to set nid for", self.net_type, self.nid, self.cluster_id)
            self.generic_nid = 1
            debug("nid:", self.nid)
        else:
            self.generic_nid = 0

        self.nid_uuid = self.nid_to_uuid(self.nid)

        self.hostaddr = self.db.get_val('hostaddr', self.nid)
        if '*' in self.hostaddr:
            self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id)
            if not self.hostaddr:
                panic("unable to set hostaddr for", self.net_type, self.hostaddr, self.cluster_id)
            debug("hostaddr:", self.hostaddr)

        self.add_portals_module("libcfs", 'libcfs')
        self.add_portals_module("portals", 'portals')
        if node_needs_router():
            self.add_portals_module("router", 'kptlrouter')
        if self.net_type == 'tcp':
            self.add_portals_module("knals/socknal", 'ksocknal')
        if self.net_type == 'elan':
            self.add_portals_module("knals/qswnal", 'kqswnal')
        if self.net_type == 'gm':
            self.add_portals_module("knals/gmnal", 'kgmnal')

    def nid_to_uuid(self, nid):
        return "NID_%s_UUID" %(nid,)
    def prepare(self):
        if not config.record and is_network_prepared():
            return
        self.info(self.net_type, self.nid, self.port)
        if not (config.record and self.generic_nid):
            lctl.network(self.net_type, self.nid)
        if self.net_type == 'tcp':
            sys_tweak_socknal()
        if self.net_type == 'elan':
            sys_optimize_elan()
        if self.port and node_is_router():
            run_one_acceptor(self.port)
            self.connect_peer_gateways()
    def connect_peer_gateways(self):
        for router in self.db.lookup_class('node'):
            if router.get_val_int('router', 0):
                for netuuid in router.get_networks():
                    net = self.db.lookup(netuuid)
                    gw = Network(net)
                    if (gw.cluster_id == self.cluster_id and
                        gw.net_type == self.net_type):
                        if gw.nid != self.nid:
                            lctl.connect(gw)

    def disconnect_peer_gateways(self):
        for router in self.db.lookup_class('node'):
            if router.get_val_int('router', 0):
                for netuuid in router.get_networks():
                    net = self.db.lookup(netuuid)
                    gw = Network(net)
                    if (gw.cluster_id == self.cluster_id and
                        gw.net_type == self.net_type):
                        if gw.nid != self.nid:
                            try:
                                lctl.disconnect(gw)
                            except CommandError, e:
                                print "disconnect failed: ", self.name
                                e.dump()
                                cleanup_error(e.rc)
    def safe_to_clean(self):
        return not is_network_prepared()

    def cleanup(self):
        self.info(self.net_type, self.nid, self.port)
        if self.port:
            stop_acceptor(self.port)
        if node_is_router():
            self.disconnect_peer_gateways()

    def correct_level(self, level, op=None):
        return level
class RouteTable(Module):
    def __init__(self,db):
        Module.__init__(self, 'ROUTES', db)

    def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
                         lo, hi):
        # only setup connections for tcp NALs
        srvdb = None
        if not net_type in ('tcp',):
            return None

        # connect to target if route is to single node and this node is the gw
        if lo == hi and local_interface(net_type, gw_cluster_id, gw):
            if not local_cluster(net_type, tgt_cluster_id):
                panic("target", lo, "not on the local cluster")
            srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
        # connect to gateway if this node is not the gw
        elif (local_cluster(net_type, gw_cluster_id)
              and not local_interface(net_type, gw_cluster_id, gw)):
            srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
        else:
            return None

        if not srvdb:
            panic("no server for nid", lo)
            return None

        return Network(srvdb)
    def prepare(self):
        if not config.record and is_network_prepared():
            return
        self.info()
        for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
            lctl.add_route(net_type, gw, lo, hi)
            srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
            if srv:
                lctl.connect(srv)

    def safe_to_clean(self):
        return not is_network_prepared()

    def cleanup(self):
        if is_network_prepared():
            # the network is still being used, don't clean it up
            return
        for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
            srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
            if srv:
                try:
                    lctl.disconnect(srv)
                except CommandError, e:
                    print "disconnect failed: ", self.name
                    e.dump()
                    cleanup_error(e.rc)
            try:
                lctl.del_route(net_type, gw, lo, hi)
            except CommandError, e:
                print "del_route failed: ", self.name
                e.dump()
                cleanup_error(e.rc)
class Management(Module):
    def __init__(self, db):
        Module.__init__(self, 'MGMT', db)
        self.add_lustre_module('lvfs', 'lvfs')
        self.add_lustre_module('obdclass', 'obdclass')
        self.add_lustre_module('ptlrpc', 'ptlrpc')
        self.add_lustre_module('mgmt', 'mgmt_svc')

    def prepare(self):
        if not config.record and is_prepared(self.name):
            return
        self.info()
        lctl.newdev("mgmt", self.name, self.uuid)

    def safe_to_clean(self):
        return 1

    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)

    def correct_level(self, level, op=None):
        return level
# This is only needed to load the modules; the LDLM device
# is now created automatically.
class LDLM(Module):
    def __init__(self,db):
        Module.__init__(self, 'LDLM', db)
        self.add_lustre_module('lvfs', 'lvfs')
        self.add_lustre_module('obdclass', 'obdclass')
        self.add_lustre_module('ptlrpc', 'ptlrpc')

    def prepare(self):
        return

    def cleanup(self):
        return

    def correct_level(self, level, op=None):
        return level
class LOV(Module):
    def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
        Module.__init__(self, 'LOV', db)
        if name_override != None:
            self.name = "lov_%s" % name_override
        self.add_lustre_module('lov', 'lov')
        self.mds_uuid = self.db.get_first_ref('mds')
        self.stripe_sz = self.db.get_val_int('stripesize', 1048576)
        self.stripe_off = self.db.get_val_int('stripeoffset', 0)
        self.pattern = self.db.get_val_int('stripepattern', 0)
        self.devlist = self.db.get_lov_tgts('lov_tgt')
        self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
        self.osclist = []
        self.desc_uuid = self.uuid
        self.uuid = generate_client_uuid(self.name)
        self.fs_name = fs_name
        if config_only:
            self.config_only = 1
            return
        self.config_only = None
        mds = self.db.lookup(self.mds_uuid)
        self.mds_name = mds.getName()
        for (obd_uuid, index, gen, active) in self.devlist:
            if obd_uuid == '':
                continue
            obd = self.db.lookup(obd_uuid)
            osc = get_osc(obd, self.uuid, fs_name)
            if osc:
                self.osclist.append((osc, index, gen, active))
            else:
                panic('osc not found:', obd_uuid)
    def prepare(self):
        if not config.record and is_prepared(self.name):
            return
        self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
                  self.stripe_off, self.pattern, self.devlist,
                  self.mds_name)
        lctl.lov_setup(self.name, self.uuid, self.desc_uuid, self.stripe_cnt,
                       self.stripe_sz, self.stripe_off, self.pattern)
        for (osc, index, gen, active) in self.osclist:
            target_uuid = osc.target_uuid
            try:
                # Only ignore connect failures with --force, which
                # isn't implemented here yet.
                osc.prepare(ignore_connect_failure=0)
            except CommandError, e:
                print "Error preparing OSC %s\n" % osc.uuid
                raise e
            lctl.lov_add_obd(self.name, self.uuid, target_uuid, index, gen)
    def cleanup(self):
        for (osc, index, gen, active) in self.osclist:
            target_uuid = osc.target_uuid
            osc.cleanup()
        if is_prepared(self.name):
            Module.cleanup(self)
        if self.config_only:
            panic("Can't clean up config_only LOV ", self.name)

    def load_module(self):
        if self.config_only:
            panic("Can't load modules for config_only LOV ", self.name)
        for (osc, index, gen, active) in self.osclist:
            osc.load_module()
            break
        Module.load_module(self)

    def cleanup_module(self):
        if self.config_only:
            panic("Can't cleanup modules for config_only LOV ", self.name)
        Module.cleanup_module(self)
        for (osc, index, gen, active) in self.osclist:
            osc.cleanup_module()
            break

    def correct_level(self, level, op=None):
        return level
class LMV(Module):
    def __init__(self, db, uuid, fs_name, name_override = None):
        Module.__init__(self, 'LMV', db)
        if name_override != None:
            self.name = "lmv_%s" % name_override
        self.add_lustre_module('lmv', 'lmv')
        self.devlist = self.db.get_refs('mds')
        self.mdclist = []
        self.desc_uuid = self.uuid
        self.uuid = uuid
        self.fs_name = fs_name
        for mds_uuid in self.devlist:
            mds = self.db.lookup(mds_uuid)
            if not mds:
                panic("MDS not found!")
            mdc = MDC(mds, self.uuid, fs_name)
            if mdc:
                self.mdclist.append(mdc)
            else:
                panic('mdc not found:', mds_uuid)
    def prepare(self):
        if is_prepared(self.name):
            return
        for mdc in self.mdclist:
            try:
                # Only ignore connect failures with --force, which
                # isn't implemented here yet.
                mdc.prepare(ignore_connect_failure=0)
            except CommandError, e:
                print "Error preparing LMV %s\n" % mdc.uuid
                raise e
        lctl.lmv_setup(self.name, self.uuid, self.desc_uuid,
                       string.join(self.devlist))

    def cleanup(self):
        for mdc in self.mdclist:
            mdc.cleanup()
        if is_prepared(self.name):
            Module.cleanup(self)

    def load_module(self):
        for mdc in self.mdclist:
            mdc.load_module()
            break
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        for mdc in self.mdclist:
            mdc.cleanup_module()
            break

    def correct_level(self, level, op=None):
        return level
class MDSDEV(Module):
    def __init__(self,db):
        Module.__init__(self, 'MDSDEV', db)
        self.devpath = self.db.get_val('devpath','')
        self.backdevpath = self.db.get_val('backdevpath','')
        self.size = self.db.get_val_int('devsize', 0)
        self.journal_size = self.db.get_val_int('journalsize', 0)
        self.fstype = self.db.get_val('fstype', '')
        self.backfstype = self.db.get_val('backfstype', '')
        self.nspath = self.db.get_val('nspath', '')
        self.mkfsoptions = self.db.get_val('mkfsoptions', '')
        self.mountfsoptions = self.db.get_val('mountfsoptions', '')
        self.root_squash = self.db.get_val('root_squash', '')
        self.no_root_squash = self.db.get_val('no_root_squash', '')
        self.cachetype = self.db.get_val('cachetype', '')
        # overwrite the original MDSDEV name and uuid with the MDS name and uuid
        target_uuid = self.db.get_first_ref('target')
        mds = self.db.lookup(target_uuid)
        self.name = mds.getName()
        self.filesystem_uuids = mds.get_refs('filesystem')
        self.lmv_uuid = ''
        self.lmv = None
        self.master_mds = ""
        if not self.filesystem_uuids:
            self.lmv_uuid = self.db.get_first_ref('lmv')
            if not self.lmv_uuid:
                panic("ALERT: can't find lmv uuid")
            else:
                self.lmv = self.db.lookup(self.lmv_uuid)
                self.filesystem_uuids = self.lmv.get_refs('filesystem')
                self.master_mds = self.lmv_uuid
        # FIXME: if fstype not set, then determine based on kernel version
        self.format = self.db.get_val('autoformat', "no")
        if mds.get_val('failover', 0):
            self.failover_mds = 'f'
        else:
            self.failover_mds = 'n'
        active_uuid = get_active_target(mds)
        if not active_uuid:
            panic("No target device found:", target_uuid)
        if active_uuid == self.uuid:
            self.active = 1
        else:
            self.active = 0
        if self.active and config.group and config.group != mds.get_val('group'):
            self.active = 0

        self.inode_size = self.db.get_val_int('inodesize', 0)
        if self.inode_size == 0:
            # find the LOV for this MDS
            lovconfig_uuid = mds.get_first_ref('lovconfig')
            if not lovconfig_uuid:
                if not self.lmv_uuid:
                    panic("No lovconfig found for mds ", self.name)

                if not self.lmv:
                    panic("No LMV initialized and no lovconfig_uuid found")

                lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
                lovconfig = self.lmv.lookup(lovconfig_uuid)
                lov_uuid = lovconfig.get_first_ref('lov')
                if not lov_uuid:
                    panic("No LOV found for lovconfig ", lovconfig.name)
            else:
                lovconfig = mds.lookup(lovconfig_uuid)
                lov_uuid = lovconfig.get_first_ref('lov')
                if not lov_uuid:
                    panic("No LOV found for lovconfig ", lovconfig.name)

                if self.lmv:
                    lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
                    lovconfig = self.lmv.lookup(lovconfig_uuid)
                    lov_uuid = lovconfig.get_first_ref('lov')

            lov = LOV(self.db.lookup(lov_uuid), lov_uuid, 'FS_name', config_only = 1)
            # default stripe count controls default inode_size
            stripe_count = lov.stripe_cnt
            if stripe_count > 77:
                self.inode_size = 4096
            elif stripe_count > 35:
                self.inode_size = 2048
            elif stripe_count > 13:
                self.inode_size = 1024
            elif stripe_count > 3:
                self.inode_size = 512
            else:
                self.inode_size = 256
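            # The striping EA kept in each MDS inode grows with the stripe
            # count, which is why wider default striping gets a larger
            # inode size: the EA can then stay inside the inode itself.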
        self.target_dev_uuid = self.uuid
        self.uuid = target_uuid

        # setup LMV
        if self.master_mds:
            client_uuid = generate_client_uuid(self.name)
            client_uuid = self.name + "_lmv_" + "UUID"
            self.master = LMV(self.db.lookup(self.lmv_uuid), client_uuid, self.name, self.name)
            self.master_mds = self.master.name

        # modules
        self.add_lustre_module('mdc', 'mdc')
        self.add_lustre_module('osc', 'osc')
        self.add_lustre_module('lov', 'lov')
        self.add_lustre_module('lmv', 'lmv')
        self.add_lustre_module('ost', 'ost')
        self.add_lustre_module('mds', 'mds')

        if self.fstype == 'smfs':
            self.add_lustre_module('smfs', 'smfs')

        if self.fstype == 'ldiskfs':
            self.add_lustre_module('ldiskfs', 'ldiskfs')

        if self.fstype:
            self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))

        # if fstype is smfs, then we should also take care about the backing
        # store filesystem.
        if self.fstype == 'smfs':
            self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.backfstype))

        for options in string.split(self.mountfsoptions, ','):
            if options == 'snap':
                if not self.fstype == 'smfs':
                    panic("mountoptions with snap, but fstype is not smfs\n")
                self.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.fstype))
                self.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.backfstype))
    def load_module(self):
        if self.active:
            Module.load_module(self)
    def prepare(self):
        if not config.record and is_prepared(self.name):
            return
        if not self.active:
            debug(self.uuid, "not active")
            return
        if config.reformat:
            # run write_conf automatically, if --reformat used
            self.write_conf()
        self.info(self.devpath, self.fstype, self.size, self.format)
        run_acceptors()
        # prepare LMV
        if self.master_mds:
            self.master.prepare()
        # never reformat here
        blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
                           self.format, self.journal_size, self.inode_size,
                           self.mkfsoptions, self.backfstype, self.backdevpath)

        if not is_prepared('MDT'):
            lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
        try:
            mountfsoptions = def_mount_options(self.fstype, 'mds')

            if config.mountfsoptions:
                if mountfsoptions:
                    mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
                else:
                    mountfsoptions = config.mountfsoptions
                if self.mountfsoptions:
                    mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
            else:
                if self.mountfsoptions:
                    if mountfsoptions:
                        mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
                    else:
                        mountfsoptions = self.mountfsoptions

            if self.fstype == 'smfs':
                realdev = self.fstype
                if mountfsoptions:
                    mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
                                                            self.backfstype,
                                                            blkdev)
                else:
                    mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
                                                         blkdev)
            else:
                realdev = blkdev

            print 'MDS mount options: ' + mountfsoptions

            if not self.master_mds:
                self.master_mds = 'dumb'
            if not self.cachetype:
                self.cachetype = 'dumb'
            lctl.newdev("mds", self.name, self.uuid,
                        setup ="%s %s %s %s %s %s" %(realdev, self.fstype,
                                                     self.name, mountfsoptions,
                                                     self.master_mds, self.cachetype))
        except CommandError, e:
            if e.rc == 2:
                panic("MDS is missing the config log. Need to run " +
                      "lconf --write_conf.")
            else:
                raise e

        if config.root_squash == None:
            config.root_squash = self.root_squash
        if config.no_root_squash == None:
            config.no_root_squash = self.no_root_squash
        if config.root_squash:
            if config.no_root_squash:
                nsnid = config.no_root_squash
            else:
                nsnid = "0"
            lctl.root_squash(self.name, config.root_squash, nsnid)
    def write_conf(self):
        if not is_prepared(self.name):
            self.info(self.devpath, self.fstype, self.format)

            blkdev = block_dev(self.devpath, self.size, self.fstype,
                               config.reformat, self.format, self.journal_size,
                               self.inode_size, self.mkfsoptions,
                               self.backfstype, self.backdevpath)

            # Even for writing logs we mount mds with supplied mount options
            # because it will not mount smfs (if used) otherwise.
            mountfsoptions = def_mount_options(self.fstype, 'mds')

            if config.mountfsoptions:
                if mountfsoptions:
                    mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
                else:
                    mountfsoptions = config.mountfsoptions
                if self.mountfsoptions:
                    mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
            else:
                if self.mountfsoptions:
                    if mountfsoptions:
                        mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
                    else:
                        mountfsoptions = self.mountfsoptions

            if self.fstype == 'smfs':
                realdev = self.fstype
                if mountfsoptions:
                    mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
                                                            self.backfstype,
                                                            blkdev)
                else:
                    mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
                                                         blkdev)
            else:
                realdev = blkdev

            print 'MDS mount options: ' + mountfsoptions

            # As mount options are passed by 4th param to config tool, we need
            # to pass something in 3rd param. But we do not want this 3rd param
            # be counted as a profile name for reading log on MDS setup, thus,
            # we pass there some predefined sign like 'dumb', which will be
            # checked in MDS code and skipped. Probably there is a nicer way,
            # like passing an empty string and checking it in the config tool,
            # then passing null as the 4th param.
            lctl.newdev("mds", self.name, self.uuid,
                        setup ="%s %s %s %s" %(realdev, self.fstype,
                                               'dumb', mountfsoptions))

        # record logs for the MDS lov
        for uuid in self.filesystem_uuids:
            log("recording clients for filesystem:", uuid)
            fs = self.db.lookup(uuid)

            # this is ugly, should be organized nicely later.
            target_uuid = self.db.get_first_ref('target')
            mds = self.db.lookup(target_uuid)
            lovconfig_uuid = mds.get_first_ref('lovconfig')
            if lovconfig_uuid:
                lovconfig = mds.lookup(lovconfig_uuid)
                obd_uuid = lovconfig.get_first_ref('lov')
            else:
                obd_uuid = fs.get_first_ref('obd')

            client_uuid = generate_client_uuid(self.name)
            client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
                          self.name)
            lctl.clear_log(self.name, self.name)
            lctl.record(self.name, self.name)
            client.prepare()
            lctl.mount_option(self.name, client.get_name(), "")
            lctl.end_record()
            process_updates(self.db, self.name, self.name, client)

            lctl.clear_log(self.name, self.name + '-clean')
            lctl.record(self.name, self.name + '-clean')
            client.cleanup()
            lctl.del_mount_option(self.name)
            lctl.end_record()
            process_updates(self.db, self.name, self.name + '-clean', client)
        # record logs for each client
        if config.noexec:
            noexec_opt = '-n'
        else:
            noexec_opt = ''
        if config.ldapurl:
            config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
        else:
            config_options = CONFIG_FILE
        for node_db in self.db.lookup_class('node'):
            client_name = node_db.getName()
            for prof_uuid in node_db.get_refs('profile'):
                prof_db = node_db.lookup(prof_uuid)
                # refactor this into a function to test "clientness"
                # of a node.
                for ref_class, ref_uuid in prof_db.get_all_refs():
                    if ref_class in ('mountpoint','echoclient'):
                        debug("recording", client_name)
                        old_noexec = config.noexec
                        config.noexec = 0
                        ret, out = run (sys.argv[0], noexec_opt,
                                        " -v --record --nomod",
                                        "--record_log", client_name,
                                        "--record_device", self.name,
                                        "--node", client_name,
                                        config_options)
                        for s in out: log("record> ", string.strip(s))
                        ret, out = run (sys.argv[0], noexec_opt,
                                        "--cleanup -v --record --nomod",
                                        "--record_log", client_name + "-clean",
                                        "--record_device", self.name,
                                        "--node", client_name,
                                        config_options)
                        for s in out: log("record> ", string.strip(s))
                        config.noexec = old_noexec
        try:
            lctl.cleanup(self.name, self.uuid, 0, 0)
        except CommandError, e:
            log(self.module_name, "cleanup failed: ", self.name)
            e.dump()
            cleanup_error(e.rc)
        Module.cleanup(self)

        if self.fstype == 'smfs':
            clean_loop(self.backdevpath)
        else:
            clean_loop(self.devpath)
    def msd_remaining(self):
        out = lctl.device_list()
        for s in out:
            if string.split(s)[2] in ('mds',):
                return 1
        return 0

    def safe_to_clean(self):
        return self.active

    def safe_to_clean_modules(self):
        return not self.msd_remaining()
    def cleanup(self):
        if not self.active:
            debug(self.uuid, "not active")
            return
        self.info()
        if is_prepared(self.name):
            try:
                lctl.cleanup(self.name, self.uuid, config.force,
                             config.failover)
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
                e.dump()
                cleanup_error(e.rc)
            Module.cleanup(self)
        # cleanup LMV
        if self.master_mds:
            self.master.cleanup()
        if not self.msd_remaining() and is_prepared('MDT'):
            try:
                lctl.cleanup("MDT", "MDT_UUID", config.force,
                             config.failover)
            except CommandError, e:
                print "cleanup failed: ", self.name
                e.dump()
                cleanup_error(e.rc)

        if self.fstype == 'smfs':
            clean_loop(self.backdevpath)
        else:
            clean_loop(self.devpath)

    def correct_level(self, level, op=None):
        #if self.master_mds:
        #    level = level + 2
        return level
class OSD(Module):
    def __init__(self, db):
        Module.__init__(self, 'OSD', db)
        self.osdtype = self.db.get_val('osdtype')
        self.devpath = self.db.get_val('devpath', '')
        self.backdevpath = self.db.get_val('backdevpath', '')
        self.size = self.db.get_val_int('devsize', 0)
        self.journal_size = self.db.get_val_int('journalsize', 0)
        self.inode_size = self.db.get_val_int('inodesize', 0)
        self.mkfsoptions = self.db.get_val('mkfsoptions', '')
        self.mountfsoptions = self.db.get_val('mountfsoptions', '')
        self.fstype = self.db.get_val('fstype', '')
        self.backfstype = self.db.get_val('backfstype', '')
        self.nspath = self.db.get_val('nspath', '')
        target_uuid = self.db.get_first_ref('target')
        ost = self.db.lookup(target_uuid)
        self.name = ost.getName()
        self.format = self.db.get_val('autoformat', 'yes')
        if ost.get_val('failover', 0):
            self.failover_ost = 'f'
        else:
            self.failover_ost = 'n'

        active_uuid = get_active_target(ost)
        if not active_uuid:
            panic("No target device found:", target_uuid)
        if active_uuid == self.uuid:
            self.active = 1
        else:
            self.active = 0
        if self.active and config.group and config.group != ost.get_val('group'):
            self.active = 0

        self.target_dev_uuid = self.uuid
        self.uuid = target_uuid
        # modules
        self.add_lustre_module('ost', 'ost')
        if self.fstype == 'smfs':
            self.add_lustre_module('smfs', 'smfs')
        # FIXME: should we default to ext3 here?
        if self.fstype == 'ldiskfs':
            self.add_lustre_module('ldiskfs', 'ldiskfs')
        if self.fstype:
            self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
        if self.fstype == 'smfs':
            self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.backfstype))

        for options in string.split(self.mountfsoptions, ','):
            if options == 'snap':
                if not self.fstype == 'smfs':
                    panic("mountoptions with snap, but fstype is not smfs\n")
                self.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.fstype))
                self.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.backfstype))

        self.add_lustre_module(self.osdtype, self.osdtype)
    def load_module(self):
        if self.active:
            Module.load_module(self)
    # need to check /proc/mounts and /etc/mtab before
    # formatting anything.
    # FIXME: check if device is already formatted.
    def prepare(self):
        if is_prepared(self.name):
            return
        if not self.active:
            debug(self.uuid, "not active")
            return
        self.info(self.osdtype, self.devpath, self.size, self.fstype,
                  self.format, self.journal_size, self.inode_size)
        run_acceptors()
        if self.osdtype == 'obdecho':
            blkdev = ''
        else:
            blkdev = block_dev(self.devpath, self.size, self.fstype,
                               config.reformat, self.format, self.journal_size,
                               self.inode_size, self.mkfsoptions, self.backfstype,
                               self.backdevpath)

        mountfsoptions = def_mount_options(self.fstype, 'ost')

        if config.mountfsoptions:
            if mountfsoptions:
                mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
            else:
                mountfsoptions = config.mountfsoptions
            if self.mountfsoptions:
                mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
        else:
            if self.mountfsoptions:
                if mountfsoptions:
                    mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
                else:
                    mountfsoptions = self.mountfsoptions

        if self.fstype == 'smfs':
            realdev = self.fstype
            if mountfsoptions:
                mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
                                                        self.backfstype,
                                                        blkdev)
            else:
                mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
                                                     blkdev)
        else:
            realdev = blkdev

        print 'OSD mount options: ' + mountfsoptions

        lctl.newdev(self.osdtype, self.name, self.uuid,
                    setup ="%s %s %s %s" %(realdev, self.fstype,
                                           self.failover_ost,
                                           mountfsoptions))
        if not is_prepared('OSS'):
            lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
    def osd_remaining(self):
        out = lctl.device_list()
        for s in out:
            if string.split(s)[2] in ('obdfilter', 'obdecho'):
                return 1
        return 0

    def safe_to_clean(self):
        return self.active

    def safe_to_clean_modules(self):
        return not self.osd_remaining()
    def cleanup(self):
        if not self.active:
            debug(self.uuid, "not active")
            return
        if is_prepared(self.name):
            self.info()
            try:
                lctl.cleanup(self.name, self.uuid, config.force,
                             config.failover)
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
                e.dump()
                cleanup_error(e.rc)
        if not self.osd_remaining() and is_prepared('OSS'):
            try:
                lctl.cleanup("OSS", "OSS_UUID", config.force,
                             config.failover)
            except CommandError, e:
                print "cleanup failed: ", self.name
                e.dump()
                cleanup_error(e.rc)
        if not self.osdtype == 'obdecho':
            if self.fstype == 'smfs':
                clean_loop(self.backdevpath)
            else:
                clean_loop(self.devpath)

    def correct_level(self, level, op=None):
        return level
def mgmt_uuid_for_fs(mtpt_name):
    if not mtpt_name:
        return ''
    mtpt_db = toplustreDB.lookup_name(mtpt_name)
    fs_uuid = mtpt_db.get_first_ref('filesystem')
    fs = toplustreDB.lookup(fs_uuid)
    if not fs:
        return ''
    return fs.get_first_ref('mgmt')
# Generic client module, used by OSC and MDC
class Client(Module):
    def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
                 module_dir=None):
        self.target_name = tgtdb.getName()
        self.target_uuid = tgtdb.getUUID()
        self.db = tgtdb
        self.active = 1
        self.tgt_dev_uuid = get_active_target(tgtdb)
        if not self.tgt_dev_uuid:
            panic("No target device found for target(1):", self.target_name)

        self.kmod = kmod(config.lustre, config.portals)

        self.module = module
        self.module_name = string.upper(module)
        if not self_name:
            self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
                                         self.target_name, fs_name)
        else:
            self.name = self_name
        self.uuid = uuid
        self.lookup_server(self.tgt_dev_uuid)
        mgmt_uuid = mgmt_uuid_for_fs(fs_name)
        if mgmt_uuid:
            self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
        else:
            self.mgmt_name = ''
        self.fs_name = fs_name
        if not module_dir:
            module_dir = module
        self.add_lustre_module(module_dir, module)
    def lookup_server(self, srv_uuid):
        """ Lookup a server's network information """
        self._server_nets = get_ost_net(self.db, srv_uuid)
        if len(self._server_nets) == 0:
            panic ("Unable to find a server for:", srv_uuid)

    def get_servers(self):
        return self._server_nets
    def prepare(self, ignore_connect_failure = 0):
        self.info(self.target_uuid)
        if not config.record and is_prepared(self.name):
            return
        try:
            srv = choose_local_server(self.get_servers())
            if srv:
                lctl.connect(srv)
            else:
                routes = find_route(self.get_servers())
                if len(routes) == 0:
                    panic ("no route to", self.target_uuid)
                for (srv, r) in routes:
                    lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
        except CommandError, e:
            if not ignore_connect_failure:
                raise e
        if srv:
            if self.permits_inactive() and (self.target_uuid in config.inactive or self.active == 0):
                debug("%s inactive" % self.target_uuid)
                inactive_p = "inactive"
            else:
                debug("%s active" % self.target_uuid)
                inactive_p = ""
            lctl.newdev(self.module, self.name, self.uuid,
                        setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
                                                inactive_p, self.mgmt_name))
    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
            try:
                srv = choose_local_server(self.get_servers())
                if srv:
                    lctl.disconnect(srv)
                else:
                    for (srv, r) in find_route(self.get_servers()):
                        lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
                e.dump()
                cleanup_error(e.rc)

    def correct_level(self, level, op=None):
        return level

    def deactivate(self):
        try:
            lctl.deactivate(self.name)
        except CommandError, e:
            log(self.module_name, "deactivate failed: ", self.name)
            e.dump()
            cleanup_error(e.rc)
class MDC(Client):
    def __init__(self, db, uuid, fs_name):
        Client.__init__(self, db, uuid, 'mdc', fs_name)

    def permits_inactive(self):
        return 0

class OSC(Client):
    def __init__(self, db, uuid, fs_name):
        Client.__init__(self, db, uuid, 'osc', fs_name)

    def permits_inactive(self):
        return 1

def mgmtcli_name_for_uuid(uuid):
    return 'MGMTCLI_%s' % uuid

class ManagementClient(Client):
    def __init__(self, db, uuid):
        Client.__init__(self, db, uuid, 'mgmt_cli', '',
                        self_name = mgmtcli_name_for_uuid(db.getUUID()),
                        module_dir = 'mgmt')
class VLOV(Module):
    def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
        Module.__init__(self, 'VLOV', db)
        if name_override != None:
            self.name = "lov_%s" % name_override
        self.add_lustre_module('lov', 'lov')
        self.stripe_sz = 65536
        self.stripe_off = 0
        self.pattern = 0
        self.stripe_cnt = 1
        self.desc_uuid = self.uuid
        self.uuid = generate_client_uuid(self.name)
        self.fs_name = fs_name
        self.osc = get_osc(db, self.uuid, fs_name)
        if not self.osc:
            panic('osc not found:', self.uuid)
        if config_only:
            self.config_only = 1
            return
        self.config_only = None

    def get_uuid(self):
        return self.uuid

    def get_name(self):
        return self.name

    def prepare(self):
        if not config.record and is_prepared(self.name):
            return
        lctl.lov_setup(self.name, self.uuid, self.desc_uuid, self.stripe_cnt,
                       self.stripe_sz, self.stripe_off, self.pattern)
        target_uuid = self.osc.target_uuid
        try:
            self.osc.prepare(ignore_connect_failure=0)
        except CommandError, e:
            print "Error preparing OSC %s\n" % self.osc.uuid
            raise e
        lctl.lov_add_obd(self.name, self.uuid, target_uuid, 0, 1)

    def cleanup(self):
        target_uuid = self.osc.target_uuid
        self.osc.cleanup()
        if is_prepared(self.name):
            Module.cleanup(self)
        if self.config_only:
            panic("Can't clean up config_only LOV ", self.name)

    def load_module(self):
        if self.config_only:
            panic("Can't load modules for config_only LOV ", self.name)
        self.osc.load_module()
        Module.load_module(self)

    def cleanup_module(self):
        if self.config_only:
            panic("Can't cleanup modules for config_only LOV ", self.name)
        Module.cleanup_module(self)
        self.osc.cleanup_module()

    def correct_level(self, level, op=None):
        return level
class CMOBD(Module):
    def __init__(self,db):
        Module.__init__(self, 'CMOBD', db)
        self.name = self.db.getName()
        self.uuid = generate_client_uuid(self.name)
        self.master_uuid = self.db.get_first_ref('masterobd')
        self.cache_uuid = self.db.get_first_ref('cacheobd')
        self.add_lustre_module('cmobd', 'cmobd')
        master_obd = self.db.lookup(self.master_uuid)
        if not master_obd:
            panic('master obd not found:', self.master_uuid)
        cache_obd = self.db.lookup(self.cache_uuid)
        if not cache_obd:
            panic('cache obd not found:', self.cache_uuid)

        if master_obd.get_class() == 'ost':
            self.client_uuid = generate_client_uuid(self.name)
            self.master = VLOV(master_obd, self.client_uuid, self.name,
                               "%s_master" % (self.name))
            self.master_uuid = self.master.get_uuid()
        else:
            self.master = get_mdc(db, self.name, self.master_uuid)

    # need to check /proc/mounts and /etc/mtab before
    # formatting anything.
    # FIXME: check if device is already formatted.
    def prepare(self):
        self.master.prepare()
        if not config.record and is_prepared(self.name):
            return
        self.info(self.master_uuid, self.cache_uuid)
        lctl.newdev("cmobd", self.name, self.uuid,
                    setup ="%s %s" %(self.master_uuid,
                                     self.cache_uuid))

    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
        self.master.cleanup()

    def load_module(self):
        self.master.load_module()
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        self.master.cleanup_module()

    def correct_level(self, level, op=None):
        return level
class COBD(Module):
    def __init__(self, db, uuid, name, type, name_override = None):
        Module.__init__(self, 'COBD', db)
        self.name = self.db.getName()
        self.uuid = generate_client_uuid(self.name)
        self.real_uuid = self.db.get_first_ref('realobd')
        self.cache_uuid = self.db.get_first_ref('cacheobd')
        self.add_lustre_module('cobd', 'cobd')
        real_obd = self.db.lookup(self.real_uuid)
        if not real_obd:
            panic('real obd not found:', self.real_uuid)
        cache_obd = self.db.lookup(self.cache_uuid)
        if not cache_obd:
            panic('cache obd not found:', self.cache_uuid)
        if type == 'obd':
            self.real = LOV(real_obd, self.real_uuid, name,
                            "%s_real" % (self.name))
            self.cache = LOV(cache_obd, self.cache_uuid, name,
                             "%s_cache" % (self.name))
        else:
            self.real = get_mdc(db, name, self.real_uuid)
            self.cache = get_mdc(db, name, self.cache_uuid)

    # need to check /proc/mounts and /etc/mtab before
    # formatting anything.
    # FIXME: check if device is already formatted.
    def get_uuid(self):
        return self.uuid

    def get_name(self):
        return self.name

    def get_real_name(self):
        return self.real.name

    def get_cache_name(self):
        return self.cache.name

    def prepare(self):
        self.real.prepare()
        self.cache.prepare()
        if not config.record and is_prepared(self.name):
            return
        self.info(self.real_uuid, self.cache_uuid)
        lctl.newdev("cobd", self.name, self.uuid,
                    setup ="%s %s" %(self.real.name,
                                     self.cache.name))

    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
        self.real.cleanup()
        self.cache.cleanup()

    def load_module(self):
        self.real.load_module()
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        self.real.cleanup_module()
# virtual interface for OSC and LOV
class VOSC(Module):
    def __init__(self, db, client_uuid, name, name_override = None):
        Module.__init__(self, 'VOSC', db)
        if db.get_class() == 'lov':
            self.osc = LOV(db, client_uuid, name, name_override)
        elif db.get_class() == 'cobd':
            self.osc = COBD(db, client_uuid, name, 'obd')
        else:
            self.osc = OSC(db, client_uuid, name)

    def get_uuid(self):
        return self.osc.get_uuid()

    def get_name(self):
        return self.osc.get_name()

    def prepare(self):
        self.osc.prepare()

    def cleanup(self):
        self.osc.cleanup()

    def load_module(self):
        self.osc.load_module()

    def cleanup_module(self):
        self.osc.cleanup_module()

    def correct_level(self, level, op=None):
        return self.osc.correct_level(level, op)
# virtual interface for MDC and LMV
class VMDC(Module):
    def __init__(self, db, client_uuid, name, name_override = None):
        Module.__init__(self, 'VMDC', db)
        if db.get_class() == 'lmv':
            self.mdc = LMV(db, client_uuid, name)
        elif db.get_class() == 'cobd':
            self.mdc = COBD(db, client_uuid, name, 'mds')
        else:
            self.mdc = MDC(db, client_uuid, name)

    def get_uuid(self):
        return self.mdc.uuid

    def get_name(self):
        return self.mdc.name

    def prepare(self):
        self.mdc.prepare()

    def cleanup(self):
        self.mdc.cleanup()

    def load_module(self):
        self.mdc.load_module()

    def cleanup_module(self):
        self.mdc.cleanup_module()

    def correct_level(self, level, op=None):
        return self.mdc.correct_level(level, op)
class ECHO_CLIENT(Module):
    def __init__(self,db):
        Module.__init__(self, 'ECHO_CLIENT', db)
        self.add_lustre_module('obdecho', 'obdecho')
        self.obd_uuid = self.db.get_first_ref('obd')
        obd = self.db.lookup(self.obd_uuid)
        self.uuid = generate_client_uuid(self.name)
        self.osc = VOSC(obd, self.uuid, self.name)

    def prepare(self):
        if not config.record and is_prepared(self.name):
            return
        run_acceptors()
        self.osc.prepare() # XXX This is so cheating. -p
        self.info(self.obd_uuid)

        lctl.newdev("echo_client", self.name, self.uuid,
                    setup = self.osc.get_name())

    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
        self.osc.cleanup()

    def load_module(self):
        self.osc.load_module()
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        self.osc.cleanup_module()

    def correct_level(self, level, op=None):
        return level
def generate_client_uuid(name):
    client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
                                           name,
                                           int(random.random() * 1048576),
                                           int(random.random() * 1048576))
    return client_uuid[:36]
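# Illustrative output only (the hex fields are random): for the name
# 'client' this yields something like '4f2a1_client_0b3c41d2e7'. Names
# longer than 19 characters are truncated by the %.19s, and the final
# [:36] keeps the result within UUID-string size.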
class Mountpoint(Module):
    def __init__(self, db):
        Module.__init__(self, 'MTPT', db)
        self.path = self.db.get_val('path')
        self.clientoptions = self.db.get_val('clientoptions', '')
        self.fs_uuid = self.db.get_first_ref('filesystem')
        fs = self.db.lookup(self.fs_uuid)
        self.mds_uuid = fs.get_first_ref('lmv')
        if not self.mds_uuid:
            self.mds_uuid = fs.get_first_ref('mds')
        self.obd_uuid = fs.get_first_ref('obd')
        self.mgmt_uuid = fs.get_first_ref('mgmt')
        client_uuid = generate_client_uuid(self.name)

        ost = self.db.lookup(self.obd_uuid)
        if not ost:
            panic("no ost: ", self.obd_uuid)

        mds = self.db.lookup(self.mds_uuid)
        if not mds:
            panic("no mds: ", self.mds_uuid)

        self.add_lustre_module('mdc', 'mdc')
        self.add_lustre_module('lmv', 'lmv')
        self.add_lustre_module('llite', 'llite')

        self.vosc = VOSC(ost, client_uuid, self.name)
        self.vmdc = VMDC(mds, client_uuid, self.name)

        if self.mgmt_uuid:
            self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
                                            client_uuid)
        else:
            self.mgmtcli = None

    def prepare(self):
        if not config.record and fs_is_mounted(self.path):
            log(self.path, "already mounted.")
            return
        run_acceptors()
        if self.mgmtcli:
            self.mgmtcli.prepare()
        self.vosc.prepare()
        self.vmdc.prepare()
        vmdc_name = self.vmdc.get_name()

        self.info(self.path, self.mds_uuid, self.obd_uuid)
        if config.record or config.lctl_dump:
            lctl.mount_option(local_node_name, self.vosc.get_name(), vmdc_name)
            return

        if config.clientoptions:
            if self.clientoptions:
                self.clientoptions = self.clientoptions + ',' + \
                                     config.clientoptions
            else:
                self.clientoptions = config.clientoptions
        if self.clientoptions:
            self.clientoptions = ',' + self.clientoptions
            # Linux kernel will deal with async and not pass it to ll_fill_super,
            # so replace it with Lustre async
            self.clientoptions = string.replace(self.clientoptions, "async",
                                                "lasync")

        cmd = "mount -t lustre_lite -o osc=%s,mdc=%s%s %s %s" % \
              (self.vosc.get_name(), vmdc_name, self.clientoptions,
               config.config, self.path)
        run("mkdir", self.path)
        ret, val = run(cmd)
        if ret:
            self.vmdc.cleanup()
            self.vosc.cleanup()
            panic("mount failed:", self.path, ":", string.join(val))
    def cleanup(self):
        self.info(self.path, self.mds_uuid, self.obd_uuid)

        if config.record or config.lctl_dump:
            lctl.del_mount_option(local_node_name)
        else:
            if fs_is_mounted(self.path):
                if config.force:
                    (rc, out) = run("umount", "-f", self.path)
                else:
                    (rc, out) = run("umount", self.path)
                if rc:
                    raise CommandError('umount', out, rc)

            if fs_is_mounted(self.path):
                panic("fs is still mounted:", self.path)

        self.vmdc.cleanup()
        self.vosc.cleanup()
        if self.mgmtcli:
            self.mgmtcli.cleanup()

    def load_module(self):
        if self.mgmtcli:
            self.mgmtcli.load_module()
        self.vosc.load_module()
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        self.vosc.cleanup_module()
        if self.mgmtcli:
            self.mgmtcli.cleanup_module()

    def correct_level(self, level, op=None):
        return level
# ============================================================
# misc query functions

def get_ost_net(self, osd_uuid):
    srv_list = []
    if not osd_uuid:
        return srv_list
    osd = self.lookup(osd_uuid)
    node_uuid = osd.get_first_ref('node')
    node = self.lookup(node_uuid)
    if not node:
        panic("unable to find node for osd_uuid:", osd_uuid,
              " node_ref:", node_uuid)
    for net_uuid in node.get_networks():
        db = node.lookup(net_uuid)
        srv_list.append(Network(db))
    return srv_list
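# Despite the 'self' parameter, get_ost_net is a module-level helper: its
# first argument is a config database, as in the doRecovery call below:
#   net = choose_local_server(get_ost_net(lustreDB, new_uuid))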
# the order of initialization is based on level.
def getServiceLevel(self):
    type = self.get_class()
    ret = 0
    if type in ('network',):
        ret = 5
    elif type in ('routetbl',):
        ret = 6
    elif type in ('ldlm',):
        ret = 20
    elif type in ('mgmt',):
        ret = 25
    elif type in ('osd', 'cobd'):
        ret = 30
    elif type in ('mdsdev',):
        ret = 40
    elif type in ('lmv',):
        ret = 45
    elif type in ('cmobd',):
        ret = 50
    elif type in ('mountpoint', 'echoclient'):
        ret = 70
    else:
        panic("Unknown type: ", type)

    if ret < config.minlevel or ret > config.maxlevel:
        ret = 0
    return ret
#
# return list of services in a profile. list is a list of tuples
# [(level, db_object),]
def getServices(self):
    list = []
    for ref_class, ref_uuid in self.get_all_refs():
        servdb = self.lookup(ref_uuid)
        if servdb:
            level = getServiceLevel(servdb)
            if level > 0:
                list.append((level, servdb))
        else:
            panic('service not found: ' + ref_uuid)

    list.sort()
    return list
############################################################
# MDC, OSC manager functions
# FIXME: clean this mess up!
#
# OSC is no longer in the xml, so we have to fake it.
# this is getting ugly and begging for another refactoring
def get_osc(ost_db, uuid, fs_name):
    osc = OSC(ost_db, uuid, fs_name)
    return osc

def get_mdc(db, fs_name, mds_uuid):
    mds_db = db.lookup(mds_uuid)
    if not mds_db:
        error("no mds:", mds_uuid)
    mdc = MDC(mds_db, mds_uuid, fs_name)
    return mdc
############################################################
# routing ("rooting")

# list of (nettype, cluster_id, nid)
local_clusters = []

def find_local_clusters(node_db):
    global local_clusters
    for netuuid in node_db.get_networks():
        net = node_db.lookup(netuuid)
        srv = Network(net)
        debug("add_local", netuuid)
        local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
        if srv.port > 0:
            if acceptors.has_key(srv.port):
                panic("duplicate port:", srv.port)
            acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
                                                  srv.send_mem, srv.recv_mem,
                                                  srv.irq_affinity)
# This node is a gateway.
is_router = 0
def node_is_router():
    return is_router

# If there are any routers found in the config, then this will be true
# and all nodes will load kptlrouter.
needs_router = 0
def node_needs_router():
    return needs_router or is_router
# list of (nettype, gw, tgt_cluster_id, lo, hi)
# Currently, these local routes are only added to the kptlrouter route
# table if they are needed to connect to a specific server. This
# should be changed so all available routes are loaded, and the
# ptlrouter can make all the decisions.
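# e.g. ('tcp', <gateway nid>, <target cluster id>, <lo nid>, <hi nid>):
# traffic for nids in [lo, hi] on that cluster is sent via the gateway.
# This is the tuple shape find_route() matches against below.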
local_routes = []

def find_local_routes(lustre):
    """ Scan the lustre config looking for routers. Build a list of
    gateways. """
    global local_routes, needs_router
    local_routes = []
    list = lustre.lookup_class('node')
    for router in list:
        if router.get_val_int('router', 0):
            needs_router = 1
            for (local_type, local_cluster_id, local_nid) in local_clusters:
                gw = None
                for netuuid in router.get_networks():
                    db = router.lookup(netuuid)
                    if (local_type == db.get_val('nettype') and
                        local_cluster_id == db.get_val('clusterid')):
                        gw = db.get_val('nid')
                        break
                if gw:
                    debug("find_local_routes: gw is", gw)
                    for route in router.get_local_routes(local_type, gw):
                        local_routes.append(route)
    debug("find_local_routes:", local_routes)
def choose_local_server(srv_list):
    for srv in srv_list:
        if local_cluster(srv.net_type, srv.cluster_id):
            return srv

def local_cluster(net_type, cluster_id):
    for cluster in local_clusters:
        if net_type == cluster[0] and cluster_id == cluster[1]:
            return 1
    return 0

def local_interface(net_type, cluster_id, nid):
    for cluster in local_clusters:
        if (net_type == cluster[0] and cluster_id == cluster[1]
            and nid == cluster[2]):
            return 1
    return 0

def find_route(srv_list):
    result = []
    frm_type = local_clusters[0][0]
    for srv in srv_list:
        debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
        to_type = srv.net_type
        to = srv.nid
        cluster_id = srv.cluster_id
        debug('looking for route to', to_type, to)
        for r in local_routes:
            debug("find_route: ", r)
            if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
                result.append((srv, r))
    return result
def get_active_target(db):
    target_uuid = db.getUUID()
    target_name = db.getName()
    node_name = get_select(target_name)
    if node_name:
        tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
    else:
        tgt_dev_uuid = db.get_first_ref('active')
    return tgt_dev_uuid

def get_server_by_nid_uuid(db, nid_uuid):
    for n in db.lookup_class("network"):
        net = Network(n)
        if net.nid_uuid == nid_uuid:
            return net
    return None
############################################################
# lconf level logic
# Start a service.
def newService(db):
    type = db.get_class()
    debug('Service:', type, db.getName(), db.getUUID())
    n = None
    if type == 'ldlm':
        n = LDLM(db)
    elif type == 'lov':
        n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
    elif type == 'network':
        n = Network(db)
    elif type == 'routetbl':
        n = RouteTable(db)
    elif type == 'osd':
        n = OSD(db)
    elif type == 'cobd':
        n = COBD(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID", "", 'obd')
    elif type == 'cmobd':
        n = CMOBD(db)
    elif type == 'mdsdev':
        n = MDSDEV(db)
    elif type == 'mountpoint':
        n = Mountpoint(db)
    elif type == 'echoclient':
        n = ECHO_CLIENT(db)
    elif type == 'mgmt':
        n = Management(db)
    elif type == 'lmv':
        n = LMV(db)
    else:
        panic("unknown service type:", type)
    return n
#
# Prepare the system to run lustre using a particular profile
# in the configuration.
#  * load the modules
#  * setup networking for the current node
#  * make sure partitions are in place and prepared
#  * initialize devices with lctl
# Levels are important, and need to be enforced.
def for_each_profile(db, prof_list, operation):
    for prof_uuid in prof_list:
        prof_db = db.lookup(prof_uuid)
        if not prof_db:
            panic("profile:", prof_uuid, "not found.")
        services = getServices(prof_db)
        operation(services)
def magic_get_osc(db, rec, lov):
    if lov:
        lov_uuid = lov.get_uuid()
        lov_name = lov.osc.fs_name
    else:
        lov_uuid = rec.getAttribute('lov_uuidref')
        # FIXME: better way to find the mountpoint?
        filesystems = db.root_node.getElementsByTagName('filesystem')
        fsuuid = None
        for fs in filesystems:
            ref = fs.getElementsByTagName('obd_ref')
            if ref[0].getAttribute('uuidref') == lov_uuid:
                fsuuid = fs.getAttribute('uuid')
                break

        if not fsuuid:
            panic("malformed xml: lov uuid '" + lov_uuid + "' referenced in 'add' record is not used by any filesystems.")

        mtpts = db.root_node.getElementsByTagName('mountpoint')
        lov_name = None
        for fs in mtpts:
            ref = fs.getElementsByTagName('filesystem_ref')
            if ref[0].getAttribute('uuidref') == fsuuid:
                lov_name = fs.getAttribute('name')
                break

        if not lov_name:
            panic("malformed xml: 'add' record references lov uuid '" + lov_uuid + "', which references filesystem uuid '" + fsuuid + "', which does not reference a mountpoint.")

    print "lov_uuid: " + lov_uuid + "; lov_name: " + lov_name

    ost_uuid = rec.getAttribute('ost_uuidref')
    obd = db.lookup(ost_uuid)

    if not obd:
        panic("malformed xml: 'add' record references ost uuid '" + ost_uuid + "' which cannot be found.")

    osc = get_osc(obd, lov_uuid, lov_name)
    if not osc:
        panic('osc not found:', ost_uuid)
    return osc
# write logs for update records. sadly, logs of all types -- and updates in
# particular -- are something of an afterthought. lconf needs to be rewritten
# with these as core concepts. so this is a pretty big hack.
def process_update_record(db, update, lov):
    for rec in update.childNodes:
        if rec.nodeType != rec.ELEMENT_NODE:
            continue

        log("found " + rec.nodeName + " record in update version " +
            str(update.getAttribute('version')))

        lov_uuid = rec.getAttribute('lov_uuidref')
        ost_uuid = rec.getAttribute('ost_uuidref')
        index = rec.getAttribute('index')
        gen = rec.getAttribute('generation')

        if not lov_uuid or not ost_uuid or not index or not gen:
            panic("malformed xml: 'update' record requires lov_uuid, ost_uuid, index, and generation.")

        if not lov:
            tmplov = db.lookup(lov_uuid)
            if not tmplov:
                panic("malformed xml: 'delete' record contains lov UUID '" + lov_uuid + "', which cannot be located.")
            lov_name = tmplov.getName()
        else:
            lov_name = lov.osc.name

        # ------------------------------------------------------------- add
        if rec.nodeName == 'add':
            if config.cleanup:
                lctl.lov_del_obd(lov_name, lov_uuid, ost_uuid, index, gen)
                continue

            osc = magic_get_osc(db, rec, lov)

            try:
                # Only ignore connect failures with --force, which
                # isn't implemented here yet.
                osc.prepare(ignore_connect_failure=0)
            except CommandError, e:
                print "Error preparing OSC %s\n" % osc.uuid
                raise e

            lctl.lov_add_obd(lov_name, lov_uuid, ost_uuid, index, gen)

        # ------------------------------------------------------ deactivate
        elif rec.nodeName == 'deactivate':
            if config.cleanup:
                continue

            osc = magic_get_osc(db, rec, lov)

            try:
                osc.deactivate()
            except CommandError, e:
                print "Error deactivating OSC %s\n" % osc.uuid
                raise e

        # ---------------------------------------------------------- delete
        elif rec.nodeName == 'delete':
            if config.cleanup:
                continue

            osc = magic_get_osc(db, rec, lov)

            try:
                config.cleanup = 1
                osc.cleanup()
                config.cleanup = 0
            except CommandError, e:
                print "Error cleaning up OSC %s\n" % osc.uuid
                raise e

            lctl.lov_del_obd(lov_name, lov_uuid, ost_uuid, index, gen)
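# The records handled above come from <update> elements in the config XML,
# e.g. (illustrative):
#   <update version="2">
#     <add lov_uuidref="lov1_UUID" ost_uuidref="ost1_UUID"
#          index="1" generation="2"/>
#   </update>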
def process_updates(db, log_device, log_name, lov = None):
    updates = db.root_node.getElementsByTagName('update')
    for u in updates:
        if not u.childNodes:
            log("ignoring empty update record (version " +
                str(u.getAttribute('version')) + ")")
            continue

        version = u.getAttribute('version')
        real_name = "%s-%s" % (log_name, version)
        lctl.clear_log(log_device, real_name)
        lctl.record(log_device, real_name)

        process_update_record(db, u, lov)

        lctl.end_record()
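# Each update version gets its own config log: version 2 of a log named
# "client" is recorded as "client-2" on the record device.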
def doWriteconf(services):
    for s in services:
        if s[1].get_class() == 'mdsdev':
            n = newService(s[1])
            n.write_conf()

def doSetup(services):
    if config.nosetup:
        return
    slist = []
    for s in services:
        n = newService(s[1])
        n.level = s[0]
        slist.append((n.level, n))
    nlist = []
    for n in slist:
        nl = n[1].correct_level(n[0])
        nlist.append((nl, n[1]))
    nlist.sort()
    for n in nlist:
        n[1].prepare()

def doModules(services):
    if config.nomod:
        return
    for s in services:
        n = newService(s[1])
        n.load_module()

def doCleanup(services):
    if config.nosetup:
        return
    slist = []
    for s in services:
        n = newService(s[1])
        n.level = s[0]
        slist.append((n.level, n))
    nlist = []
    for n in slist:
        nl = n[1].correct_level(n[0])
        nlist.append((nl, n[1]))
    nlist.sort()
    nlist.reverse()
    for n in nlist:
        if n[1].safe_to_clean():
            n[1].cleanup()

def doUnloadModules(services):
    if config.nomod:
        return
    services.reverse()
    for s in services:
        n = newService(s[1])
        if n.safe_to_clean_modules():
            n.cleanup_module()
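# doSetup brings services up in increasing (corrected) level order, while
# doCleanup sorts and then reverses the list so teardown happens in the
# opposite order; module load/unload follows the same symmetry.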
def doHost(lustreDB, hosts):
    global is_router, local_node_name
    node_db = None
    for h in hosts:
        node_db = lustreDB.lookup_name(h, 'node')
        if node_db:
            break
    if not node_db:
        panic('No host entry found.')

    local_node_name = node_db.get_val('name', 0)
    is_router = node_db.get_val_int('router', 0)
    lustre_upcall = node_db.get_val('lustreUpcall', '')
    portals_upcall = node_db.get_val('portalsUpcall', '')
    timeout = node_db.get_val_int('timeout', 0)
    ptldebug = node_db.get_val('ptldebug', '')
    subsystem = node_db.get_val('subsystem', '')

    find_local_clusters(node_db)
    if not is_router:
        find_local_routes(lustreDB)

    # Two step process: (1) load modules, (2) setup lustre
    # if not cleaning, load modules first.
    prof_list = node_db.get_refs('profile')

    if config.write_conf:
        for_each_profile(node_db, prof_list, doModules)
        for_each_profile(node_db, prof_list, doWriteconf)
        for_each_profile(node_db, prof_list, doUnloadModules)

    elif config.recover:
        if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
            raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
                                     "--client_uuid <UUID> --conn_uuid <UUID>")
        doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
                   config.conn_uuid)
    elif config.cleanup:
        if config.force:
            # the command line can override this value
            timeout = 5
        # ugly hack, only need to run lctl commands for --dump
        if config.lctl_dump or config.record:
            for_each_profile(node_db, prof_list, doCleanup)
            return

        sys_set_timeout(timeout)
        sys_set_ptldebug(ptldebug)
        sys_set_subsystem(subsystem)
        sys_set_lustre_upcall(lustre_upcall)
        sys_set_portals_upcall(portals_upcall)

        for_each_profile(node_db, prof_list, doCleanup)
        for_each_profile(node_db, prof_list, doUnloadModules)
    else:
        # ugly hack, only need to run lctl commands for --dump
        if config.lctl_dump or config.record:
            sys_set_timeout(timeout)
            sys_set_lustre_upcall(lustre_upcall)
            for_each_profile(node_db, prof_list, doSetup)
            return

        sys_make_devices()
        sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
        sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)

        for_each_profile(node_db, prof_list, doModules)

        sys_set_debug_path()
        sys_set_ptldebug(ptldebug)
        sys_set_subsystem(subsystem)
        script = config.gdb_script
        run(lctl.lctl, ' modules >', script)
        if config.gdb:
            log("The GDB module script is in", script)
            # pause, so user has time to break and
            # load the script
            time.sleep(5)
        sys_set_timeout(timeout)
        sys_set_lustre_upcall(lustre_upcall)
        sys_set_portals_upcall(portals_upcall)

        for_each_profile(node_db, prof_list, doSetup)
def doRecovery(lustreDB, lctl, tgt_uuid, client_uuid, nid_uuid):
    tgt = lustreDB.lookup(tgt_uuid)
    if not tgt:
        raise Lustre.LconfError("doRecovery: " + tgt_uuid + " not found.")
    new_uuid = get_active_target(tgt)
    if not new_uuid:
        raise Lustre.LconfError("doRecovery: no active target found for: " +
                                tgt_uuid)
    net = choose_local_server(get_ost_net(lustreDB, new_uuid))
    if not net:
        raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)

    log("Reconnecting", tgt_uuid, " to ", net.nid_uuid)
    try:
        oldnet = get_server_by_nid_uuid(lustreDB, nid_uuid)
        if oldnet:
            lctl.disconnect(oldnet)
    except CommandError, e:
        log("recover: disconnect", nid_uuid, "failed: ")
        e.dump()

    try:
        lctl.connect(net)
    except CommandError, e:
        log("recover: connect failed")
        e.dump()

    lctl.recover(client_uuid, net.nid_uuid)
def setupModulePath(cmd, portals_dir = PORTALS_DIR):
    base = os.path.dirname(cmd)
    if development_mode():
        if not config.lustre:
            debug('using objdir module paths')
            config.lustre = (os.path.join(base, ".."))
        # normalize the portals dir, using command line arg if set
        if config.portals:
            portals_dir = config.portals
        dir = os.path.join(config.lustre, portals_dir)
        config.portals = dir
        debug('config.portals', config.portals)
    elif config.lustre and config.portals:
        # production mode
        # if --lustre and --portals, normalize portals
        # can ignore PORTALS_DIR here, since it is probably useless here
        config.portals = os.path.join(config.lustre, config.portals)
        debug('config.portals B', config.portals)
def sysctl(path, val):
    debug("+ sysctl", path, val)
    if config.noexec:
        return
    try:
        fp = open(os.path.join('/proc/sys', path), 'w')
        fp.write(str(val) + '\n')
        fp.close()
    except IOError, e:
        panic(str(e))

def sys_set_debug_path():
    sysctl('portals/debug_path', config.debug_path)
def sys_set_lustre_upcall(upcall):
    # the command line overrides the value in the node config
    if config.lustre_upcall:
        upcall = config.lustre_upcall
    elif config.upcall:
        upcall = config.upcall
    if upcall:
        lctl.set_lustre_upcall(upcall)

def sys_set_portals_upcall(upcall):
    # the command line overrides the value in the node config
    if config.portals_upcall:
        upcall = config.portals_upcall
    elif config.upcall:
        upcall = config.upcall
    if upcall:
        sysctl('portals/upcall', upcall)

def sys_set_timeout(timeout):
    # the command line overrides the value in the node config
    if config.timeout and config.timeout > 0:
        timeout = config.timeout
    if timeout != None and timeout > 0:
        lctl.set_timeout(timeout)
def sys_tweak_socknal():
    if config.single_socket:
        sysctl("socknal/typed", 0)

def sys_optimize_elan():
    procfiles = ["/proc/elan/config/eventint_punt_loops",
                 "/proc/qsnet/elan3/config/eventint_punt_loops",
                 "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]
    for p in procfiles:
        if os.access(p, os.R_OK):
            run("echo 1 > " + p)
def sys_set_ptldebug(ptldebug):
    if config.ptldebug:
        ptldebug = config.ptldebug
    if ptldebug:
        try:
            val = eval(ptldebug, ptldebug_names)
            val = "0x%x" % (val)
            sysctl('portals/debug', val)
        except NameError, e:
            panic(str(e))
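# The eval() above lets debug masks be written symbolically: with
# ptldebug_names as the namespace, a string like "dlmtrace|vfstrace"
# evaluates to the bitwise OR of the corresponding mask values from the
# table at the top of this file.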
def sys_set_subsystem(subsystem):
    if config.subsystem:
        subsystem = config.subsystem
    if subsystem:
        try:
            val = eval(subsystem, subsystem_names)
            val = "0x%x" % (val)
            sysctl('portals/subsystem_debug', val)
        except NameError, e:
            panic(str(e))
def sys_set_netmem_max(path, max):
    debug("setting", path, "to at least", max)
    if config.noexec:
        return
    fp = open(path)
    line = fp.readline()
    fp.close()
    cur = int(string.split(line)[0])
    if max > cur:
        fp = open(path, 'w')
        fp.write('%d\n' % (max))
        fp.close()
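# Only ever raises the limit: the current value is read first and the
# file rewritten only when it is below the requested maximum (used for
# rmem_max/wmem_max in doHost above).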
def sys_make_devices():
    if not os.access('/dev/portals', os.R_OK):
        run('mknod /dev/portals c 10 240')
    if not os.access('/dev/obd', os.R_OK):
        run('mknod /dev/obd c 10 241')
# Add dir to the global PATH, if not already there.
def add_to_path(new_dir):
    syspath = string.split(os.environ['PATH'], ':')
    if new_dir in syspath:
        return
    os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir

def default_debug_path():
    path = '/tmp/lustre-log'
    if os.path.isdir('/r'):
        return '/r' + path
    else:
        return path

def default_gdb_script():
    script = '/tmp/ogdb'
    if os.path.isdir('/r'):
        return '/r' + script
    else:
        return script
DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
# ensure basic elements are in the system path
def sanitise_path():
    for dir in DEFAULT_PATH:
        add_to_path(dir)

# global hack for the --select handling
tgt_select = {}
def init_select(args):
    # args = [service=nodeA,service2=nodeB service3=nodeC]
    global tgt_select
    for arg in args:
        list = string.split(arg, ',')
        for entry in list:
            srv, node = string.split(entry, '=')
            tgt_select[srv] = node
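# e.g. "--select mds1=node-a,ost1=node-b" populates
# tgt_select = {'mds1': 'node-a', 'ost1': 'node-b'}, which get_select()
# consults when picking the active device for a failover target.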
def get_select(srv):
    if tgt_select.has_key(srv):
        return tgt_select[srv]
    return None
FLAG = Lustre.Options.FLAG
PARAM = Lustre.Options.PARAM
INTPARAM = Lustre.Options.INTPARAM
PARAMLIST = Lustre.Options.PARAMLIST
lconf_options = [
    ('verbose,v', "Print system commands as they are run"),
    ('ldapurl', "LDAP server URL, e.g. ldap://localhost", PARAM),
    ('config', "Cluster config name used for LDAP query", PARAM),
    ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
    ('node', "Load config for <nodename>", PARAM),
    ('cleanup,d', "Cleans up config. (Shutdown)"),
    ('force,f', "Forced unmounting and/or obd detach during cleanup",
     FLAG, 0),
    ('single_socket', "socknal option: only use one socket instead of bundle",
     FLAG, 0),
    ('failover', """Used to shut down without saving state.
                    This will allow this node to "give up" a service to
                    another node for failover purposes. This will not
                    be a clean shutdown.""",
     FLAG, 0),
    ('gdb', """Prints message after creating gdb module script
               and sleeps for 5 seconds."""),
    ('noexec,n', """Prints the commands and steps that will be run for a
                    config without executing them. This can be used to check
                    if a config file is doing what it should be doing"""),
    ('nomod', "Skip load/unload module step."),
    ('nosetup', "Skip device setup/cleanup step."),
    ('reformat', "Reformat all devices (without question)"),
    ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
    ('mountfsoptions', "Additional options for mount fs command line", PARAM),
    ('clientoptions', "Additional options for Lustre", PARAM),
    ('dump', "Dump the kernel debug log to file before portals is unloaded",
     PARAM),
    ('write_conf', "Save all the client config information on mds."),
    ('record', "Write config information on mds."),
    ('record_log', "Name of config record log.", PARAM),
    ('record_device', "MDS device name that will record the config commands",
     PARAM),
    ('root_squash', "MDS squash root to appointed uid",
     PARAM),
    ('no_root_squash', "Don't squash root for appointed nid",
     PARAM),
    ('minlevel', "Minimum level of services to configure/cleanup",
     INTPARAM, 0),
    ('maxlevel', """Maximum level of services to configure/cleanup
                    Levels are approximately like:
                            10 - network
                            20 - device, ldlm
                            30 - osd, mdd
                            40 - mds, ost
                            70 - mountpoint, echo_client, osc, mdc, lov""",
     INTPARAM, 100),
    ('lustre', """Base directory of lustre sources. This parameter will
                  cause lconf to load modules from a source tree.""", PARAM),
    ('portals', """Portals source directory. If this is a relative path,
                   then it is assumed to be relative to lustre. """, PARAM),
    ('timeout', "Set recovery timeout", INTPARAM),
    ('upcall', "Set both portals and lustre upcall script", PARAM),
    ('lustre_upcall', "Set lustre upcall script", PARAM),
    ('portals_upcall', "Set portals upcall script", PARAM),
    ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
    ('ptldebug', "Set the portals debug level", PARAM),
    ('subsystem', "Set the portals debug subsystem", PARAM),
    ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
    ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
    # Client recovery options
    ('recover', "Recover a device"),
    ('group', "The group of devices to configure or cleanup", PARAM),
    ('tgt_uuid', "The failed target (required for recovery)", PARAM),
    ('client_uuid', "The failed client (required for recovery)", PARAM),
    ('conn_uuid', "The failed connection (required for recovery)", PARAM),

    ('inactive', """The name of an inactive service, to be ignored during
                    mounting (currently OST-only). Can be repeated.""",
     PARAMLIST),
    ]
def main():
    global lctl, config, toplustreDB, CONFIG_FILE

    # in the upcall this is set to SIG_IGN
    signal.signal(signal.SIGCHLD, signal.SIG_DFL)

    cl = Lustre.Options("lconf", "config.xml", lconf_options)
    try:
        config, args = cl.parse(sys.argv[1:])
    except Lustre.OptionError, e:
        print e
        sys.exit(1)

    setupModulePath(sys.argv[0])

    host = socket.gethostname()

    # the PRNG is normally seeded with time(), which is not so good for starting
    # time-synchronized clusters
    input = open('/dev/urandom', 'r')
    if not input:
        print 'Unable to open /dev/urandom!'
        sys.exit(1)
    seed = input.read(32)
    input.close()
    random.seed(seed)

    sanitise_path()

    init_select(config.select)

    if args:
        # allow config to be fetched via HTTP, but only with python2
        if sys.version[0] != '1' and args[0].startswith('http://'):
            import urllib2
            try:
                config_file = urllib2.urlopen(args[0])
            except (urllib2.URLError, socket.error), err:
                if hasattr(err, 'args'):
                    err = err.args[1]
                print "Could not access '%s': %s" % (args[0], err)
                sys.exit(1)
        elif not os.access(args[0], os.R_OK):
            print 'File not found or readable:', args[0]
            sys.exit(1)
        else:
            # regular file
            config_file = open(args[0], 'r')
        try:
            dom = xml.dom.minidom.parse(config_file)
        except Exception:
            panic("%s does not appear to be a config file." % (args[0]))
            sys.exit(1) # make sure to die here, even in debug mode.

        CONFIG_FILE = args[0]
        lustreDB = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
        if not config.config:
            config.config = os.path.basename(args[0]) # use full path?
            if config.config[-4:] == '.xml':
                config.config = config.config[:-4]
    elif config.ldapurl:
        if not config.config:
            panic("--ldapurl requires --config name")
        dn = "config=%s,fs=lustre" % (config.config)
        lustreDB = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
    elif config.ptldebug or config.subsystem:
        sys_set_ptldebug(None)
        sys_set_subsystem(None)
        sys.exit(0)
    else:
        print 'Missing config file or ldap URL.'
        print 'see lconf --help for command summary'
        sys.exit(1)

    toplustreDB = lustreDB

    ver = lustreDB.get_version()
    if not ver:
        panic("No version found in config data, please recreate.")
    if ver != Lustre.CONFIG_VERSION:
        panic("Config version", ver, "does not match lconf version",
              Lustre.CONFIG_VERSION)

    node_list = []
    if config.node:
        node_list.append(config.node)
    else:
        if len(host) > 0:
            node_list.append(host)
        node_list.append('localhost')

    debug("configuring for host: ", node_list)

    if len(host) > 0:
        config.debug_path = config.debug_path + '-' + host
        config.gdb_script = config.gdb_script + '-' + host

    lctl = LCTLInterface('lctl')

    if config.lctl_dump:
        lctl.use_save_file(config.lctl_dump)

    if config.record:
        if not (config.record_device and config.record_log):
            panic("When recording, both --record_log and --record_device must be specified.")
        lctl.clear_log(config.record_device, config.record_log)
        lctl.record(config.record_device, config.record_log)

    doHost(lustreDB, node_list)

    if not config.record:
        return

    lctl.end_record()
    process_updates(lustreDB, config.record_device, config.record_log)
if __name__ == "__main__":
    try:
        main()
    except Lustre.LconfError, e:
        print e
        # traceback.print_exc(file=sys.stdout)
        sys.exit(1)
    except CommandError, e:
        e.dump()
        sys.exit(e.rc)

    if first_cleanup_error:
        sys.exit(first_cleanup_error)