3 # Copyright (C) 2002-2003 Cluster File Systems, Inc.
4 # Authors: Robert Read <rread@clusterfs.com>
5 # Mike Shaver <shaver@clusterfs.com>
6 # This file is part of Lustre, http://www.lustre.org.
8 # Lustre is free software; you can redistribute it and/or
9 # modify it under the terms of version 2 of the GNU General Public
10 # License as published by the Free Software Foundation.
12 # Lustre is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Lustre; if not, write to the Free Software
19 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 # lconf - lustre configuration tool
23 # lconf is the main driver script for starting and stopping
24 # lustre filesystem services.
26 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import sys, getopt, types
29 import string, os, stat, popen2, socket, time, random, fcntl, select
30 import re, exceptions, signal, traceback
31 import xml.dom.minidom
33 if sys.version[0] == '1':
34 from FCNTL import F_GETFL, F_SETFL
36 from fcntl import F_GETFL, F_SETFL
38 PYMOD_DIR = "/usr/lib/lustre/python"
40 def development_mode():
41 base = os.path.dirname(sys.argv[0])
42 if os.access(base+"/Makefile", os.R_OK):
46 if development_mode():
47 sys.path.append('../utils')
49 sys.path.append(PYMOD_DIR)
55 DEFAULT_TCPBUF = 8388608
58 # Maximum number of devices to search for.
59 # (the /dev/loop* nodes need to be created beforehand)
60 MAX_LOOP_DEVICES = 256
61 PORTALS_DIR = 'portals'
63 # Needed to call lconf --record
66 # Please keep these in sync with the values in portals/kp30.h
78 "warning" : (1 << 10),
82 "portals" : (1 << 14),
84 "dlmtrace" : (1 << 16),
88 "rpctrace" : (1 << 20),
89 "vfstrace" : (1 << 21),
95 "undefined" : (1 << 0),
105 "portals" : (1 << 10),
106 "socknal" : (1 << 11),
107 "qswnal" : (1 << 12),
108 "pinger" : (1 << 13),
109 "filter" : (1 << 14),
115 "ptlrouter" : (1 << 20),
# Holds the status of the first cleanup step that failed, so the
# overall exit code reflects the earliest error even if later
# cleanup operations also fail (or succeed).
first_cleanup_error = 0

def cleanup_error(rc):
    """Record rc as the process-wide cleanup error code.

    Only the first non-zero code is kept; subsequent calls are no-ops.
    """
    global first_cleanup_error
    if not first_cleanup_error:
        first_cleanup_error = rc
128 # ============================================================
129 # debugging and error funcs
def fixme(msg="this feature"):
    """Abort with a LconfError naming an unimplemented feature.

    Uses the callable exception form (valid in both Python 2 and 3)
    instead of the old comma raise statement.
    """
    raise Lustre.LconfError(msg + ' not implemented yet.')
135 msg = string.join(map(str,args))
136 if not config.noexec:
137 raise Lustre.LconfError(msg)
142 msg = string.join(map(str,args))
147 print string.strip(s)
151 msg = string.join(map(str,args))
154 # ack, python's builtin int() does not support '0x123' syntax.
155 # eval can do it, although what a hack!
159 return eval(s, {}, {})
162 except SyntaxError, e:
163 raise ValueError("not a number")
165 raise ValueError("not a number")
167 # ============================================================
168 # locally defined exceptions
169 class CommandError (exceptions.Exception):
170 def __init__(self, cmd_name, cmd_err, rc=None):
171 self.cmd_name = cmd_name
172 self.cmd_err = cmd_err
177 if type(self.cmd_err) == types.StringType:
179 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
181 print "! %s: %s" % (self.cmd_name, self.cmd_err)
182 elif type(self.cmd_err) == types.ListType:
184 print "! %s (error %d):" % (self.cmd_name, self.rc)
186 print "! %s:" % (self.cmd_name)
187 for s in self.cmd_err:
188 print "> %s" %(string.strip(s))
193 # ============================================================
194 # handle daemons, like the acceptor
196 """ Manage starting and stopping a daemon. Assumes daemon manages
197 it's own pid file. """
199 def __init__(self, cmd):
205 log(self.command, "already running.")
207 self.path = find_prog(self.command)
209 panic(self.command, "not found.")
210 ret, out = runcmd(self.path +' '+ self.command_line())
212 raise CommandError(self.path, out, ret)
216 pid = self.read_pidfile()
218 log ("killing process", pid)
220 #time.sleep(1) # let daemon die
222 log("unable to kill", self.command, e)
224 log("unable to kill", self.command)
227 pid = self.read_pidfile()
237 def read_pidfile(self):
239 fp = open(self.pidfile(), 'r')
246 def clean_pidfile(self):
247 """ Remove a stale pidfile """
248 log("removing stale pidfile:", self.pidfile())
250 os.unlink(self.pidfile())
252 log(self.pidfile(), e)
254 class AcceptorHandler(DaemonHandler):
255 def __init__(self, port, net_type, send_mem, recv_mem, irq_aff):
256 DaemonHandler.__init__(self, "acceptor")
259 self.send_mem = send_mem
260 self.recv_mem = recv_mem
263 self.flags = self.flags + ' -i'
266 return "/var/run/%s-%d.pid" % (self.command, self.port)
def command_line(self):
    """Build the acceptor daemon's argument string.

    Produces: -s <send_mem> -r <recv_mem> <flags> <port>,
    space-joined exactly as the removed-in-py3 string.join() did.
    """
    args = ('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port)
    return ' '.join(map(str, args))
273 # start the acceptors
275 if config.lctl_dump or config.record:
277 for port in acceptors.keys():
278 daemon = acceptors[port]
279 if not daemon.running():
282 def run_one_acceptor(port):
283 if config.lctl_dump or config.record:
285 if acceptors.has_key(port):
286 daemon = acceptors[port]
287 if not daemon.running():
290 panic("run_one_acceptor: No acceptor defined for port:", port)
292 def stop_acceptor(port):
293 if acceptors.has_key(port):
294 daemon = acceptors[port]
299 # ============================================================
300 # handle lctl interface
303 Manage communication with lctl
306 def __init__(self, cmd):
308 Initialize close by finding the lctl binary.
310 self.lctl = find_prog(cmd)
312 self.record_device = ''
315 debug('! lctl not found')
318 raise CommandError('lctl', "unable to find lctl binary.")
320 def use_save_file(self, file):
321 self.save_file = file
def record(self, dev_name, logname):
    """Start recording subsequent lctl commands into log `logname`
    on device `dev_name` (used by the --record feature)."""
    log("Recording log", logname, "on", dev_name)
    self.record_device = dev_name
    self.record_log = logname
def end_record(self):
    """Stop recording lctl commands: log the fact, then clear the
    record device/log markers so later commands run normally."""
    log("End recording log", self.record_log, "on", self.record_device)
    self.record_device = None
    self.record_log = None
def set_nonblock(self, fd):
    """Switch file descriptor `fd` into non-blocking mode.

    Reads the current flags and ORs in O_NDELAY (== O_NONBLOCK on
    Linux) so reads from lctl's pipes return immediately.
    """
    fl = fcntl.fcntl(fd, F_GETFL)
    fcntl.fcntl(fd, F_SETFL, fl | os.O_NDELAY)
340 the cmds are written to stdin of lctl
341 lctl doesn't return errors when run in script mode, so
343 should modify command line to accept multiple commands, or
344 create complex command line options
348 cmds = '\n dump ' + self.save_file + '\n' + cmds
349 elif self.record_device:
353 %s""" % (self.record_device, self.record_log, cmds)
355 debug("+", cmd_line, cmds)
356 if config.noexec: return (0, [])
358 child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
359 child.tochild.write(cmds + "\n")
360 child.tochild.close()
361 # print "LCTL:", cmds
363 # From "Python Cookbook" from O'Reilly
364 outfile = child.fromchild
365 outfd = outfile.fileno()
366 self.set_nonblock(outfd)
367 errfile = child.childerr
368 errfd = errfile.fileno()
369 self.set_nonblock(errfd)
371 outdata = errdata = ''
374 ready = select.select([outfd,errfd],[],[]) # Wait for input
375 if outfd in ready[0]:
376 outchunk = outfile.read()
377 if outchunk == '': outeof = 1
378 outdata = outdata + outchunk
379 if errfd in ready[0]:
380 errchunk = errfile.read()
381 if errchunk == '': erreof = 1
382 errdata = errdata + errchunk
383 if outeof and erreof: break
384 # end of "borrowed" code
387 if os.WIFEXITED(ret):
388 rc = os.WEXITSTATUS(ret)
391 if rc or len(errdata):
392 raise CommandError(self.lctl, errdata, rc)
395 def runcmd(self, *args):
397 run lctl using the command line
399 cmd = string.join(map(str,args))
400 debug("+", self.lctl, cmd)
401 rc, out = run(self.lctl, cmd)
403 raise CommandError(self.lctl, out, rc)
407 def clear_log(self, dev, log):
408 """ clear an existing log """
413 quit """ % (dev, log)
416 def network(self, net, nid):
421 quit """ % (net, nid)
424 def root_squash(self, name, uid, nid):
428 quit""" % (name, uid, nid)
431 # create a new connection
432 def add_uuid(self, net_type, uuid, nid):
433 cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type)
436 def add_autoconn(self, net_type, send_mem, recv_mem, nid, hostaddr,
438 if net_type in ('tcp',) and not config.lctl_dump:
443 add_autoconn %s %s %d %s
447 nid, hostaddr, port, flags )
450 def connect(self, srv):
451 self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
452 if srv.net_type in ('tcp',) and not config.lctl_dump:
456 self.add_autoconn(srv.net_type, srv.send_mem, srv.recv_mem,
457 srv.nid, srv.hostaddr, srv.port, flags)
460 def recover(self, dev_name, new_conn):
463 recover %s""" %(dev_name, new_conn)
466 # add a route to a range
467 def add_route(self, net, gw, lo, hi):
475 except CommandError, e:
479 def del_route(self, net, gw, lo, hi):
484 quit """ % (net, gw, lo, hi)
487 # add a route to a host
488 def add_route_host(self, net, uuid, gw, tgt):
489 self.add_uuid(net, uuid, tgt)
497 except CommandError, e:
501 # add a route to a range
502 def del_route_host(self, net, uuid, gw, tgt):
508 quit """ % (net, gw, tgt)
512 def del_autoconn(self, net_type, nid, hostaddr):
513 if net_type in ('tcp',) and not config.lctl_dump:
522 # disconnect one connection
523 def disconnect(self, srv):
524 self.del_uuid(srv.nid_uuid)
525 if srv.net_type in ('tcp',) and not config.lctl_dump:
526 self.del_autoconn(srv.net_type, srv.nid, srv.hostaddr)
528 def del_uuid(self, uuid):
536 def disconnectAll(self, net):
544 def attach(self, type, name, uuid):
547 quit""" % (type, name, uuid)
550 def setup(self, name, setup = ""):
554 quit""" % (name, setup)
558 # create a new device with lctl
559 def newdev(self, type, name, uuid, setup = ""):
560 self.attach(type, name, uuid);
562 self.setup(name, setup)
563 except CommandError, e:
564 self.cleanup(name, uuid, 0)
569 def cleanup(self, name, uuid, force, failover = 0):
570 if failover: force = 1
576 quit""" % (name, ('', 'force')[force],
577 ('', 'failover')[failover])
581 def lov_setup(self, name, uuid, desc_uuid, stripe_cnt,
582 stripe_sz, stripe_off, pattern):
585 lov_setup %s %d %d %d %s
586 quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off, pattern)
589 # add an OBD to a LOV
590 def lov_add_obd(self, name, uuid, obd_uuid, index, gen):
592 lov_modify_tgts add %s %s %s %s
593 quit""" % (name, obd_uuid, index, gen)
597 def lmv_setup(self, name, uuid, desc_uuid, devlist):
601 quit""" % (name, uuid, desc_uuid, devlist)
604 # delete an OBD from a LOV
605 def lov_del_obd(self, name, uuid, obd_uuid, index, gen):
607 lov_modify_tgts del %s %s %s %s
608 quit""" % (name, obd_uuid, index, gen)
612 def deactivate(self, name):
620 def dump(self, dump_file):
623 quit""" % (dump_file)
626 # get list of devices
627 def device_list(self):
628 devices = '/proc/fs/lustre/devices'
630 if os.access(devices, os.R_OK):
632 fp = open(devices, 'r')
640 def lustre_version(self):
641 rc, out = self.runcmd('version')
645 def mount_option(self, profile, osc, mdc):
647 mount_option %s %s %s
648 quit""" % (profile, osc, mdc)
651 # delete mount options
652 def del_mount_option(self, profile):
658 def set_timeout(self, timeout):
664 def set_lustre_upcall(self, upcall):
669 # ============================================================
670 # Various system-level functions
671 # (ideally moved to their own module)
673 # Run a command and return the output and status.
674 # stderr is sent to /dev/null, could use popen3 to
675 # save it if necessary
678 if config.noexec: return (0, [])
679 f = os.popen(cmd + ' 2>&1')
689 cmd = string.join(map(str,args))
692 # Run a command in the background.
693 def run_daemon(*args):
694 cmd = string.join(map(str,args))
696 if config.noexec: return 0
697 f = os.popen(cmd + ' 2>&1')
705 # Determine full path to use for an external command
706 # searches dirname(argv[0]) first, then PATH
708 syspath = string.split(os.environ['PATH'], ':')
709 cmdpath = os.path.dirname(sys.argv[0])
710 syspath.insert(0, cmdpath);
712 syspath.insert(0, os.path.join(config.portals, 'utils/'))
714 prog = os.path.join(d,cmd)
715 if os.access(prog, os.X_OK):
719 # Recursively look for file starting at base dir
720 def do_find_file(base, mod):
721 fullname = os.path.join(base, mod)
722 if os.access(fullname, os.R_OK):
724 for d in os.listdir(base):
725 dir = os.path.join(base,d)
726 if os.path.isdir(dir):
727 module = do_find_file(dir, mod)
731 def find_module(src_dir, dev_dir, modname):
732 modbase = src_dir +'/'+ dev_dir +'/'+ modname
733 for modext in '.ko', '.o':
734 module = modbase + modext
736 if os.access(module, os.R_OK):
742 # is the path a block device?
749 return stat.S_ISBLK(s[stat.ST_MODE])
751 # build fs according to type
753 def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
759 panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
761 # devsize is in 1k, and fs block count is in 4k
762 block_cnt = devsize/4
764 if fstype in ('ext3', 'extN', 'ldiskfs'):
765 # ext3 journal size is in megabytes
768 if not is_block(dev):
769 ret, out = runcmd("ls -l %s" %dev)
770 devsize = int(string.split(out[0])[4]) / 1024
772 ret, out = runcmd("sfdisk -s %s" %dev)
773 devsize = int(out[0])
774 if devsize > 1024 * 1024:
775 jsize = ((devsize / 102400) * 4)
778 if jsize: jopt = "-J size=%d" %(jsize,)
779 if isize: iopt = "-I %d" %(isize,)
780 mkfs = 'mkfs.ext2 -j -b 4096 '
781 if not isblock or config.force:
783 elif fstype == 'reiserfs':
784 # reiserfs journal size is in blocks
785 if jsize: jopt = "--journal_size %d" %(jsize,)
786 mkfs = 'mkreiserfs -ff'
788 panic('unsupported fs type: ', fstype)
790 if config.mkfsoptions != None:
791 mkfs = mkfs + ' ' + config.mkfsoptions
792 if mkfsoptions != None:
793 mkfs = mkfs + ' ' + mkfsoptions
794 (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt)
796 panic("Unable to build fs:", dev, string.join(out))
797 # enable hash tree indexing on fsswe
798 if fstype in ('ext3', 'extN', 'ldiskfs'):
799 htree = 'echo "feature FEATURE_C5" | debugfs -w'
800 (ret, out) = run (htree, dev)
802 panic("Unable to enable htree:", dev)
804 # some systems use /dev/loopN, some /dev/loop/N
808 if not os.access(loop + str(0), os.R_OK):
810 if not os.access(loop + str(0), os.R_OK):
811 panic ("can't access loop devices")
814 # find loop device assigned to the file
815 def find_assigned_loop(file):
817 for n in xrange(0, MAX_LOOP_DEVICES):
819 if os.access(dev, os.R_OK):
820 (stat, out) = run('losetup', dev)
821 if out and stat == 0:
822 m = re.search(r'\((.*)\)', out[0])
823 if m and file == m.group(1):
829 # create file if necessary and assign the first free loop device
830 def init_loop(file, size, fstype, journal_size, inode_size,
831 mkfsoptions, reformat, autoformat, backfstype, backfile):
834 realfstype = backfstype
835 if is_block(backfile):
836 if reformat or (need_format(realfstype, backfile) and autoformat == 'yes'):
837 mkfs(realfile, size, realfstype, journal_size, inode_size, mkfsoptions, isblock=0)
843 dev = find_assigned_loop(realfile)
845 print 'WARNING file:', realfile, 'already mapped to', dev
848 if reformat or not os.access(realfile, os.R_OK | os.W_OK):
850 panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (realfile, size))
851 (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, realfile))
853 panic("Unable to create backing store:", realfile)
855 mkfs(realfile, size, realfstype, journal_size, inode_size,
856 mkfsoptions, isblock=0)
859 # find next free loop
860 for n in xrange(0, MAX_LOOP_DEVICES):
862 if os.access(dev, os.R_OK):
863 (stat, out) = run('losetup', dev)
865 run('losetup', dev, realfile)
868 print "out of loop devices"
870 print "out of loop devices"
873 # undo loop assignment
874 def clean_loop(file):
875 dev = find_assigned_loop(file)
877 ret, out = run('losetup -d', dev)
879 log('unable to clean loop device:', dev, 'for file:', file)
882 # determine if dev is formatted as a <fstype> filesystem
883 def need_format(fstype, dev):
884 # FIXME don't know how to implement this
887 # initialize a block device if needed
888 def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
889 inode_size, mkfsoptions, backfstype, backdev):
893 if fstype == 'smfs' or not is_block(dev):
894 dev = init_loop(dev, size, fstype, journal_size, inode_size,
895 mkfsoptions, reformat, autoformat, backfstype, backdev)
896 elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
897 mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
900 # panic("device:", dev,
901 # "not prepared, and autoformat is not set.\n",
902 # "Rerun with --reformat option to format ALL filesystems")
907 """lookup IP address for an interface"""
908 rc, out = run("/sbin/ifconfig", iface)
911 addr = string.split(out[1])[1]
912 ip = string.split(addr, ':')[1]
def def_mount_options(fstype, target):
    """Return default mount options for the given fstype and target.

    fstype -- backing filesystem type ('ext3', 'ldiskfs', ...)
    target -- service type the device backs: 'mds' or 'ost'
    """
    if fstype == 'ext3' or fstype == 'ldiskfs':
        mountfsoptions = "errors=remount-ro"
        # 2.4-series kernels want asyncdel on OSTs
        if target == 'ost' and sys_get_branch() == '2.4':
            mountfsoptions = "%s,asyncdel" % (mountfsoptions)
        return mountfsoptions
    # no special default options for other filesystem types
    return ""
924 def sys_get_elan_position_file():
925 procfiles = ["/proc/elan/device0/position",
926 "/proc/qsnet/elan4/device0/position",
927 "/proc/qsnet/elan3/device0/position"]
929 if os.access(p, os.R_OK):
933 def sys_get_local_nid(net_type, wildcard, cluster_id):
934 """Return the local nid."""
936 if sys_get_elan_position_file():
937 local = sys_get_local_address('elan', '*', cluster_id)
939 local = sys_get_local_address(net_type, wildcard, cluster_id)
942 def sys_get_local_address(net_type, wildcard, cluster_id):
943 """Return the local address for the network type."""
945 if net_type in ('tcp',):
947 iface, star = string.split(wildcard, ':')
948 local = if2addr(iface)
950 panic ("unable to determine ip for:", wildcard)
952 host = socket.gethostname()
953 local = socket.gethostbyname(host)
954 elif net_type == 'elan':
955 # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()'
956 f = sys_get_elan_position_file()
958 panic ("unable to determine local Elan ID")
961 lines = fp.readlines()
969 nid = my_int(cluster_id) + my_int(elan_id)
971 except ValueError, e:
975 elif net_type == 'gm':
976 fixme("automatic local address for GM")
980 def sys_get_branch():
981 """Returns kernel release"""
983 fp = open('/proc/sys/kernel/osrelease')
984 lines = fp.readlines()
988 version = string.split(l)
989 a = string.split(version[0], '.')
990 return a[0] + '.' + a[1]
996 def mod_loaded(modname):
997 """Check if a module is already loaded. Look in /proc/modules for it."""
999 fp = open('/proc/modules')
1000 lines = fp.readlines()
1002 # please forgive my tired fingers for this one
1003 ret = filter(lambda word, mod=modname: word == mod,
1004 map(lambda line: string.split(line)[0], lines))
1006 except Exception, e:
1009 # XXX: instead of device_list, ask for $name and see what we get
1010 def is_prepared(name):
1011 """Return true if a device exists for the name"""
1012 if config.lctl_dump:
1014 if (config.noexec or config.record) and config.cleanup:
1017 # expect this format:
1018 # 1 UP ldlm ldlm ldlm_UUID 2
1019 out = lctl.device_list()
1021 if name == string.split(s)[3]:
1023 except CommandError, e:
1027 def is_network_prepared():
1028 """If the any device exists, then assume that all networking
1029 has been configured"""
1030 out = lctl.device_list()
1033 def fs_is_mounted(path):
1034 """Return true if path is a mounted lustre filesystem"""
1036 fp = open('/proc/mounts')
1037 lines = fp.readlines()
1041 if a[1] == path and a[2] == 'lustre_lite':
1049 """Manage kernel modules"""
def __init__(self, lustre_dir, portals_dir):
    """Remember the lustre and portals source trees and start with
    an empty list of kernel modules to load."""
    self.lustre_dir = lustre_dir
    self.portals_dir = portals_dir
    # list of (src_tree, dev_dir, module_name) tuples, in load order
    self.kmodule_list = []
def add_portals_module(self, dev_dir, modname):
    """Append a portals-tree kernel module to the load list."""
    self.kmodule_list.append((self.portals_dir, dev_dir, modname))
def add_lustre_module(self, dev_dir, modname):
    """Append a lustre-tree kernel module to the load list."""
    self.kmodule_list.append((self.lustre_dir, dev_dir, modname))
1063 def load_module(self):
1064 """Load all the modules in the list in the order they appear."""
1065 for src_dir, dev_dir, mod in self.kmodule_list:
1066 if mod_loaded(mod) and not config.noexec:
1068 log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
1070 module = find_module(src_dir, dev_dir, mod)
1072 panic('module not found:', mod)
1073 (rc, out) = run('/sbin/insmod', module)
1075 raise CommandError('insmod', out, rc)
1077 (rc, out) = run('/sbin/modprobe', mod)
1079 raise CommandError('modprobe', out, rc)
1081 def cleanup_module(self):
1082 """Unload the modules in the list in reverse order."""
1083 rev = self.kmodule_list
1085 for src_dir, dev_dir, mod in rev:
1086 if not mod_loaded(mod) and not config.noexec:
1089 if mod == 'portals' and config.dump:
1090 lctl.dump(config.dump)
1091 log('unloading module:', mod)
1092 (rc, out) = run('/sbin/rmmod', mod)
1094 log('! unable to unload module:', mod)
1097 # ============================================================
1098 # Classes to prepare and cleanup the various objects
1101 """ Base class for the rest of the modules. The default cleanup method is
1102 defined here, as well as some utilitiy funcs.
1104 def __init__(self, module_name, db):
1106 self.module_name = module_name
1107 self.name = self.db.getName()
1108 self.uuid = self.db.getUUID()
1111 self.kmod = kmod(config.lustre, config.portals)
def info(self, *args):
    """Print a status line tagged with this module's type, name, and uuid.

    Uses a single-string print() call so the statement is valid under
    both Python 2 and Python 3; output is identical to the original
    multi-argument print statement.
    """
    msg = ' '.join(map(str, args))
    print("%s: %s %s %s" % (self.module_name, self.name, self.uuid, msg))
1118 """ default cleanup, used for most modules """
1121 lctl.cleanup(self.name, self.uuid, config.force)
1122 except CommandError, e:
1123 log(self.module_name, "cleanup failed: ", self.name)
def add_portals_module(self, dev_dir, modname):
    """Queue a portals-tree kernel module via this module's kmod manager."""
    self.kmod.add_portals_module(dev_dir, modname)
def add_lustre_module(self, dev_dir, modname):
    """Queue a lustre-tree kernel module via this module's kmod manager."""
    self.kmod.add_lustre_module(dev_dir, modname)
def load_module(self):
    """Load all queued kernel modules, in the order they were added."""
    self.kmod.load_module()
def cleanup_module(self):
    """Unload the queued kernel modules in reverse order, but only
    when this module reports it is safe to do so."""
    if self.safe_to_clean():
        self.kmod.cleanup_module()
1144 def safe_to_clean(self):
def safe_to_clean_modules(self):
    """Module unloading is safe exactly when general cleanup is safe."""
    return self.safe_to_clean()
1150 class Network(Module):
1151 def __init__(self,db):
1152 Module.__init__(self, 'NETWORK', db)
1153 self.net_type = self.db.get_val('nettype')
1154 self.nid = self.db.get_val('nid', '*')
1155 self.cluster_id = self.db.get_val('clusterid', "0")
1156 self.port = self.db.get_val_int('port', 0)
1157 self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
1158 self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
1159 self.irq_affinity = self.db.get_val_int('irqaffinity', 0)
1162 self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
1164 panic("unable to set nid for", self.net_type, self.nid, cluster_id)
1165 self.generic_nid = 1
1166 debug("nid:", self.nid)
1168 self.generic_nid = 0
1170 self.nid_uuid = self.nid_to_uuid(self.nid)
1172 self.hostaddr = self.db.get_val('hostaddr', self.nid)
1173 if '*' in self.hostaddr:
1174 self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id)
1175 if not self.hostaddr:
1176 panic("unable to set hostaddr for", self.net_type, self.hostaddr, self.cluster_id)
1177 debug("hostaddr:", self.hostaddr)
1179 self.add_portals_module("libcfs", 'libcfs')
1180 self.add_portals_module("portals", 'portals')
1181 if node_needs_router():
1182 self.add_portals_module("router", 'kptlrouter')
1183 if self.net_type == 'tcp':
1184 self.add_portals_module("knals/socknal", 'ksocknal')
1185 if self.net_type == 'elan':
1186 self.add_portals_module("knals/qswnal", 'kqswnal')
1187 if self.net_type == 'gm':
1188 self.add_portals_module("knals/gmnal", 'kgmnal')
def nid_to_uuid(self, nid):
    """Derive the canonical UUID string used to register `nid` with lctl."""
    return "NID_%s_UUID" % (nid,)
1194 if not config.record and is_network_prepared():
1196 self.info(self.net_type, self.nid, self.port)
1197 if not (config.record and self.generic_nid):
1198 lctl.network(self.net_type, self.nid)
1199 if self.net_type == 'tcp':
1201 if self.net_type == 'elan':
1203 if self.port and node_is_router():
1204 run_one_acceptor(self.port)
1205 self.connect_peer_gateways()
1207 def connect_peer_gateways(self):
1208 for router in self.db.lookup_class('node'):
1209 if router.get_val_int('router', 0):
1210 for netuuid in router.get_networks():
1211 net = self.db.lookup(netuuid)
1213 if (gw.cluster_id == self.cluster_id and
1214 gw.net_type == self.net_type):
1215 if gw.nid != self.nid:
1218 def disconnect_peer_gateways(self):
1219 for router in self.db.lookup_class('node'):
1220 if router.get_val_int('router', 0):
1221 for netuuid in router.get_networks():
1222 net = self.db.lookup(netuuid)
1224 if (gw.cluster_id == self.cluster_id and
1225 gw.net_type == self.net_type):
1226 if gw.nid != self.nid:
1229 except CommandError, e:
1230 print "disconnect failed: ", self.name
1234 def safe_to_clean(self):
1235 return not is_network_prepared()
1238 self.info(self.net_type, self.nid, self.port)
1240 stop_acceptor(self.port)
1241 if node_is_router():
1242 self.disconnect_peer_gateways()
1244 def correct_level(self, level, op=None):
1247 class RouteTable(Module):
def __init__(self, db):
    """A RouteTable is a plain Module tagged 'ROUTES' over the given db."""
    Module.__init__(self, 'ROUTES', db)
1251 def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
1253 # only setup connections for tcp NALs
1255 if not net_type in ('tcp',):
1258 # connect to target if route is to single node and this node is the gw
1259 if lo == hi and local_interface(net_type, gw_cluster_id, gw):
1260 if not local_cluster(net_type, tgt_cluster_id):
1261 panic("target", lo, " not on the local cluster")
1262 srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
1263 # connect to gateway if this node is not the gw
1264 elif (local_cluster(net_type, gw_cluster_id)
1265 and not local_interface(net_type, gw_cluster_id, gw)):
1266 srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
1271 panic("no server for nid", lo)
1274 return Network(srvdb)
1277 if not config.record and is_network_prepared():
1280 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1281 lctl.add_route(net_type, gw, lo, hi)
1282 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1286 def safe_to_clean(self):
1287 return not is_network_prepared()
1290 if is_network_prepared():
1291 # the network is still being used, don't clean it up
1293 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1294 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1297 lctl.disconnect(srv)
1298 except CommandError, e:
1299 print "disconnect failed: ", self.name
1304 lctl.del_route(net_type, gw, lo, hi)
1305 except CommandError, e:
1306 print "del_route failed: ", self.name
1310 class Management(Module):
def __init__(self, db):
    """Set up the MGMT service module and queue its kernel modules.

    The module order matters: lvfs and obdclass must load before
    ptlrpc, which must load before the management service itself.
    """
    Module.__init__(self, 'MGMT', db)
    for subdir, mod in (('lvfs', 'lvfs'),
                        ('obdclass', 'obdclass'),
                        ('ptlrpc', 'ptlrpc'),
                        ('mgmt', 'mgmt_svc')):
        self.add_lustre_module(subdir, mod)
1319 if not config.record and is_prepared(self.name):
1322 lctl.newdev("mgmt", self.name, self.uuid)
1324 def safe_to_clean(self):
1328 if is_prepared(self.name):
1329 Module.cleanup(self)
1331 def correct_level(self, level, op=None):
1334 # This is only needed to load the modules; the LDLM device
1335 # is now created automatically.
def __init__(self, db):
    """Queue the core lustre stack modules (load order matters);
    the LDLM device itself is created automatically."""
    Module.__init__(self, 'LDLM', db)
    for subdir, mod in (('lvfs', 'lvfs'),
                        ('obdclass', 'obdclass'),
                        ('ptlrpc', 'ptlrpc')):
        self.add_lustre_module(subdir, mod)
1349 def correct_level(self, level, op=None):
1354 def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
1355 Module.__init__(self, 'LOV', db)
1356 if name_override != None:
1357 self.name = "lov_%s" % name_override
1358 self.add_lustre_module('lov', 'lov')
1359 self.mds_uuid = self.db.get_first_ref('mds')
1360 self.stripe_sz = self.db.get_val_int('stripesize', 1048576)
1361 self.stripe_off = self.db.get_val_int('stripeoffset', 0)
1362 self.pattern = self.db.get_val_int('stripepattern', 0)
1363 self.devlist = self.db.get_lov_tgts('lov_tgt')
1364 self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
1366 self.desc_uuid = self.uuid
1367 self.uuid = generate_client_uuid(self.name)
1368 self.fs_name = fs_name
1370 self.config_only = 1
1372 self.config_only = None
1373 mds = self.db.lookup(self.mds_uuid)
1374 self.mds_name = mds.getName()
1375 for (obd_uuid, index, gen, active) in self.devlist:
1378 obd = self.db.lookup(obd_uuid)
1379 osc = get_osc(obd, self.uuid, fs_name)
1381 self.osclist.append((osc, index, gen, active))
1383 panic('osc not found:', obd_uuid)
1389 if not config.record and is_prepared(self.name):
1391 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
1392 self.stripe_off, self.pattern, self.devlist,
1394 lctl.lov_setup(self.name, self.uuid, self.desc_uuid, self.stripe_cnt,
1395 self.stripe_sz, self.stripe_off, self.pattern)
1396 for (osc, index, gen, active) in self.osclist:
1397 target_uuid = osc.target_uuid
1399 # Only ignore connect failures with --force, which
1400 # isn't implemented here yet.
1402 osc.prepare(ignore_connect_failure=0)
1403 except CommandError, e:
1404 print "Error preparing OSC %s\n" % osc.uuid
1406 lctl.lov_add_obd(self.name, self.uuid, target_uuid, index, gen)
1409 for (osc, index, gen, active) in self.osclist:
1410 target_uuid = osc.target_uuid
1412 if is_prepared(self.name):
1413 Module.cleanup(self)
1414 if self.config_only:
1415 panic("Can't clean up config_only LOV ", self.name)
1417 def load_module(self):
1418 if self.config_only:
1419 panic("Can't load modules for config_only LOV ", self.name)
1420 for (osc, index, gen, active) in self.osclist:
1423 Module.load_module(self)
1425 def cleanup_module(self):
1426 if self.config_only:
1427 panic("Can't cleanup modules for config_only LOV ", self.name)
1428 Module.cleanup_module(self)
1429 for (osc, index, gen, active) in self.osclist:
1431 osc.cleanup_module()
1434 def correct_level(self, level, op=None):
1438 def __init__(self, db, uuid, fs_name, name_override = None):
1439 Module.__init__(self, 'LMV', db)
1440 if name_override != None:
1441 self.name = "lmv_%s" % name_override
1442 self.add_lustre_module('lmv', 'lmv')
1443 self.devlist = self.db.get_refs('mds')
1445 self.desc_uuid = self.uuid
1447 self.fs_name = fs_name
1448 for mds_uuid in self.devlist:
1449 mds = self.db.lookup(mds_uuid)
1451 panic("MDS not found!")
1452 mdc = MDC(mds, self.uuid, fs_name)
1454 self.mdclist.append(mdc)
1456 panic('mdc not found:', mds_uuid)
1459 if is_prepared(self.name):
1461 for mdc in self.mdclist:
1463 # Only ignore connect failures with --force, which
1464 # isn't implemented here yet.
1465 mdc.prepare(ignore_connect_failure=0)
1466 except CommandError, e:
1467 print "Error preparing LMV %s\n" % mdc.uuid
1469 lctl.lmv_setup(self.name, self.uuid, self.desc_uuid,
1470 string.join(self.devlist))
1473 for mdc in self.mdclist:
1475 if is_prepared(self.name):
1476 Module.cleanup(self)
1478 def load_module(self):
1479 for mdc in self.mdclist:
1482 Module.load_module(self)
1484 def cleanup_module(self):
1485 Module.cleanup_module(self)
1486 for mdc in self.mdclist:
1487 mdc.cleanup_module()
1490 def correct_level(self, level, op=None):
1493 class MDSDEV(Module):
    def __init__(self,db):
        """Configure the MDS device described by the config-db node *db*.

        Reads device geometry and filesystem options from the db, renames
        this module after the MDS target it backs, derives the inode size
        from the default LOV stripe count, and queues the kernel modules
        required for the chosen fstype.
        """
        Module.__init__(self, 'MDSDEV', db)
        self.devpath = self.db.get_val('devpath','')
        self.backdevpath = self.db.get_val('backdevpath','')
        self.size = self.db.get_val_int('devsize', 0)
        self.journal_size = self.db.get_val_int('journalsize', 0)
        self.fstype = self.db.get_val('fstype', '')
        self.backfstype = self.db.get_val('backfstype', '')
        self.nspath = self.db.get_val('nspath', '')
        self.mkfsoptions = self.db.get_val('mkfsoptions', '')
        self.mountfsoptions = self.db.get_val('mountfsoptions', '')
        self.root_squash = self.db.get_val('root_squash', '')
        self.no_root_squash = self.db.get_val('no_root_squash', '')
        self.cachetype = self.db.get_val('cachetype', '')
        # overwrite the original MDSDEV name and uuid with the MDS name and uuid
        target_uuid = self.db.get_first_ref('target')
        mds = self.db.lookup(target_uuid)
        self.name = mds.getName()
        self.filesystem_uuids = mds.get_refs('filesystem')
        self.master_mds = ""
        # no filesystems directly on the MDS means it sits behind an LMV:
        # inherit the filesystem list (and the master uuid) from the LMV
        if not self.filesystem_uuids:
            self.lmv_uuid = self.db.get_first_ref('lmv')
            if not self.lmv_uuid:
                panic("ALERT: can't find lvm uuid")
            self.lmv = self.db.lookup(self.lmv_uuid)
            self.filesystem_uuids = self.lmv.get_refs('filesystem')
            self.master_mds = self.lmv_uuid
        # FIXME: if fstype not set, then determine based on kernel version
        self.format = self.db.get_val('autoformat', "no")
        if mds.get_val('failover', 0):
            self.failover_mds = 'f'
        # NOTE(review): the `else:` line is missing from this extract
            self.failover_mds = 'n'
        active_uuid = get_active_target(mds)
        # NOTE(review): guard (`if not active_uuid:`) missing from extract
            panic("No target device found:", target_uuid)
        if active_uuid == self.uuid:
        # NOTE(review): the active-flag assignments are missing here
        if self.active and config.group and config.group != mds.get_val('group'):
        self.inode_size = self.db.get_val_int('inodesize', 0)
        if self.inode_size == 0:
            # find the LOV for this MDS
            lovconfig_uuid = mds.get_first_ref('lovconfig')
            # NOTE(review): several guard/else lines are missing from the
            # lovconfig resolution below — confirm structure in full source
            if not lovconfig_uuid:
                if not self.lmv_uuid:
                    panic("No LOV found for lovconfig ", lovconfig.name)
                panic("No LMV initialized and not lovconfig_uuid found")
                lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
                lovconfig = self.lmv.lookup(lovconfig_uuid)
                lov_uuid = lovconfig.get_first_ref('lov')
                panic("No LOV found for lovconfig ", lovconfig.name)
            lovconfig = mds.lookup(lovconfig_uuid)
            lov_uuid = lovconfig.get_first_ref('lov')
            panic("No LOV found for lovconfig ", lovconfig.name)
            lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
            lovconfig = self.lmv.lookup(lovconfig_uuid)
            lov_uuid = lovconfig.get_first_ref('lov')
            # config_only LOV: only used to read stripe parameters, no setup
            lov = LOV(self.db.lookup(lov_uuid), lov_uuid, 'FS_name', config_only = 1)
            # default stripe count controls default inode_size
            stripe_count = lov.stripe_cnt
            # wider stripes need larger inodes to hold the striping EA
            if stripe_count > 77:
                self.inode_size = 4096
            elif stripe_count > 35:
                self.inode_size = 2048
            elif stripe_count > 13:
                self.inode_size = 1024
            elif stripe_count > 3:
                self.inode_size = 512
            # NOTE(review): `else:` line missing from extract
                self.inode_size = 256
        # from here on this object speaks for the MDS target itself
        self.target_dev_uuid = self.uuid
        self.uuid = target_uuid
        # NOTE(review): enclosing guard (likely `if self.master_mds:`) is
        # missing; the second assignment deliberately overrides the first
        client_uuid = generate_client_uuid(self.name)
        client_uuid = self.name + "_lmv_" + "UUID"
        self.master = LMV(self.db.lookup(self.lmv_uuid), client_uuid, self.name, self.name)
        self.master_mds = self.master.name
        # kernel modules an MDS node may need
        self.add_lustre_module('mdc', 'mdc')
        self.add_lustre_module('osc', 'osc')
        self.add_lustre_module('lov', 'lov')
        self.add_lustre_module('lmv', 'lmv')
        self.add_lustre_module('ost', 'ost')
        self.add_lustre_module('mds', 'mds')
        if self.fstype == 'smfs':
            self.add_lustre_module('smfs', 'smfs')
        if self.fstype == 'ldiskfs':
            self.add_lustre_module('ldiskfs', 'ldiskfs')
        self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
        # if fstype is smfs, then we should also take care about backing
        if self.fstype == 'smfs':
            self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.backfstype))
        for options in string.split(self.mountfsoptions, ','):
            if options == 'snap':
                if not self.fstype == 'smfs':
                    panic("mountoptions with snap, but fstype is not smfs\n")
                self.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.fstype))
                self.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.backfstype))
    def load_module(self):
        """Load the MDS-side kernel modules via the Module base class.
        NOTE(review): a guard line (likely `if self.active:`) is missing
        from this extract."""
        Module.load_module(self)
1625 if not config.record and is_prepared(self.name):
1628 debug(self.uuid, "not active")
1631 # run write_conf automatically, if --reformat used
1633 self.info(self.devpath, self.fstype, self.size, self.format)
1637 self.master.prepare()
1638 # never reformat here
1639 blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
1640 self.format, self.journal_size, self.inode_size,
1641 self.mkfsoptions, self.backfstype, self.backdevpath)
1643 if not is_prepared('MDT'):
1644 lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
1646 mountfsoptions = def_mount_options(self.fstype, 'mds')
1648 if config.mountfsoptions:
1650 mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
1652 mountfsoptions = config.mountfsoptions
1653 if self.mountfsoptions:
1654 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1656 if self.mountfsoptions:
1658 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1660 mountfsoptions = self.mountfsoptions
1662 if self.fstype == 'smfs':
1663 realdev = self.fstype
1666 mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
1670 mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
1675 print 'MDS mount options: ' + mountfsoptions
1677 if not self.master_mds:
1678 self.master_mds = 'dumb'
1679 if not self.cachetype:
1680 self.cachetype = 'dumb'
1681 lctl.newdev("mds", self.name, self.uuid,
1682 setup ="%s %s %s %s %s %s" %(realdev, self.fstype,
1683 self.name, mountfsoptions,
1684 self.master_mds, self.cachetype))
1686 if development_mode():
1687 procentry = "/proc/fs/lustre/mds/grp_hash_upcall"
1688 upcall = os.path.abspath(os.path.dirname(sys.argv[0]) + "/l_getgroups")
1689 if not (os.access(procentry, os.R_OK) and os.access(upcall, os.R_OK)):
1690 print "MDS Warning: failed to set group-hash upcall"
1692 run("echo ", upcall, " > ", procentry)
1694 except CommandError, e:
1696 panic("MDS is missing the config log. Need to run " +
1697 "lconf --write_conf.")
1701 if config.root_squash == None:
1702 config.root_squash = self.root_squash
1703 if config.no_root_squash == None:
1704 config.no_root_squash = self.no_root_squash
1705 if config.root_squash:
1706 if config.no_root_squash:
1707 nsnid = config.no_root_squash
1710 lctl.root_squash(self.name, config.root_squash, nsnid)
    def write_conf(self):
        """Mount the MDS device and record per-filesystem startup/cleanup
        logs on it, then generate per-client record logs by re-running
        lconf against each client node profile."""
        if not is_prepared(self.name):
            self.info(self.devpath, self.fstype, self.format)
            blkdev = block_dev(self.devpath, self.size, self.fstype,
                               config.reformat, self.format, self.journal_size,
                               self.inode_size, self.mkfsoptions,
                               self.backfstype, self.backdevpath)
            # Even for writing logs we mount mds with supplied mount options
            # because it will not mount smfs (if used) otherwise.
            mountfsoptions = def_mount_options(self.fstype, 'mds')
            # NOTE(review): several `if`/`else:` guard lines are missing from
            # the option-merging below — confirm structure in full source
            if config.mountfsoptions:
                    mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
                    mountfsoptions = config.mountfsoptions
                if self.mountfsoptions:
                    mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
            if self.mountfsoptions:
                    mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
                    mountfsoptions = self.mountfsoptions
            # smfs is stacked: pass the backing fs type/device along
            if self.fstype == 'smfs':
                realdev = self.fstype
                mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
                mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
            print 'MDS mount options: ' + mountfsoptions
            # As mount options are passed by 4th param to config tool, we need
            # to pass something in 3rd param. But we do not want this 3rd param
            # be counted as a profile name for reading log on MDS setup, thus,
            # we pass there some predefined sign like 'dumb', which will be
            # checked in MDS code and skipped. Probably there is more nice way
            # like pass empty string and check it in config tool and pass null
            lctl.newdev("mds", self.name, self.uuid,
                        setup ="%s %s %s %s" %(realdev, self.fstype,
                                               'dumb', mountfsoptions))
        # record logs for the MDS lov
        for uuid in self.filesystem_uuids:
            log("recording clients for filesystem:", uuid)
            fs = self.db.lookup(uuid)
            # this is ugly, should be organized nice later.
            target_uuid = self.db.get_first_ref('target')
            mds = self.db.lookup(target_uuid)
            lovconfig_uuid = mds.get_first_ref('lovconfig')
            # prefer the LOV referenced by the lovconfig, falling back to
            # the filesystem's own obd reference
            lovconfig = mds.lookup(lovconfig_uuid)
            obd_uuid = lovconfig.get_first_ref('lov')
            obd_uuid = fs.get_first_ref('obd')
            client_uuid = generate_client_uuid(self.name)
            client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
            # startup log: what a client must do to mount this filesystem
            lctl.clear_log(self.name, self.name)
            lctl.record(self.name, self.name)
            lctl.mount_option(self.name, client.get_name(), "")
            process_updates(self.db, self.name, self.name, client)
            # cleanup log: mirror image of the startup log
            lctl.clear_log(self.name, self.name + '-clean')
            lctl.record(self.name, self.name + '-clean')
            lctl.del_mount_option(self.name)
            process_updates(self.db, self.name, self.name + '-clean', client)
        # record logs for each client
        # NOTE(review): the guard selecting ldap vs. file config is missing
        config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
        config_options = CONFIG_FILE
        for node_db in self.db.lookup_class('node'):
            client_name = node_db.getName()
            for prof_uuid in node_db.get_refs('profile'):
                prof_db = node_db.lookup(prof_uuid)
                # refactor this into a function to test "clientness"
                for ref_class, ref_uuid in prof_db.get_all_refs():
                    if ref_class in ('mountpoint','echoclient'):
                        debug("recording", client_name)
                        old_noexec = config.noexec
                        # re-run lconf against ourselves to record this
                        # client's startup log on the MDS device
                        ret, out = run (sys.argv[0], noexec_opt,
                                        " -v --record --nomod",
                                        "--record_log", client_name,
                                        "--record_device", self.name,
                                        "--node", client_name,
                        for s in out: log("record> ", string.strip(s))
                        # and the matching cleanup log
                        ret, out = run (sys.argv[0], noexec_opt,
                                        "--cleanup -v --record --nomod",
                                        "--record_log", client_name + "-clean",
                                        "--record_device", self.name,
                                        "--node", client_name,
                        for s in out: log("record> ", string.strip(s))
                        config.noexec = old_noexec
1845 lctl.cleanup(self.name, self.uuid, 0, 0)
1846 except CommandError, e:
1847 log(self.module_name, "cleanup failed: ", self.name)
1850 Module.cleanup(self)
1852 if self.fstype == 'smfs':
1853 clean_loop(self.backdevpath)
1855 clean_loop(self.devpath)
    def msd_remaining(self):
        """True while lctl still lists any configured 'mds' device.
        NOTE(review): the loop header and return statements are missing
        from this extract."""
        out = lctl.device_list()
        if string.split(s)[2] in ('mds',):
1863 def safe_to_clean(self):
1866 def safe_to_clean_modules(self):
1867 return not self.msd_remaining()
1871 debug(self.uuid, "not active")
1874 if is_prepared(self.name):
1876 lctl.cleanup(self.name, self.uuid, config.force,
1878 except CommandError, e:
1879 log(self.module_name, "cleanup failed: ", self.name)
1882 Module.cleanup(self)
1885 self.master.cleanup()
1886 if not self.msd_remaining() and is_prepared('MDT'):
1888 lctl.cleanup("MDT", "MDT_UUID", config.force,
1890 except CommandError, e:
1891 print "cleanup failed: ", self.name
1895 if self.fstype == 'smfs':
1896 clean_loop(self.backdevpath)
1898 clean_loop(self.devpath)
1900 def correct_level(self, level, op=None):
1901 #if self.master_mds:
    def __init__(self, db):
        """Configure an OST device (obdfilter or obdecho) from config-db node *db*."""
        Module.__init__(self, 'OSD', db)
        self.osdtype = self.db.get_val('osdtype')
        self.devpath = self.db.get_val('devpath', '')
        self.backdevpath = self.db.get_val('backdevpath', '')
        self.size = self.db.get_val_int('devsize', 0)
        self.journal_size = self.db.get_val_int('journalsize', 0)
        self.inode_size = self.db.get_val_int('inodesize', 0)
        self.mkfsoptions = self.db.get_val('mkfsoptions', '')
        self.mountfsoptions = self.db.get_val('mountfsoptions', '')
        self.fstype = self.db.get_val('fstype', '')
        self.backfstype = self.db.get_val('backfstype', '')
        self.nspath = self.db.get_val('nspath', '')
        # rename this module after the OST target it backs
        target_uuid = self.db.get_first_ref('target')
        ost = self.db.lookup(target_uuid)
        self.name = ost.getName()
        self.format = self.db.get_val('autoformat', 'yes')
        if ost.get_val('failover', 0):
            self.failover_ost = 'f'
        # NOTE(review): `else:` line missing from this extract
            self.failover_ost = 'n'
        active_uuid = get_active_target(ost)
        # NOTE(review): guard (`if not active_uuid:`) missing from extract
            panic("No target device found:", target_uuid)
        if active_uuid == self.uuid:
        # NOTE(review): active-flag assignments missing from extract
        if self.active and config.group and config.group != ost.get_val('group'):
        self.target_dev_uuid = self.uuid
        self.uuid = target_uuid
        self.add_lustre_module('ost', 'ost')
        if self.fstype == 'smfs':
            self.add_lustre_module('smfs', 'smfs')
        # FIXME: should we default to ext3 here?
        if self.fstype == 'ldiskfs':
            self.add_lustre_module('ldiskfs', 'ldiskfs')
        self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
        if self.fstype == 'smfs':
            self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.backfstype))
        # NOTE(review): this iterates the option *string* character by
        # character; the MDSDEV equivalent splits on ',' first
        # (string.split(self.mountfsoptions, ',')), so the 'snap'
        # comparison here can never match a full word — looks like a bug,
        # confirm against the full source before relying on snap support
        for options in self.mountfsoptions:
            if options == 'snap':
                if not self.fstype == 'smfs':
                    panic("mountoptions with snap, but fstype is not smfs\n")
                self.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.fstype))
                self.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.backfstype))
        self.add_lustre_module(self.osdtype, self.osdtype)
    def load_module(self):
        """Load this OSD's kernel modules via the Module base class.
        NOTE(review): a guard line (likely `if self.active:`) is missing
        from this extract."""
        Module.load_module(self)
1965 # need to check /proc/mounts and /etc/mtab before
1966 # formatting anything.
1967 # FIXME: check if device is already formatted.
1969 if is_prepared(self.name):
1972 debug(self.uuid, "not active")
1974 self.info(self.osdtype, self.devpath, self.size, self.fstype,
1975 self.format, self.journal_size, self.inode_size)
1977 if self.osdtype == 'obdecho':
1980 blkdev = block_dev(self.devpath, self.size, self.fstype,
1981 config.reformat, self.format, self.journal_size,
1982 self.inode_size, self.mkfsoptions, self.backfstype,
1985 mountfsoptions = def_mount_options(self.fstype, 'ost')
1987 if config.mountfsoptions:
1989 mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
1991 mountfsoptions = config.mountfsoptions
1992 if self.mountfsoptions:
1993 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1995 if self.mountfsoptions:
1997 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1999 mountfsoptions = self.mountfsoptions
2001 if self.fstype == 'smfs':
2002 realdev = self.fstype
2005 mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
2009 mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
2014 print 'OSD mount options: ' + mountfsoptions
2016 lctl.newdev(self.osdtype, self.name, self.uuid,
2017 setup ="%s %s %s %s" %(realdev, self.fstype,
2020 if not is_prepared('OSS'):
2021 lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
    def osd_remaining(self):
        """True while lctl still lists any obdfilter/obdecho device.
        NOTE(review): the loop header and return statements are missing
        from this extract."""
        out = lctl.device_list()
        if string.split(s)[2] in ('obdfilter', 'obdecho'):
2029 def safe_to_clean(self):
2032 def safe_to_clean_modules(self):
2033 return not self.osd_remaining()
2037 debug(self.uuid, "not active")
2039 if is_prepared(self.name):
2042 lctl.cleanup(self.name, self.uuid, config.force,
2044 except CommandError, e:
2045 log(self.module_name, "cleanup failed: ", self.name)
2048 if not self.osd_remaining() and is_prepared('OSS'):
2050 lctl.cleanup("OSS", "OSS_UUID", config.force,
2052 except CommandError, e:
2053 print "cleanup failed: ", self.name
2056 if not self.osdtype == 'obdecho':
2057 if self.fstype == 'smfs':
2058 clean_loop(self.backdevpath)
2060 clean_loop(self.devpath)
2062 def correct_level(self, level, op=None):
def mgmt_uuid_for_fs(mtpt_name):
    """Return the mgmt-service uuid for the filesystem mounted at *mtpt_name*.
    NOTE(review): existence-guard lines are missing from this extract."""
    mtpt_db = toplustreDB.lookup_name(mtpt_name)
    fs_uuid = mtpt_db.get_first_ref('filesystem')
    fs = toplustreDB.lookup(fs_uuid)
    return fs.get_first_ref('mgmt')
2075 # Generic client module, used by OSC and MDC
2076 class Client(Module):
    def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
        # NOTE(review): the signature continuation line (module_dir param)
        # is missing from this extract
        """Generic client device, shared by OSC and MDC.

        tgtdb     -- config-db node of the target (OST/MDS) to connect to
        uuid      -- uuid for this client instance
        module    -- device class / kernel module name ('osc', 'mdc', ...)
        fs_name   -- filesystem this client belongs to
        self_name -- optional explicit device name override
        """
        self.target_name = tgtdb.getName()
        self.target_uuid = tgtdb.getUUID()
        # connect to whichever failover target is currently active
        self.tgt_dev_uuid = get_active_target(tgtdb)
        if not self.tgt_dev_uuid:
            panic("No target device found for target(1):", self.target_name)
        self.kmod = kmod(config.lustre, config.portals)
        self.module = module
        self.module_name = string.upper(module)
        # default device name encodes module, local host, target and fs
        self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
                                     self.target_name, fs_name)
        # NOTE(review): guard (`if self_name:`) missing from extract
        self.name = self_name
        self.lookup_server(self.tgt_dev_uuid)
        mgmt_uuid = mgmt_uuid_for_fs(fs_name)
        self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
        self.fs_name = fs_name
        self.add_lustre_module(module_dir, module)
2111 def lookup_server(self, srv_uuid):
2112 """ Lookup a server's network information """
2113 self._server_nets = get_ost_net(self.db, srv_uuid)
2114 if len(self._server_nets) == 0:
2115 panic ("Unable to find a server for:", srv_uuid)
2118 def get_servers(self):
2119 return self._server_nets
    def prepare(self, ignore_connect_failure = 0):
        """Connect to the server and create this client device via lctl.

        ignore_connect_failure -- when true, CommandError from the connect
        phase is swallowed (used by force/cleanup paths).
        """
        self.info(self.target_uuid)
        if not config.record and is_prepared(self.name):
        srv = choose_local_server(self.get_servers())
        # no server on a local cluster: reach the target via portals routes
        routes = find_route(self.get_servers())
        if len(routes) == 0:
            panic ("no route to", self.target_uuid)
        for (srv, r) in routes:
            lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
        except CommandError, e:
            if not ignore_connect_failure:
        # targets listed in --inactive (or flagged inactive) get set up
        # but are not activated
        if self.permits_inactive() and (self.target_uuid in config.inactive or self.active == 0):
            debug("%s inactive" % self.target_uuid)
            inactive_p = "inactive"
        # NOTE(review): `else:` and the empty inactive_p assignment are
        # missing from this extract
            debug("%s active" % self.target_uuid)
        lctl.newdev(self.module, self.name, self.uuid,
                    setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
                                            inactive_p, self.mgmt_name))
2150 if is_prepared(self.name):
2151 Module.cleanup(self)
2153 srv = choose_local_server(self.get_servers())
2155 lctl.disconnect(srv)
2157 for (srv, r) in find_route(self.get_servers()):
2158 lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
2159 except CommandError, e:
2160 log(self.module_name, "cleanup failed: ", self.name)
2164 def correct_level(self, level, op=None):
    def deactivate(self):
        """Mark this client device inactive via lctl; failure is logged,
        not fatal. NOTE(review): the `try:` line is missing from this
        extract."""
            lctl.deactivate(self.name)
        except CommandError, e:
            log(self.module_name, "deactivate failed: ", self.name)
2176 def __init__(self, db, uuid, fs_name):
2177 Client.__init__(self, db, uuid, 'mdc', fs_name)
2179 def permits_inactive(self):
2183 def __init__(self, db, uuid, fs_name):
2184 Client.__init__(self, db, uuid, 'osc', fs_name)
2186 def permits_inactive(self):
def mgmtcli_name_for_uuid(uuid):
    """Canonical device name for the management client attached to *uuid*."""
    device_name = 'MGMTCLI_%s' % uuid
    return device_name
class ManagementClient(Client):
    """Client for the management service; its device name is derived from
    the config-db node's own uuid."""

    def __init__(self, db, uuid):
        device_name = mgmtcli_name_for_uuid(db.getUUID())
        Client.__init__(self, db, uuid, 'mgmt_cli', '',
                        module_dir = 'mgmt',
                        self_name = device_name)
    def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
        """Single-OSC 'virtual LOV', used as the master half of a CMOBD."""
        Module.__init__(self, 'VLOV', db)
        if name_override != None:
            self.name = "lov_%s" % name_override
        self.add_lustre_module('lov', 'lov')
        # fixed default striping for the virtual LOV
        self.stripe_sz = 65536
        self.desc_uuid = self.uuid
        self.uuid = generate_client_uuid(self.name)
        self.fs_name = fs_name
        self.osc = get_osc(db, self.uuid, fs_name)
        # NOTE(review): guard (`if not self.osc:`) missing from extract
            panic('osc not found:', self.uuid)
        # NOTE(review): `if config_only:` / `else:` lines missing here
            self.config_only = 1
            self.config_only = None
2222 if not config.record and is_prepared(self.name):
2224 lctl.lov_setup(self.name, self.uuid, self.desc_uuid, self.stripe_cnt,
2225 self.stripe_sz, self.stripe_off, self.pattern)
2226 target_uuid = self.osc.target_uuid
2229 self.osc.prepare(ignore_connect_failure=0)
2230 except CommandError, e:
2231 print "Error preparing OSC %s\n" % osc.uuid
2233 lctl.lov_add_obd(self.name, self.uuid, target_uuid, 0, 1)
2236 target_uuid = self.osc.target_uuid
2238 if is_prepared(self.name):
2239 Module.cleanup(self)
2240 if self.config_only:
2241 panic("Can't clean up config_only LOV ", self.name)
2243 def load_module(self):
2244 if self.config_only:
2245 panic("Can't load modules for config_only LOV ", self.name)
2246 self.osc.load_module()
2247 Module.load_module(self)
2249 def cleanup_module(self):
2250 if self.config_only:
2251 panic("Can't cleanup modules for config_only LOV ", self.name)
2252 Module.cleanup_module(self)
2253 self.osc.cleanup_module()
2255 def correct_level(self, level, op=None):
2258 class CMOBD(Module):
    def __init__(self,db):
        """Cache-manager OBD: pairs a master device with a cache device."""
        Module.__init__(self, 'CMOBD', db)
        self.name = self.db.getName();
        self.uuid = generate_client_uuid(self.name)
        self.master_uuid = self.db.get_first_ref('masterobd')
        self.cache_uuid = self.db.get_first_ref('cacheobd')
        self.add_lustre_module('cmobd', 'cmobd')
        master_obd = self.db.lookup(self.master_uuid)
        # NOTE(review): existence guards are missing around the panics below
            panic('master obd not found:', self.master_uuid)
        cache_obd = self.db.lookup(self.cache_uuid)
            panic('cache obd not found:', self.cache_uuid)
        # an OST master is wrapped in a VLOV; an MDS master in an MDC
        if master_obd.get_class() == 'ost':
            self.client_uuid = generate_client_uuid(self.name)
            self.master= VLOV(master_obd, self.client_uuid, self.name,
                              "%s_master" % (self.name))
            self.master_uuid = self.master.get_uuid()
        # NOTE(review): `else:` line missing from extract
            self.master = get_mdc(db, self.name, self.master_uuid)
2280 # need to check /proc/mounts and /etc/mtab before
2281 # formatting anything.
2282 # FIXME: check if device is already formatted.
2284 self.master.prepare()
2285 if not config.record and is_prepared(self.name):
2287 self.info(self.master_uuid, self.cache_uuid)
2288 lctl.newdev("cmobd", self.name, self.uuid,
2289 setup ="%s %s" %(self.master_uuid,
2293 if is_prepared(self.name):
2294 Module.cleanup(self)
2295 self.master.cleanup()
2297 def load_module(self):
2298 self.master.load_module()
2299 Module.load_module(self)
2301 def cleanup_module(self):
2302 Module.cleanup_module(self)
2303 self.master.cleanup_module()
2305 def correct_level(self, level, op=None):
    def __init__(self, db, uuid, name, type, name_override = None):
        """Caching OBD: pairs a 'real' device with a 'cache' device.

        type selects the flavour: a LOV pair or an MDC pair (the selecting
        guard is missing from this extract — confirm in full source).
        """
        Module.__init__(self, 'COBD', db)
        self.name = self.db.getName();
        self.uuid = generate_client_uuid(self.name)
        self.real_uuid = self.db.get_first_ref('realobd')
        self.cache_uuid = self.db.get_first_ref('cacheobd')
        self.add_lustre_module('cobd', 'cobd')
        real_obd = self.db.lookup(self.real_uuid)
        # NOTE(review): existence guards are missing around the panics below
            panic('real obd not found:', self.real_uuid)
        cache_obd = self.db.lookup(self.cache_uuid)
            panic('cache obd not found:', self.cache_uuid)
        # NOTE(review): branch guard missing before the LOV pair below
            self.real = LOV(real_obd, self.real_uuid, name,
                            "%s_real" % (self.name));
            self.cache = LOV(cache_obd, self.cache_uuid, name,
                             "%s_cache" % (self.name));
            self.real = get_mdc(db, name, self.real_uuid)
            self.cache = get_mdc(db, name, self.cache_uuid)
2337 def get_real_name(self):
2338 return self.real.name
2339 def get_cache_name(self):
2340 return self.cache.name
2343 self.cache.prepare()
2344 if not config.record and is_prepared(self.name):
2346 self.info(self.real_uuid, self.cache_uuid)
2347 lctl.newdev("cobd", self.name, self.uuid,
2348 setup ="%s %s" %(self.real.name,
2352 if is_prepared(self.name):
2353 Module.cleanup(self)
2355 self.cache.cleanup()
2357 def load_module(self):
2358 self.real.load_module()
2359 Module.load_module(self)
2361 def cleanup_module(self):
2362 Module.cleanup_module(self)
2363 self.real.cleanup_module()
2365 # virtual interface for OSC and LOV
    def __init__(self, db, client_uuid, name, name_override = None):
        """Dispatch on the db class to the concrete OSC-like client:
        LOV, COBD, or plain OSC."""
        Module.__init__(self, 'VOSC', db)
        if db.get_class() == 'lov':
            self.osc = LOV(db, client_uuid, name, name_override)
        elif db.get_class() == 'cobd':
            self.osc = COBD(db, client_uuid, name, 'obd')
        # NOTE(review): `else:` line missing from this extract
            self.osc = OSC(db, client_uuid, name)
2379 return self.osc.get_uuid()
2381 return self.osc.get_name()
2386 def load_module(self):
2387 self.osc.load_module()
2388 def cleanup_module(self):
2389 self.osc.cleanup_module()
2390 def correct_level(self, level, op=None):
2391 return self.osc.correct_level(level, op)
2393 # virtual interface for MDC and LMV
    def __init__(self, db, client_uuid, name, name_override = None):
        """Dispatch on the db class to the concrete MDC-like client:
        LMV, COBD, or plain MDC."""
        Module.__init__(self, 'VMDC', db)
        if db.get_class() == 'lmv':
            self.mdc = LMV(db, client_uuid, name)
        elif db.get_class() == 'cobd':
            self.mdc = COBD(db, client_uuid, name, 'mds')
        # NOTE(review): `else:` line missing from this extract
            self.mdc = MDC(db, client_uuid, name)
2404 return self.mdc.uuid
2406 return self.mdc.name
2411 def load_module(self):
2412 self.mdc.load_module()
2413 def cleanup_module(self):
2414 self.mdc.cleanup_module()
2415 def correct_level(self, level, op=None):
2416 return self.mdc.correct_level(level, op)
2418 class ECHO_CLIENT(Module):
2419 def __init__(self,db):
2420 Module.__init__(self, 'ECHO_CLIENT', db)
2421 self.add_lustre_module('obdecho', 'obdecho')
2422 self.obd_uuid = self.db.get_first_ref('obd')
2423 obd = self.db.lookup(self.obd_uuid)
2424 self.uuid = generate_client_uuid(self.name)
2425 self.osc = VOSC(obd, self.uuid, self.name)
2428 if not config.record and is_prepared(self.name):
2431 self.osc.prepare() # XXX This is so cheating. -p
2432 self.info(self.obd_uuid)
2434 lctl.newdev("echo_client", self.name, self.uuid,
2435 setup = self.osc.get_name())
2438 if is_prepared(self.name):
2439 Module.cleanup(self)
2442 def load_module(self):
2443 self.osc.load_module()
2444 Module.load_module(self)
2446 def cleanup_module(self):
2447 Module.cleanup_module(self)
2448 self.osc.cleanup_module()
2450 def correct_level(self, level, op=None):
def generate_client_uuid(name):
    """Generate a pseudo-random client uuid embedding *name*, capped at the
    36-character uuid limit."""
    client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
                                           # NOTE(review): the middle format
                                           # argument line (presumably name)
                                           # is missing from this extract
                                           int(random.random() * 1048576),
                                           int(random.random() * 1048576))
    return client_uuid[:36]
2460 class Mountpoint(Module):
    def __init__(self,db):
        """Client mountpoint: builds the VOSC/VMDC pair needed to mount
        the filesystem at self.path."""
        Module.__init__(self, 'MTPT', db)
        self.path = self.db.get_val('path')
        self.clientoptions = self.db.get_val('clientoptions', '')
        self.fs_uuid = self.db.get_first_ref('filesystem')
        fs = self.db.lookup(self.fs_uuid)
        # prefer an LMV for metadata when the fs has one, else a plain MDS
        self.mds_uuid = fs.get_first_ref('lmv')
        if not self.mds_uuid:
            self.mds_uuid = fs.get_first_ref('mds')
        self.obd_uuid = fs.get_first_ref('obd')
        self.mgmt_uuid = fs.get_first_ref('mgmt')
        client_uuid = generate_client_uuid(self.name)
        ost = self.db.lookup(self.obd_uuid)
        # NOTE(review): existence guards are missing around the panics below
            panic("no ost: ", self.obd_uuid)
        mds = self.db.lookup(self.mds_uuid)
            panic("no mds: ", self.mds_uuid)
        self.add_lustre_module('mdc', 'mdc')
        self.add_lustre_module('lmv', 'lmv')
        self.add_lustre_module('llite', 'llite')
        self.vosc = VOSC(ost, client_uuid, self.name)
        self.vmdc = VMDC(mds, client_uuid, self.name)
        # NOTE(review): mgmt guard missing; the client is presumably only
        # built when the fs references a mgmt service — confirm in source
        self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
2496 if not config.record and fs_is_mounted(self.path):
2497 log(self.path, "already mounted.")
2501 self.mgmtcli.prepare()
2504 vmdc_name = self.vmdc.get_name()
2506 self.info(self.path, self.mds_uuid, self.obd_uuid)
2507 if config.record or config.lctl_dump:
2508 lctl.mount_option(local_node_name, self.vosc.get_name(), vmdc_name)
2511 if config.clientoptions:
2512 if self.clientoptions:
2513 self.clientoptions = self.clientoptions + ',' + \
2514 config.clientoptions
2516 self.clientoptions = config.clientoptions
2517 if self.clientoptions:
2518 self.clientoptions = ',' + self.clientoptions
2519 # Linux kernel will deal with async and not pass it to ll_fill_super,
2520 # so replace it with Lustre async
2521 self.clientoptions = string.replace(self.clientoptions, "async",
2524 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s%s %s %s" % \
2525 (self.vosc.get_name(), vmdc_name, self.clientoptions,
2526 config.config, self.path)
2527 run("mkdir", self.path)
2532 panic("mount failed:", self.path, ":", string.join(val))
2535 self.info(self.path, self.mds_uuid,self.obd_uuid)
2537 if config.record or config.lctl_dump:
2538 lctl.del_mount_option(local_node_name)
2540 if fs_is_mounted(self.path):
2542 (rc, out) = run("umount", "-f", self.path)
2544 (rc, out) = run("umount", self.path)
2546 raise CommandError('umount', out, rc)
2548 if fs_is_mounted(self.path):
2549 panic("fs is still mounted:", self.path)
2554 self.mgmtcli.cleanup()
    def load_module(self):
        """Load modules for the management client (if any), the VOSC, and
        this mountpoint."""
        # NOTE(review): guard (likely `if self.mgmtcli:`) missing from extract
        self.mgmtcli.load_module()
        self.vosc.load_module()
        Module.load_module(self)
    def cleanup_module(self):
        """Unload modules in reverse of load order."""
        Module.cleanup_module(self)
        self.vosc.cleanup_module()
        # NOTE(review): guard (likely `if self.mgmtcli:`) missing from extract
        self.mgmtcli.cleanup_module()
2568 def correct_level(self, level, op=None):
2571 # ============================================================
2572 # misc query functions
def get_ost_net(self, osd_uuid):
    """Build the list of Network objects for the node hosting *osd_uuid*.
    NOTE(review): guard and list-initialisation lines are missing from
    this extract."""
    osd = self.lookup(osd_uuid)
    node_uuid = osd.get_first_ref('node')
    node = self.lookup(node_uuid)
        # NOTE(review): `node_uuid_` (trailing underscore) looks like a typo
        # for `node_uuid` — would raise NameError if this panic fires
        panic("unable to find node for osd_uuid:", osd_uuid,
              " node_ref:", node_uuid_)
    for net_uuid in node.get_networks():
        db = node.lookup(net_uuid)
        srv_list.append(Network(db))
# the order of initialization is based on level.
def getServiceLevel(self):
    """Map a config-db class to its startup level (lower levels start first).
    NOTE(review): the `ret = <level>` assignment for each branch is missing
    from this extract."""
    type = self.get_class()
    if type in ('network',):
    elif type in ('routetbl',):
    elif type in ('ldlm',):
    elif type in ('mgmt',):
    elif type in ('osd', 'cobd'):
    elif type in ('mdsdev',):
    elif type in ('lmv',):
    elif type in ('cmobd',):
    elif type in ('mountpoint', 'echoclient'):
        panic("Unknown type: ", type)
    # clamp to the level window requested on the command line
    if ret < config.minlevel or ret > config.maxlevel:
2620 # return list of services in a profile. list is a list of tuples
2621 # [(level, db_object),]
def getServices(self):
    """Return [(level, db_object), ...] for every service referenced by
    this profile. NOTE(review): the list initialisation, sort and return
    lines are missing from this extract."""
    for ref_class, ref_uuid in self.get_all_refs():
        servdb = self.lookup(ref_uuid)
            level = getServiceLevel(servdb)
            list.append((level, servdb))
            panic('service not found: ' + ref_uuid)
2637 ############################################################
2639 # FIXME: clean this mess up!
2641 # OSC is no longer in the xml, so we have to fake it.
2642 # this is getting ugly and begging for another refactoring
def get_osc(ost_db, uuid, fs_name):
    """Fabricate an OSC client object for the given OST config-db node.
    NOTE(review): the `return osc` line is missing from this extract."""
    osc = OSC(ost_db, uuid, fs_name)
def get_mdc(db, fs_name, mds_uuid):
    """Fabricate an MDC client object for the MDS with *mds_uuid*.
    NOTE(review): the existence guard and `return mdc` lines are missing
    from this extract."""
    mds_db = db.lookup(mds_uuid);
        error("no mds:", mds_uuid)
    mdc = MDC(mds_db, mds_uuid, fs_name)
2654 ############################################################
2655 # routing ("rooting")
2656 # list of (nettype, cluster_id, nid)
def find_local_clusters(node_db):
    """Record this node's (nettype, cluster_id, nid) tuples in
    local_clusters and register an acceptor for each served port."""
    global local_clusters
    for netuuid in node_db.get_networks():
        net = node_db.lookup(netuuid)
        # NOTE(review): the srv construction line is missing from this extract
        debug("add_local", netuuid)
        local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
        # two networks listening on the same port cannot coexist
        if acceptors.has_key(srv.port):
            panic("duplicate port:", srv.port)
        acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
                                              srv.send_mem, srv.recv_mem,
2673 # This node is a gateway.
2675 def node_is_router():
2678 # If there are any routers found in the config, then this will be true
2679 # and all nodes will load kptlrouter.
def node_needs_router():
    """True when this node must load kptlrouter: either the config contains
    routers (needs_router) or this node itself routes (is_router)."""
    if needs_router:
        return needs_router
    return is_router
2684 # list of (nettype, gw, tgt_cluster_id, lo, hi)
2685 # Currently, these local routes are only added to kptlrouter route
2686 # table if they are needed to connect to a specific server. This
2687 # should be changed so all available routes are loaded, and the
2688 # ptlrouter can make all the decisions.
def find_local_routes(lustre):
    """ Scan the lustre config looking for routers . Build list of
    global local_routes, needs_router
    # NOTE(review): docstring terminator, list initialisation, the router
    # loop header, and the needs_router assignment are missing from this
    # extract — confirm structure in full source
    list = lustre.lookup_class('node')
        if router.get_val_int('router', 0):
            for (local_type, local_cluster_id, local_nid) in local_clusters:
                for netuuid in router.get_networks():
                    db = router.lookup(netuuid)
                    # a router is usable from here when it has an interface
                    # on one of our local nets/clusters
                    if (local_type == db.get_val('nettype') and
                        local_cluster_id == db.get_val('clusterid')):
                        gw = db.get_val('nid')
                        debug("find_local_routes: gw is", gw)
                        for route in router.get_local_routes(local_type, gw):
                            local_routes.append(route)
    debug("find_local_routes:", local_routes)
def choose_local_server(srv_list):
    """Pick the first server reachable over a local cluster.
    NOTE(review): the `return srv` line is missing from this extract."""
    for srv in srv_list:
        if local_cluster(srv.net_type, srv.cluster_id):
def local_cluster(net_type, cluster_id):
    """True when (net_type, cluster_id) matches one of our local clusters.
    NOTE(review): the return statements are missing from this extract."""
    for cluster in local_clusters:
        if net_type == cluster[0] and cluster_id == cluster[1]:
def local_interface(net_type, cluster_id, nid):
    """Return 1 when this node has an interface matching exactly
    (net_type, cluster_id, nid), else 0."""
    for cluster in local_clusters:
        if (net_type == cluster[0] and cluster_id == cluster[1]
            and nid == cluster[2]):
            return 1
    return 0
def find_route(srv_list):
    """Return a list of (srv, route) pairs: every server in srv_list
    that is reachable through one of the local gateway routes.

    A route is (nettype, gw, tgt_cluster_id, lo, hi); a server matches
    when its nid falls in [lo, hi] and its cluster_id matches."""
    result = []
    for srv in srv_list:
        debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
        to_type = srv.net_type
        to = srv.nid
        cluster_id = srv.cluster_id
        debug ('looking for route to', to_type, to)
        for r in local_routes:
            debug("find_route: ", r)
            if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
                result.append((srv, r))
    return result
def get_active_target(db):
    """Return the uuid of the device to use for this target: the device
    on the node chosen with --select if any, otherwise the target's
    'active' reference."""
    target_uuid = db.getUUID()
    target_name = db.getName()
    node_name = get_select(target_name)
    if node_name:
        tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
    else:
        tgt_dev_uuid = db.get_first_ref('active')
    return tgt_dev_uuid
def get_server_by_nid_uuid(db, nid_uuid):
    """Return the Network whose nid_uuid matches, or None if absent."""
    for n in db.lookup_class("network"):
        net = Network(n)
        if net.nid_uuid == nid_uuid:
            return net
    return None
############################################################
# lconf level logic
# Start a service.
def newService(db):
    """Instantiate the Module subclass matching db's config class.
    Panics on an unknown service type."""
    type = db.get_class()
    debug('Service:', type, db.getName(), db.getUUID())
    n = None
    if type == 'ldlm':
        n = LDLM(db)
    elif type == 'lov':
        n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
    elif type == 'network':
        n = Network(db)
    elif type == 'routetbl':
        n = RouteTable(db)
    elif type == 'osd':
        n = OSD(db)
    elif type == 'cobd':
        n = COBD(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
    elif type == 'cmobd':
        n = CMOBD(db)
    elif type == 'mdsdev':
        n = MDSDEV(db)
    elif type == 'mountpoint':
        n = Mountpoint(db)
    elif type == 'echoclient':
        n = ECHO_CLIENT(db)
    elif type == 'mgmt':
        n = Management(db)
    elif type == 'mgmtcli':
        n = MgmtClient(db)
    else:
        panic ("unknown service type:", type)
    return n
#
# Prepare the system to run lustre using a particular profile
# in the configuration.
#  * load the modules
#  * setup networking for the current node
#  * make sure partitions are in place and prepared
#  * initialize devices with lctl
# Levels is important, and needs to be enforced.
def for_each_profile(db, prof_list, operation):
    """Look up each profile uuid in prof_list and apply operation
    (doSetup, doCleanup, doModules, ...) to its service list."""
    for prof_uuid in prof_list:
        prof_db = db.lookup(prof_uuid)
        if not prof_db:
            # bug fix: original referenced an undefined name 'profile' here
            panic("profile:", prof_uuid, "not found.")
        services = getServices(prof_db)
        operation(services)
2816 def magic_get_osc(db, rec, lov):
2818 lov_uuid = lov.get_uuid()
2819 lov_name = lov.osc.fs_name
2821 lov_uuid = rec.getAttribute('lov_uuidref')
2822 # FIXME: better way to find the mountpoint?
2823 filesystems = db.root_node.getElementsByTagName('filesystem')
2825 for fs in filesystems:
2826 ref = fs.getElementsByTagName('obd_ref')
2827 if ref[0].getAttribute('uuidref') == lov_uuid:
2828 fsuuid = fs.getAttribute('uuid')
2832 panic("malformed xml: lov uuid '" + lov_uuid + "' referenced in 'add' record is not used by any filesystems.")
2834 mtpts = db.root_node.getElementsByTagName('mountpoint')
2837 ref = fs.getElementsByTagName('filesystem_ref')
2838 if ref[0].getAttribute('uuidref') == fsuuid:
2839 lov_name = fs.getAttribute('name')
2843 panic("malformed xml: 'add' record references lov uuid '" + lov_uuid + "', which references filesystem uuid '" + fsuuid + "', which does not reference a mountpoint.")
2845 print "lov_uuid: " + lov_uuid + "; lov_name: " + lov_name
2847 ost_uuid = rec.getAttribute('ost_uuidref')
2848 obd = db.lookup(ost_uuid)
2851 panic("malformed xml: 'add' record references ost uuid '" + ost_uuid + "' which cannot be found.")
2853 osc = get_osc(obd, lov_uuid, lov_name)
2855 panic('osc not found:', obd_uuid)
2858 # write logs for update records. sadly, logs of all types -- and updates in
2859 # particular -- are something of an afterthought. lconf needs rewritten with
2860 # these as core concepts. so this is a pretty big hack.
2861 def process_update_record(db, update, lov):
2862 for rec in update.childNodes:
2863 if rec.nodeType != rec.ELEMENT_NODE:
2866 log("found "+rec.nodeName+" record in update version " +
2867 str(update.getAttribute('version')))
2869 lov_uuid = rec.getAttribute('lov_uuidref')
2870 ost_uuid = rec.getAttribute('ost_uuidref')
2871 index = rec.getAttribute('index')
2872 gen = rec.getAttribute('generation')
2874 if not lov_uuid or not ost_uuid or not index or not gen:
2875 panic("malformed xml: 'update' record requires lov_uuid, ost_uuid, index, and generation.")
2878 tmplov = db.lookup(lov_uuid)
2880 panic("malformed xml: 'delete' record contains lov UUID '" + lov_uuid + "', which cannot be located.")
2881 lov_name = tmplov.getName()
2883 lov_name = lov.osc.name
2885 # ------------------------------------------------------------- add
2886 if rec.nodeName == 'add':
2888 lctl.lov_del_obd(lov_name, lov_uuid, ost_uuid, index, gen)
2891 osc = magic_get_osc(db, rec, lov)
2894 # Only ignore connect failures with --force, which
2895 # isn't implemented here yet.
2896 osc.prepare(ignore_connect_failure=0)
2897 except CommandError, e:
2898 print "Error preparing OSC %s\n" % osc.uuid
2901 lctl.lov_add_obd(lov_name, lov_uuid, ost_uuid, index, gen)
2903 # ------------------------------------------------------ deactivate
2904 elif rec.nodeName == 'deactivate':
2908 osc = magic_get_osc(db, rec, lov)
2912 except CommandError, e:
2913 print "Error deactivating OSC %s\n" % osc.uuid
2916 # ---------------------------------------------------------- delete
2917 elif rec.nodeName == 'delete':
2921 osc = magic_get_osc(db, rec, lov)
2927 except CommandError, e:
2928 print "Error cleaning up OSC %s\n" % osc.uuid
2931 lctl.lov_del_obd(lov_name, lov_uuid, ost_uuid, index, gen)
def process_updates(db, log_device, log_name, lov = None):
    """Record each non-empty <update> element into its own config log
    named '<log_name>-<version>' on log_device."""
    updates = db.root_node.getElementsByTagName('update')
    for u in updates:
        if not u.childNodes:
            log("ignoring empty update record (version " +
                str(u.getAttribute('version')) + ")")
            continue

        version = u.getAttribute('version')
        real_name = "%s-%s" % (log_name, version)
        lctl.clear_log(log_device, real_name)
        lctl.record(log_device, real_name)

        process_update_record(db, u, lov)

        lctl.end_record()
def doWriteconf(services):
    """Write the client configuration log on every mdsdev service."""
    #if config.nosetup:
    #    return
    for s in services:
        if s[1].get_class() == 'mdsdev':
            n = newService(s[1])
            n.write_conf()
def doSetup(services):
    """Prepare every service, ordered by (corrected) setup level so that
    lower-level services come up first."""
    if config.nosetup:
        return
    slist = []
    for s in services:
        n = newService(s[1])
        slist.append((n.level, n))
    slist.sort()
    nlist = []
    for n in slist:
        nl = n[1].correct_level(n[0])
        nlist.append((nl, n[1]))
    nlist.sort()
    for n in nlist:
        n[1].prepare()
def doModules(services):
    """Load the kernel module for every service (skipped by --nomod)."""
    if config.nomod:
        return
    for s in services:
        n = newService(s[1])
        n.load_module()
def doCleanup(services):
    """Clean up every service that reports it is safe to clean, in
    reverse (highest level first) order."""
    if config.nosetup:
        return
    slist = []
    for s in services:
        n = newService(s[1])
        slist.append((n.level, n))
    slist.sort()
    nlist = []
    for n in slist:
        nl = n[1].correct_level(n[0])
        nlist.append((nl, n[1]))
    nlist.sort()
    nlist.reverse()
    for n in nlist:
        if n[1].safe_to_clean():
            n[1].cleanup()
def doUnloadModules(services):
    """Unload kernel modules in reverse service order (skipped by
    --nomod)."""
    if config.nomod:
        return
    services.reverse()
    for s in services:
        n = newService(s[1])
        if n.safe_to_clean_modules():
            n.cleanup_module()
def doHost(lustreDB, hosts):
    """Run the requested phase (writeconf, recover, cleanup or setup)
    for this node, found under one of the names in hosts."""
    global is_router, local_node_name
    node_db = None
    for h in hosts:
        node_db = lustreDB.lookup_name(h, 'node')
        if node_db:
            break
    if not node_db:
        panic('No host entry found.')

    local_node_name = node_db.get_val('name', 0)
    is_router = node_db.get_val_int('router', 0)
    lustre_upcall = node_db.get_val('lustreUpcall', '')
    portals_upcall = node_db.get_val('portalsUpcall', '')
    timeout = node_db.get_val_int('timeout', 0)
    ptldebug = node_db.get_val('ptldebug', '')
    subsystem = node_db.get_val('subsystem', '')

    find_local_clusters(node_db)
    if not is_router:
        find_local_routes(lustreDB)

    # Two step process: (1) load modules, (2) setup lustre
    # if not cleaning, load modules first.
    prof_list = node_db.get_refs('profile')

    if config.write_conf:
        for_each_profile(node_db, prof_list, doModules)
        sys_make_devices()
        for_each_profile(node_db, prof_list, doWriteconf)
        for_each_profile(node_db, prof_list, doUnloadModules)

    elif config.recover:
        if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
            raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
                                     "--client_uuid <UUID> --conn_uuid <UUID>")
        doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
                   config.conn_uuid)

    elif config.cleanup:
        if config.force:
            # the command line can override this value
            timeout = 5
        # ugly hack, only need to run lctl commands for --dump
        if config.lctl_dump or config.record:
            for_each_profile(node_db, prof_list, doCleanup)
            return

        sys_set_timeout(timeout)
        sys_set_ptldebug(ptldebug)
        sys_set_subsystem(subsystem)
        sys_set_lustre_upcall(lustre_upcall)
        sys_set_portals_upcall(portals_upcall)

        for_each_profile(node_db, prof_list, doCleanup)
        for_each_profile(node_db, prof_list, doUnloadModules)

    else:
        # ugly hack, only need to run lctl commands for --dump
        if config.lctl_dump or config.record:
            sys_set_timeout(timeout)
            sys_set_lustre_upcall(lustre_upcall)
            for_each_profile(node_db, prof_list, doSetup)
            return

        sys_make_devices()
        sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
        sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)

        for_each_profile(node_db, prof_list, doModules)

        sys_set_debug_path()
        sys_set_ptldebug(ptldebug)
        sys_set_subsystem(subsystem)
        script = config.gdb_script
        run(lctl.lctl, ' modules >', script)
        if config.gdb:
            log ("The GDB module script is in", script)
            # pause, so user has time to break and
            # load the script
            time.sleep(5)
        sys_set_timeout(timeout)
        sys_set_lustre_upcall(lustre_upcall)
        sys_set_portals_upcall(portals_upcall)

        for_each_profile(node_db, prof_list, doSetup)
3099 def doRecovery(lustreDB, lctl, tgt_uuid, client_uuid, nid_uuid):
3100 tgt = lustreDB.lookup(tgt_uuid)
3102 raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
3103 new_uuid = get_active_target(tgt)
3105 raise Lustre.LconfError("doRecovery: no active target found for: " +
3107 net = choose_local_server(get_ost_net(lustreDB, new_uuid))
3109 raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
3111 log("Reconnecting", tgt_uuid, " to ", net.nid_uuid);
3113 oldnet = get_server_by_nid_uuid(lustreDB, nid_uuid)
3116 lctl.disconnect(oldnet)
3117 except CommandError, e:
3118 log("recover: disconnect", nid_uuid, "failed: ")
3123 except CommandError, e:
3124 log("recover: connect failed")
3127 lctl.recover(client_uuid, net.nid_uuid)
def setupModulePath(cmd, portals_dir = PORTALS_DIR):
    """Derive config.lustre / config.portals module search paths from
    the lconf command location (development mode) or from the --lustre
    and --portals command-line arguments."""
    base = os.path.dirname(cmd)
    if development_mode():
        if not config.lustre:
            debug('using objdir module paths')
            config.lustre = (os.path.join(base, ".."))
        # normalize the portals dir, using command line arg if set
        if config.portals:
            portals_dir = config.portals
        dir = os.path.join(config.lustre, portals_dir)
        config.portals = dir
        debug('config.portals', config.portals)
    elif config.lustre and config.portals:
        # production mode
        # if --lustre and --portals, normalize portals
        # can ignore PORTALS_DIR here, since it is probably useless here
        config.portals = os.path.join(config.lustre, config.portals)
        debug('config.portals B', config.portals)
3149 def sysctl(path, val):
3150 debug("+ sysctl", path, val)
3154 fp = open(os.path.join('/proc/sys', path), 'w')
def sys_set_debug_path():
    # Point the kernel debug-log dump path at the configured location.
    target = config.debug_path
    sysctl('portals/debug_path', target)
def sys_set_lustre_upcall(upcall):
    """Install the lustre upcall script; --lustre_upcall (or the shared
    --upcall) overrides the value from the node config."""
    # the command line overrides the value in the node config
    if config.lustre_upcall:
        upcall = config.lustre_upcall
    elif config.upcall:
        upcall = config.upcall
    if upcall:
        lctl.set_lustre_upcall(upcall)
def sys_set_portals_upcall(upcall):
    """Install the portals upcall script; --portals_upcall (or the
    shared --upcall) overrides the value from the node config."""
    # the command line overrides the value in the node config
    if config.portals_upcall:
        upcall = config.portals_upcall
    elif config.upcall:
        upcall = config.upcall
    if upcall:
        sysctl('portals/upcall', upcall)
def sys_set_timeout(timeout):
    """Set the lustre recovery timeout; a positive --timeout overrides
    the node-config value.  Non-positive values are ignored."""
    # the command line overrides the value in the node config
    if config.timeout and config.timeout > 0:
        timeout = config.timeout
    # idiom fix: compare against None with 'is not'
    if timeout is not None and timeout > 0:
        lctl.set_timeout(timeout)
def sys_tweak_socknal ():
    # With --single_socket, tell socknal not to use separate typed sockets.
    if not config.single_socket:
        return
    sysctl("socknal/typed", 0)
def sys_optimize_elan ():
    """Enable the event-interrupt punt-loop tweak on whichever Quadrics
    Elan config files are present on this system."""
    procfiles = ["/proc/elan/config/eventint_punt_loops",
                 "/proc/qsnet/elan3/config/eventint_punt_loops",
                 "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]
    for p in procfiles:
        if os.access(p, os.R_OK):
            run ("echo 1 > " + p)
3201 def sys_set_ptldebug(ptldebug):
3203 ptldebug = config.ptldebug
3206 val = eval(ptldebug, ptldebug_names)
3207 val = "0x%x" % (val)
3208 sysctl('portals/debug', val)
3209 except NameError, e:
3212 def sys_set_subsystem(subsystem):
3213 if config.subsystem:
3214 subsystem = config.subsystem
3217 val = eval(subsystem, subsystem_names)
3218 val = "0x%x" % (val)
3219 sysctl('portals/subsystem_debug', val)
3220 except NameError, e:
def sys_set_netmem_max(path, max):
    """Raise the socket-buffer limit at path to at least max bytes;
    never lowers an already-larger value.  No-op under --noexec."""
    debug("setting", path, "to at least", max)
    if config.noexec:
        return
    fp = open(path)
    line = fp.readline()  # renamed from 'str' to avoid shadowing the builtin
    fp.close()
    cur = int(line)
    if max > cur:
        fp = open(path, 'w')
        fp.write('%d\n' %(max))
        fp.close()
def sys_make_devices():
    # Create the portals and obd control device nodes when missing.
    for dev, minor in (('/dev/portals', 240), ('/dev/obd', 241)):
        if not os.access(dev, os.R_OK):
            run('mknod %s c 10 %d' % (dev, minor))
# Add dir to the global PATH, if not already there.
def add_to_path(new_dir):
    """Append new_dir to os.environ['PATH'] unless it is already one of
    the PATH components."""
    syspath = os.environ['PATH'].split(':')
    if new_dir in syspath:
        return
    os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
def default_debug_path():
    """Default file for kernel debug-log dumps; placed under the /r
    chroot when that directory exists."""
    path = '/tmp/lustre-log'
    if os.path.isdir('/r'):
        return '/r' + path
    else:
        return path
def default_gdb_script():
    """Default location for the generated gdb module script; placed
    under the /r chroot when that directory exists."""
    script = '/tmp/ogdb'
    if os.path.isdir('/r'):
        return '/r' + script
    else:
        return script
DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
# ensure basic elements are in the system path
def sanitise_path():
    """Make sure the standard sbin/bin directories are on PATH so
    helper commands (mknod, mkfs, ...) can be found."""
    for dir in DEFAULT_PATH:
        add_to_path(dir)
# global hack for the --select handling
tgt_select = {}
def init_select(args):
    """Populate tgt_select from --select arguments.

    args = [service=nodeA,service2=nodeB service3=nodeC]"""
    global tgt_select
    for arg in args:
        for entry in arg.split(','):
            srv, node = entry.split('=')
            tgt_select[srv] = node
def get_select(srv):
    """Return the node chosen for service srv via --select, or None."""
    if tgt_select.has_key(srv):
        return tgt_select[srv]
    return None
FLAG = Lustre.Options.FLAG
PARAM = Lustre.Options.PARAM
INTPARAM = Lustre.Options.INTPARAM
PARAMLIST = Lustre.Options.PARAMLIST
# (name, help-text[, type[, default]]) tuples consumed by Lustre.Options
lconf_options = [
    ('verbose,v', "Print system commands as they are run"),
    ('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM),
    ('config', "Cluster config name used for LDAP query", PARAM),
    ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
    ('node',   "Load config for <nodename>", PARAM),
    ('cleanup,d', "Cleans up config. (Shutdown)"),
    ('force,f', "Forced unmounting and/or obd detach during cleanup",
               FLAG, 0),
    ('single_socket', "socknal option: only use one socket instead of bundle",
               FLAG, 0),
    ('failover',"""Used to shut down without saving state.
                   This will allow this node to "give up" a service to a
                   another node for failover purposes. This will not
                   be a clean shutdown.""",
               FLAG, 0),
    ('gdb', """Prints message after creating gdb module script
                    and sleeps for 5 seconds."""),
    ('noexec,n', """Prints the commands and steps that will be run for a
                    config without executing them. This can be used to check if a
                    config file is doing what it should be doing"""),
    ('nomod', "Skip load/unload module step."),
    ('nosetup', "Skip device setup/cleanup step."),
    ('reformat', "Reformat all devices (without question)"),
    ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
    ('mountfsoptions', "Additional options for mount fs command line", PARAM),
    ('clientoptions', "Additional options for Lustre", PARAM),
    ('dump',  "Dump the kernel debug log to file before portals is unloaded",
               PARAM),
    ('write_conf', "Save all the client config information on mds."),
    ('record', "Write config information on mds."),
    ('record_log', "Name of config record log.", PARAM),
    ('record_device', "MDS device name that will record the config commands",
              PARAM),
    ('root_squash', "MDS squash root to appointed uid",
              PARAM),
    ('no_root_squash', "Don't squash root for appointed nid",
              PARAM),
    ('minlevel', "Minimum level of services to configure/cleanup",
                 INTPARAM, 0),
    ('maxlevel', """Maximum level of services to configure/cleanup
                    Levels are approximately like:
                            10 - network
                            20 - device, ldlm
                            30 - osd, mdd
                            40 - mds, ost
                            70 - mountpoint, echo_client, osc, mdc, lov""",
                 INTPARAM, 100),
    ('lustre', """Base directory of lustre sources. This parameter will
                  cause lconf to load modules from a source tree.""", PARAM),
    ('portals', """Portals source directory.  If this is a relative path,
                   then it is assumed to be relative to lustre. """, PARAM),
    ('timeout', "Set recovery timeout", INTPARAM),
    ('upcall',  "Set both portals and lustre upcall script", PARAM),
    ('lustre_upcall', "Set lustre upcall script", PARAM),
    ('portals_upcall', "Set portals upcall script", PARAM),
    ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
    ('ptldebug', "Set the portals debug level",  PARAM),
    ('subsystem', "Set the portals debug subsystem",  PARAM),
    ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
    ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
    # Client recovery options
    ('recover', "Recover a device"),
    ('group', "The group of devices to configure or cleanup", PARAM),
    ('tgt_uuid', "The failed target (required for recovery)", PARAM),
    ('client_uuid', "The failed client (required for recovery)", PARAM),
    ('conn_uuid', "The failed connection (required for recovery)", PARAM),
    ('inactive', """The name of an inactive service, to be ignored during
                    mounting (currently OST-only). Can be repeated.""",
                 PARAMLIST),
    ]
3367 global lctl, config, toplustreDB, CONFIG_FILE
3369 # in the upcall this is set to SIG_IGN
3370 signal.signal(signal.SIGCHLD, signal.SIG_DFL)
3372 cl = Lustre.Options("lconf", "config.xml", lconf_options)
3374 config, args = cl.parse(sys.argv[1:])
3375 except Lustre.OptionError, e:
3379 setupModulePath(sys.argv[0])
3381 host = socket.gethostname()
3383 # the PRNG is normally seeded with time(), which is not so good for starting
3384 # time-synchronized clusters
3385 input = open('/dev/urandom', 'r')
3387 print 'Unable to open /dev/urandom!'
3389 seed = input.read(32)
3395 init_select(config.select)
3398 # allow config to be fetched via HTTP, but only with python2
3399 if sys.version[0] != '1' and args[0].startswith('http://'):
3402 config_file = urllib2.urlopen(args[0])
3403 except (urllib2.URLError, socket.error), err:
3404 if hasattr(err, 'args'):
3406 print "Could not access '%s': %s" %(args[0], err)
3408 elif not os.access(args[0], os.R_OK):
3409 print 'File not found or readable:', args[0]
3413 config_file = open(args[0], 'r')
3415 dom = xml.dom.minidom.parse(config_file)
3417 panic("%s does not appear to be a config file." % (args[0]))
3418 sys.exit(1) # make sure to die here, even in debug mode.
3420 CONFIG_FILE = args[0]
3421 lustreDB = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
3422 if not config.config:
3423 config.config = os.path.basename(args[0])# use full path?
3424 if config.config[-4:] == '.xml':
3425 config.config = config.config[:-4]
3426 elif config.ldapurl:
3427 if not config.config:
3428 panic("--ldapurl requires --config name")
3429 dn = "config=%s,fs=lustre" % (config.config)
3430 lustreDB = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
3431 elif config.ptldebug or config.subsystem:
3432 sys_set_ptldebug(None)
3433 sys_set_subsystem(None)
3436 print 'Missing config file or ldap URL.'
3437 print 'see lconf --help for command summary'
3440 toplustreDB = lustreDB
3442 ver = lustreDB.get_version()
3444 panic("No version found in config data, please recreate.")
3445 if ver != Lustre.CONFIG_VERSION:
3446 panic("Config version", ver, "does not match lconf version",
3447 Lustre.CONFIG_VERSION)
3451 node_list.append(config.node)
3454 node_list.append(host)
3455 node_list.append('localhost')
3457 debug("configuring for host: ", node_list)
3460 config.debug_path = config.debug_path + '-' + host
3461 config.gdb_script = config.gdb_script + '-' + host
3463 lctl = LCTLInterface('lctl')
3465 if config.lctl_dump:
3466 lctl.use_save_file(config.lctl_dump)
3469 if not (config.record_device and config.record_log):
3470 panic("When recording, both --record_log and --record_device must be specified.")
3471 lctl.clear_log(config.record_device, config.record_log)
3472 lctl.record(config.record_device, config.record_log)
3474 doHost(lustreDB, node_list)
3476 if not config.record:
3481 process_updates(lustreDB, config.record_device, config.record_log)
3483 if __name__ == "__main__":
3486 except Lustre.LconfError, e:
3488 # traceback.print_exc(file=sys.stdout)
3490 except CommandError, e:
3494 if first_cleanup_error:
3495 sys.exit(first_cleanup_error)