3 # Copyright (C) 2002-2003 Cluster File Systems, Inc.
4 # Authors: Robert Read <rread@clusterfs.com>
5 # Mike Shaver <shaver@clusterfs.com>
6 # This file is part of Lustre, http://www.lustre.org.
8 # Lustre is free software; you can redistribute it and/or
9 # modify it under the terms of version 2 of the GNU General Public
10 # License as published by the Free Software Foundation.
12 # Lustre is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Lustre; if not, write to the Free Software
19 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 # lconf - lustre configuration tool
23 # lconf is the main driver script for starting and stopping
24 # lustre filesystem services.
26 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import sys, getopt, types
29 import string, os, stat, popen2, socket, time, random, fcntl, select
30 import re, exceptions, signal, traceback
31 import xml.dom.minidom
33 if sys.version[0] == '1':
34 from FCNTL import F_GETFL, F_SETFL
36 from fcntl import F_GETFL, F_SETFL
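# Note: on Python 1.5.x the F_GETFL/F_SETFL constants lived in the
# all-caps FCNTL module; from Python 2.x onward fcntl exports them
# directly, which is what the version check above selects between.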
38 PYMOD_DIR = "/usr/lib/lustre/python"
40 def development_mode():
41 base = os.path.dirname(sys.argv[0])
42 if os.access(base+"/Makefile", os.R_OK):
46 if development_mode():
47 sys.path.append('../utils')
49 sys.path.append(PYMOD_DIR)
55 DEFAULT_TCPBUF = 8388608
58 # Maximum number of devices to search for.
59 # (the /dev/loop* nodes need to be created beforehand)
60 MAX_LOOP_DEVICES = 256
61 PORTALS_DIR = 'portals'
63 # Needed to call lconf --record
66 # Please keep these in sync with the values in portals/kp30.h
78 "warning" : (1 << 10),
82 "portals" : (1 << 14),
84 "dlmtrace" : (1 << 16),
88 "rpctrace" : (1 << 20),
89 "vfstrace" : (1 << 21),
95 "undefined" : (1 << 0),
105 "portals" : (1 << 10),
106 "socknal" : (1 << 11),
107 "qswnal" : (1 << 12),
108 "pinger" : (1 << 13),
109 "filter" : (1 << 14),
115 "ptlrouter" : (1 << 20),
122 first_cleanup_error = 0
123 def cleanup_error(rc):
124 global first_cleanup_error
125 if not first_cleanup_error:
126 first_cleanup_error = rc
128 # ============================================================
129 # debugging and error funcs
131 def fixme(msg = "this feature"):
132 raise Lustre.LconfError(msg + ' not implemented yet.')
135 msg = string.join(map(str,args))
136 if not config.noexec:
137 raise Lustre.LconfError(msg)
142 msg = string.join(map(str,args))
147 print string.strip(s)
151 msg = string.join(map(str,args))
154 # ack, python's builtin int() does not support '0x123' syntax.
155 # eval can do it, although what a hack!
159 return eval(s, {}, {})
162 except SyntaxError, e:
163 raise ValueError("not a number")
165 raise ValueError("not a number")
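# A minimal alternative (assuming Python 2.x or later) would be int(s, 0),
# which accepts both decimal and '0x'-prefixed strings without eval, e.g.
#   int('0x123', 0) == 291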
167 # ============================================================
168 # locally defined exceptions
169 class CommandError (exceptions.Exception):
170 def __init__(self, cmd_name, cmd_err, rc=None):
171 self.cmd_name = cmd_name
172 self.cmd_err = cmd_err
177 if type(self.cmd_err) == types.StringType:
179 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
181 print "! %s: %s" % (self.cmd_name, self.cmd_err)
182 elif type(self.cmd_err) == types.ListType:
184 print "! %s (error %d):" % (self.cmd_name, self.rc)
186 print "! %s:" % (self.cmd_name)
187 for s in self.cmd_err:
188 print "> %s" %(string.strip(s))
193 # ============================================================
194 # handle daemons, like the acceptor
196 """ Manage starting and stopping a daemon. Assumes daemon manages
197 its own pid file. """
199 def __init__(self, cmd):
205 log(self.command, "already running.")
207 self.path = find_prog(self.command)
209 panic(self.command, "not found.")
210 ret, out = runcmd(self.path +' '+ self.command_line())
212 raise CommandError(self.path, out, ret)
216 pid = self.read_pidfile()
218 log ("killing process", pid)
220 #time.sleep(1) # let daemon die
222 log("unable to kill", self.command, e)
224 log("unable to kill", self.command)
227 pid = self.read_pidfile()
237 def read_pidfile(self):
239 fp = open(self.pidfile(), 'r')
246 def clean_pidfile(self):
247 """ Remove a stale pidfile """
248 log("removing stale pidfile:", self.pidfile())
250 os.unlink(self.pidfile())
252 log(self.pidfile(), e)
254 class AcceptorHandler(DaemonHandler):
255 def __init__(self, port, net_type, send_mem, recv_mem, irq_aff):
256 DaemonHandler.__init__(self, "acceptor")
259 self.send_mem = send_mem
260 self.recv_mem = recv_mem
263 self.flags = self.flags + ' -i'
266 return "/var/run/%s-%d.pid" % (self.command, self.port)
268 def command_line(self):
269 return string.join(map(str,('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port)))
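# Illustrative command line built above (buffer sizes are the defaults,
# the port number is hypothetical):
#   acceptor -s 8388608 -r 8388608 988
# with '-i' appended to the flags when irq affinity is requested.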
273 # start the acceptors
275 if config.lctl_dump or config.record:
277 for port in acceptors.keys():
278 daemon = acceptors[port]
279 if not daemon.running():
282 def run_one_acceptor(port):
283 if config.lctl_dump or config.record:
285 if acceptors.has_key(port):
286 daemon = acceptors[port]
287 if not daemon.running():
290 panic("run_one_acceptor: No acceptor defined for port:", port)
292 def stop_acceptor(port):
293 if acceptors.has_key(port):
294 daemon = acceptors[port]
299 # ============================================================
300 # handle lctl interface
303 Manage communication with lctl
306 def __init__(self, cmd):
308 Initialize the class by finding the lctl binary.
310 self.lctl = find_prog(cmd)
312 self.record_device = ''
315 debug('! lctl not found')
318 raise CommandError('lctl', "unable to find lctl binary.")
320 def use_save_file(self, file):
321 self.save_file = file
323 def record(self, dev_name, logname):
324 log("Recording log", logname, "on", dev_name)
325 self.record_device = dev_name
326 self.record_log = logname
328 def end_record(self):
329 log("End recording log", self.record_log, "on", self.record_device)
330 self.record_device = None
331 self.record_log = None
333 def set_nonblock(self, fd):
334 fl = fcntl.fcntl(fd, F_GETFL)
335 fcntl.fcntl(fd, F_SETFL, fl | os.O_NDELAY)
340 the cmds are written to stdin of lctl
341 lctl doesn't return errors when run in script mode, so
343 should modify command line to accept multiple commands, or
344 create complex command line options
348 cmds = '\n dump ' + self.save_file + '\n' + cmds
349 elif self.record_device:
353 %s""" % (self.record_device, self.record_log, cmds)
355 debug("+", cmd_line, cmds)
356 if config.noexec: return (0, [])
358 child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
359 child.tochild.write(cmds + "\n")
360 child.tochild.close()
361 # print "LCTL:", cmds
363 # From the "Python Cookbook" (O'Reilly)
364 outfile = child.fromchild
365 outfd = outfile.fileno()
366 self.set_nonblock(outfd)
367 errfile = child.childerr
368 errfd = errfile.fileno()
369 self.set_nonblock(errfd)
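# Read stdout and stderr concurrently with select() until both streams
# hit EOF; the non-blocking reads prevent a deadlock if lctl fills one
# pipe while we are blocked reading the other.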
371 outdata = errdata = ''
374 ready = select.select([outfd,errfd],[],[]) # Wait for input
375 if outfd in ready[0]:
376 outchunk = outfile.read()
377 if outchunk == '': outeof = 1
378 outdata = outdata + outchunk
379 if errfd in ready[0]:
380 errchunk = errfile.read()
381 if errchunk == '': erreof = 1
382 errdata = errdata + errchunk
383 if outeof and erreof: break
384 # end of "borrowed" code
387 if os.WIFEXITED(ret):
388 rc = os.WEXITSTATUS(ret)
391 if rc or len(errdata):
392 raise CommandError(self.lctl, errdata, rc)
395 def runcmd(self, *args):
397 run lctl using the command line
399 cmd = string.join(map(str,args))
400 debug("+", self.lctl, cmd)
401 rc, out = run(self.lctl, cmd)
403 raise CommandError(self.lctl, out, rc)
407 def clear_log(self, dev, log):
408 """ clear an existing log """
413 quit """ % (dev, log)
416 def network(self, net, nid):
421 quit """ % (net, nid)
424 # create a new connection
425 def add_uuid(self, net_type, uuid, nid):
426 cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type)
429 def add_autoconn(self, net_type, send_mem, recv_mem, nid, hostaddr,
431 if net_type in ('tcp',) and not config.lctl_dump:
436 add_autoconn %s %s %d %s
440 nid, hostaddr, port, flags )
443 def connect(self, srv):
444 self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
445 if srv.net_type in ('tcp',) and not config.lctl_dump:
449 self.add_autoconn(srv.net_type, srv.send_mem, srv.recv_mem,
450 srv.nid, srv.hostaddr, srv.port, flags)
453 def recover(self, dev_name, new_conn):
456 recover %s""" %(dev_name, new_conn)
459 # add a route to a range
460 def add_route(self, net, gw, lo, hi):
468 except CommandError, e:
472 def del_route(self, net, gw, lo, hi):
477 quit """ % (net, gw, lo, hi)
480 # add a route to a host
481 def add_route_host(self, net, uuid, gw, tgt):
482 self.add_uuid(net, uuid, tgt)
490 except CommandError, e:
494 # add a route to a range
495 def del_route_host(self, net, uuid, gw, tgt):
501 quit """ % (net, gw, tgt)
505 def del_autoconn(self, net_type, nid, hostaddr):
506 if net_type in ('tcp',) and not config.lctl_dump:
515 # disconnect one connection
516 def disconnect(self, srv):
517 self.del_uuid(srv.nid_uuid)
518 if srv.net_type in ('tcp',) and not config.lctl_dump:
519 self.del_autoconn(srv.net_type, srv.nid, srv.hostaddr)
521 def del_uuid(self, uuid):
529 def disconnectAll(self, net):
537 def attach(self, type, name, uuid):
540 quit""" % (type, name, uuid)
543 def setup(self, name, setup = ""):
547 quit""" % (name, setup)
551 # create a new device with lctl
552 def newdev(self, type, name, uuid, setup = ""):
553 self.attach(type, name, uuid);
555 self.setup(name, setup)
556 except CommandError, e:
557 self.cleanup(name, uuid, 0)
562 def cleanup(self, name, uuid, force, failover = 0):
563 if failover: force = 1
569 quit""" % (name, ('', 'force')[force],
570 ('', 'failover')[failover])
574 def lov_setup(self, name, uuid, desc_uuid, stripe_cnt,
575 stripe_sz, stripe_off, pattern):
578 lov_setup %s %d %d %d %s
579 quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off, pattern)
582 # add an OBD to a LOV
583 def lov_add_obd(self, name, uuid, obd_uuid, index, gen):
585 lov_modify_tgts add %s %s %s %s
586 quit""" % (name, obd_uuid, index, gen)
590 def lmv_setup(self, name, uuid, desc_uuid, devlist):
594 quit""" % (name, uuid, desc_uuid, devlist)
597 # delete an OBD from a LOV
598 def lov_del_obd(self, name, uuid, obd_uuid, index, gen):
600 lov_modify_tgts del %s %s %s %s
601 quit""" % (name, obd_uuid, index, gen)
605 def deactivate(self, name):
613 def dump(self, dump_file):
616 quit""" % (dump_file)
619 # get list of devices
620 def device_list(self):
621 devices = '/proc/fs/lustre/devices'
623 if os.access(devices, os.R_OK):
625 fp = open(devices, 'r')
633 def lustre_version(self):
634 rc, out = self.runcmd('version')
638 def mount_option(self, profile, osc, mdc):
640 mount_option %s %s %s
641 quit""" % (profile, osc, mdc)
644 # delete mount options
645 def del_mount_option(self, profile):
651 def set_timeout(self, timeout):
657 def set_lustre_upcall(self, upcall):
662 # ============================================================
663 # Various system-level functions
664 # (ideally moved to their own module)
666 # Run a command and return the output and status.
667 # stderr is merged into stdout via '2>&1'; use popen3 instead
668 # if the two streams ever need to be kept separate.
671 if config.noexec: return (0, [])
672 f = os.popen(cmd + ' 2>&1')
682 cmd = string.join(map(str,args))
685 # Run a command in the background.
686 def run_daemon(*args):
687 cmd = string.join(map(str,args))
689 if config.noexec: return 0
690 f = os.popen(cmd + ' 2>&1')
698 # Determine full path to use for an external command
699 # searches dirname(argv[0]) first, then PATH
701 syspath = string.split(os.environ['PATH'], ':')
702 cmdpath = os.path.dirname(sys.argv[0])
703 syspath.insert(0, cmdpath);
705 syspath.insert(0, os.path.join(config.portals, 'utils/'))
707 prog = os.path.join(d,cmd)
708 if os.access(prog, os.X_OK):
712 # Recursively look for file starting at base dir
713 def do_find_file(base, mod):
714 fullname = os.path.join(base, mod)
715 if os.access(fullname, os.R_OK):
717 for d in os.listdir(base):
718 dir = os.path.join(base,d)
719 if os.path.isdir(dir):
720 module = do_find_file(dir, mod)
724 def find_module(src_dir, dev_dir, modname):
725 modbase = src_dir +'/'+ dev_dir +'/'+ modname
726 for modext in '.ko', '.o':
727 module = modbase + modext
729 if os.access(module, os.R_OK):
735 # is the path a block device?
742 return stat.S_ISBLK(s[stat.ST_MODE])
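# Regular files (e.g. loopback backing files) fail the S_ISBLK test and
# are routed through the loop-device setup in block_dev()/init_loop().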
744 # build fs according to type
746 def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
752 panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
754 # devsize is in 1k blocks, and the fs block count is in 4k blocks
755 block_cnt = devsize/4
757 if fstype in ('ext3', 'extN', 'ldiskfs'):
758 # ext3 journal size is in megabytes
761 if not is_block(dev):
762 ret, out = runcmd("ls -l %s" %dev)
763 devsize = int(string.split(out[0])[4]) / 1024
765 ret, out = runcmd("sfdisk -s %s" %dev)
766 devsize = int(out[0])
767 if devsize > 1024 * 1024:
768 jsize = ((devsize / 102400) * 4)
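# Heuristic: devsize is in 1k blocks, so for devices over ~1GB this
# allocates roughly 4MB of ext3 journal per 100MB of device.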
771 if jsize: jopt = "-J size=%d" %(jsize,)
772 if isize: iopt = "-I %d" %(isize,)
773 mkfs = 'mkfs.ext2 -j -b 4096 '
774 if not isblock or config.force:
776 elif fstype == 'reiserfs':
777 # reiserfs journal size is in blocks
778 if jsize: jopt = "--journal_size %d" %(jsize,)
779 mkfs = 'mkreiserfs -ff'
781 panic('unsupported fs type: ', fstype)
783 if config.mkfsoptions != None:
784 mkfs = mkfs + ' ' + config.mkfsoptions
785 if mkfsoptions != None:
786 mkfs = mkfs + ' ' + mkfsoptions
787 (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt)
789 panic("Unable to build fs:", dev, string.join(out))
790 # enable hash tree (dir_index) indexing on the new filesystem
791 if fstype in ('ext3', 'extN', 'ldiskfs'):
792 htree = 'echo "feature FEATURE_C5" | debugfs -w'
793 (ret, out) = run (htree, dev)
795 panic("Unable to enable htree:", dev)
797 # some systems use /dev/loopN, some /dev/loop/N
801 if not os.access(loop + str(0), os.R_OK):
803 if not os.access(loop + str(0), os.R_OK):
804 panic ("can't access loop devices")
807 # find loop device assigned to the file
808 def find_assigned_loop(file):
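# 'losetup /dev/loopN' typically prints a line such as
#   /dev/loop0: [0305]:12 (/tmp/ost1-backing-file)
# (the path here is illustrative); the regex below extracts the backing
# file path from the parentheses and compares it against 'file'.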
810 for n in xrange(0, MAX_LOOP_DEVICES):
812 if os.access(dev, os.R_OK):
813 (stat, out) = run('losetup', dev)
814 if out and stat == 0:
815 m = re.search(r'\((.*)\)', out[0])
816 if m and file == m.group(1):
822 # create file if necessary and assign the first free loop device
823 def init_loop(file, size, fstype, journal_size, inode_size,
824 mkfsoptions, reformat, autoformat, backfstype, backfile):
827 realfstype = backfstype
828 if is_block(backfile):
829 if reformat or (need_format(realfstype, backfile) and autoformat == 'yes'):
830 mkfs(realfile, size, realfstype, journal_size, inode_size, mkfsoptions, isblock=0)
836 dev = find_assigned_loop(realfile)
838 print 'WARNING file:', realfile, 'already mapped to', dev
841 if reformat or not os.access(realfile, os.R_OK | os.W_OK):
843 panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (realfile, size))
844 (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, realfile))
846 panic("Unable to create backing store:", realfile)
848 mkfs(realfile, size, realfstype, journal_size, inode_size,
849 mkfsoptions, isblock=0)
852 # find next free loop
853 for n in xrange(0, MAX_LOOP_DEVICES):
855 if os.access(dev, os.R_OK):
856 (stat, out) = run('losetup', dev)
858 run('losetup', dev, realfile)
861 print "out of loop devices"
863 print "out of loop devices"
866 # undo loop assignment
867 def clean_loop(file):
868 dev = find_assigned_loop(file)
870 ret, out = run('losetup -d', dev)
872 log('unable to clean loop device:', dev, 'for file:', file)
875 # determine if dev is formatted as a <fstype> filesystem
876 def need_format(fstype, dev):
877 # FIXME don't know how to implement this
880 # initialize a block device if needed
881 def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
882 inode_size, mkfsoptions, backfstype, backdev):
886 if fstype == 'smfs' or not is_block(dev):
887 dev = init_loop(dev, size, fstype, journal_size, inode_size,
888 mkfsoptions, reformat, autoformat, backfstype, backdev)
889 elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
890 mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
893 # panic("device:", dev,
894 # "not prepared, and autoformat is not set.\n",
895 # "Rerun with --reformat option to format ALL filesystems")
900 """lookup IP address for an interface"""
901 rc, out = run("/sbin/ifconfig", iface)
904 addr = string.split(out[1])[1]
905 ip = string.split(addr, ':')[1]
908 def def_mount_options(fstype, target):
909 """returns deafult mount options for passed fstype and target (mds, ost)"""
910 if fstype == 'ext3' or fstype == 'ldiskfs':
911 mountfsoptions = "errors=remount-ro"
912 if target == 'ost' and sys_get_branch() == '2.4':
913 mountfsoptions = "%s,asyncdel" % (mountfsoptions)
914 return mountfsoptions
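# Example: for an ext3/ldiskfs OST on a 2.4 kernel this returns
# "errors=remount-ro,asyncdel"; on 2.6 kernels, and for the MDS, it is
# just "errors=remount-ro".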
917 def sys_get_elan_position_file():
918 procfiles = ["/proc/elan/device0/position",
919 "/proc/qsnet/elan4/device0/position",
920 "/proc/qsnet/elan3/device0/position"]
922 if os.access(p, os.R_OK):
926 def sys_get_local_nid(net_type, wildcard, cluster_id):
927 """Return the local nid."""
929 if sys_get_elan_position_file():
930 local = sys_get_local_address('elan', '*', cluster_id)
932 local = sys_get_local_address(net_type, wildcard, cluster_id)
935 def sys_get_local_address(net_type, wildcard, cluster_id):
936 """Return the local address for the network type."""
938 if net_type in ('tcp',):
940 iface, star = string.split(wildcard, ':')
941 local = if2addr(iface)
943 panic ("unable to determine ip for:", wildcard)
945 host = socket.gethostname()
946 local = socket.gethostbyname(host)
947 elif net_type == 'elan':
948 # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()'
949 f = sys_get_elan_position_file()
951 panic ("unable to determine local Elan ID")
954 lines = fp.readlines()
962 nid = my_int(cluster_id) + my_int(elan_id)
964 except ValueError, e:
968 elif net_type == 'gm':
969 fixme("automatic local address for GM")
973 def sys_get_branch():
974 """Returns kernel release"""
976 fp = open('/proc/sys/kernel/osrelease')
977 lines = fp.readlines()
981 version = string.split(l)
982 a = string.split(version[0], '.')
983 return a[0] + '.' + a[1]
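# Example: an osrelease of '2.4.20-28.9' yields '2.4'.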
989 def mod_loaded(modname):
990 """Check if a module is already loaded. Look in /proc/modules for it."""
992 fp = open('/proc/modules')
993 lines = fp.readlines()
995 # please forgive my tired fingers for this one
996 ret = filter(lambda word, mod=modname: word == mod,
997 map(lambda line: string.split(line)[0], lines))
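# i.e. compare the first whitespace-separated token (the module name) of
# every /proc/modules line against modname; a non-empty result means the
# module is already loaded.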
1002 # XXX: instead of device_list, ask for $name and see what we get
1003 def is_prepared(name):
1004 """Return true if a device exists for the name"""
1005 if config.lctl_dump:
1007 if (config.noexec or config.record) and config.cleanup:
1010 # expect this format:
1011 # 1 UP ldlm ldlm ldlm_UUID 2
1012 out = lctl.device_list()
1014 if name == string.split(s)[3]:
1016 except CommandError, e:
1020 def is_network_prepared():
1021 """If the any device exists, then assume that all networking
1022 has been configured"""
1023 out = lctl.device_list()
1026 def fs_is_mounted(path):
1027 """Return true if path is a mounted lustre filesystem"""
1029 fp = open('/proc/mounts')
1030 lines = fp.readlines()
1034 if a[1] == path and a[2] == 'lustre_lite':
1042 """Manage kernel modules"""
1043 def __init__(self, lustre_dir, portals_dir):
1044 self.lustre_dir = lustre_dir
1045 self.portals_dir = portals_dir
1046 self.kmodule_list = []
1048 def add_portals_module(self, dev_dir, modname):
1049 """Append a module to list of modules to load."""
1050 self.kmodule_list.append((self.portals_dir, dev_dir, modname))
1052 def add_lustre_module(self, dev_dir, modname):
1053 """Append a module to list of modules to load."""
1054 self.kmodule_list.append((self.lustre_dir, dev_dir, modname))
1056 def load_module(self):
1057 """Load all the modules in the list in the order they appear."""
1058 for src_dir, dev_dir, mod in self.kmodule_list:
1059 if mod_loaded(mod) and not config.noexec:
1061 log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
1063 module = find_module(src_dir, dev_dir, mod)
1065 panic('module not found:', mod)
1066 (rc, out) = run('/sbin/insmod', module)
1068 raise CommandError('insmod', out, rc)
1070 (rc, out) = run('/sbin/modprobe', mod)
1072 raise CommandError('modprobe', out, rc)
1074 def cleanup_module(self):
1075 """Unload the modules in the list in reverse order."""
1076 rev = self.kmodule_list
1078 for src_dir, dev_dir, mod in rev:
1079 if not mod_loaded(mod) and not config.noexec:
1082 if mod == 'portals' and config.dump:
1083 lctl.dump(config.dump)
1084 log('unloading module:', mod)
1085 (rc, out) = run('/sbin/rmmod', mod)
1087 log('! unable to unload module:', mod)
1090 # ============================================================
1091 # Classes to prepare and cleanup the various objects
1094 """ Base class for the rest of the modules. The default cleanup method is
1095 defined here, as well as some utility functions.
1097 def __init__(self, module_name, db):
1099 self.module_name = module_name
1100 self.name = self.db.getName()
1101 self.uuid = self.db.getUUID()
1104 self.kmod = kmod(config.lustre, config.portals)
1106 def info(self, *args):
1107 msg = string.join(map(str,args))
1108 print self.module_name + ":", self.name, self.uuid, msg
1111 """ default cleanup, used for most modules """
1114 lctl.cleanup(self.name, self.uuid, config.force)
1115 except CommandError, e:
1116 log(self.module_name, "cleanup failed: ", self.name)
1120 def add_portals_module(self, dev_dir, modname):
1121 """Append a module to list of modules to load."""
1122 self.kmod.add_portals_module(dev_dir, modname)
1124 def add_lustre_module(self, dev_dir, modname):
1125 """Append a module to list of modules to load."""
1126 self.kmod.add_lustre_module(dev_dir, modname)
1128 def load_module(self):
1129 """Load all the modules in the list in the order they appear."""
1130 self.kmod.load_module()
1132 def cleanup_module(self):
1133 """Unload the modules in the list in reverse order."""
1134 if self.safe_to_clean():
1135 self.kmod.cleanup_module()
1137 def safe_to_clean(self):
1140 def safe_to_clean_modules(self):
1141 return self.safe_to_clean()
1143 class Network(Module):
1144 def __init__(self,db):
1145 Module.__init__(self, 'NETWORK', db)
1146 self.net_type = self.db.get_val('nettype')
1147 self.nid = self.db.get_val('nid', '*')
1148 self.cluster_id = self.db.get_val('clusterid', "0")
1149 self.port = self.db.get_val_int('port', 0)
1150 self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
1151 self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
1152 self.irq_affinity = self.db.get_val_int('irqaffinity', 0)
1155 self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
1157 panic("unable to set nid for", self.net_type, self.nid, cluster_id)
1158 self.generic_nid = 1
1159 debug("nid:", self.nid)
1161 self.generic_nid = 0
1163 self.nid_uuid = self.nid_to_uuid(self.nid)
1165 self.hostaddr = self.db.get_val('hostaddr', self.nid)
1166 if '*' in self.hostaddr:
1167 self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id)
1168 if not self.hostaddr:
1169 panic("unable to set hostaddr for", self.net_type, self.hostaddr, self.cluster_id)
1170 debug("hostaddr:", self.hostaddr)
1172 self.add_portals_module("libcfs", 'libcfs')
1173 self.add_portals_module("portals", 'portals')
1174 if node_needs_router():
1175 self.add_portals_module("router", 'kptlrouter')
1176 if self.net_type == 'tcp':
1177 self.add_portals_module("knals/socknal", 'ksocknal')
1178 if self.net_type == 'elan':
1179 self.add_portals_module("knals/qswnal", 'kqswnal')
1180 if self.net_type == 'gm':
1181 self.add_portals_module("knals/gmnal", 'kgmnal')
1183 def nid_to_uuid(self, nid):
1184 return "NID_%s_UUID" %(nid,)
1187 if not config.record and is_network_prepared():
1189 self.info(self.net_type, self.nid, self.port)
1190 if not (config.record and self.generic_nid):
1191 lctl.network(self.net_type, self.nid)
1192 if self.net_type == 'tcp':
1194 if self.net_type == 'elan':
1196 if self.port and node_is_router():
1197 run_one_acceptor(self.port)
1198 self.connect_peer_gateways()
1200 def connect_peer_gateways(self):
1201 for router in self.db.lookup_class('node'):
1202 if router.get_val_int('router', 0):
1203 for netuuid in router.get_networks():
1204 net = self.db.lookup(netuuid)
1206 if (gw.cluster_id == self.cluster_id and
1207 gw.net_type == self.net_type):
1208 if gw.nid != self.nid:
1211 def disconnect_peer_gateways(self):
1212 for router in self.db.lookup_class('node'):
1213 if router.get_val_int('router', 0):
1214 for netuuid in router.get_networks():
1215 net = self.db.lookup(netuuid)
1217 if (gw.cluster_id == self.cluster_id and
1218 gw.net_type == self.net_type):
1219 if gw.nid != self.nid:
1222 except CommandError, e:
1223 print "disconnect failed: ", self.name
1227 def safe_to_clean(self):
1228 return not is_network_prepared()
1231 self.info(self.net_type, self.nid, self.port)
1233 stop_acceptor(self.port)
1234 if node_is_router():
1235 self.disconnect_peer_gateways()
1237 def correct_level(self, level, op=None):
1240 class RouteTable(Module):
1241 def __init__(self,db):
1242 Module.__init__(self, 'ROUTES', db)
1244 def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
1246 # only setup connections for tcp NALs
1248 if not net_type in ('tcp',):
1251 # connect to target if route is to single node and this node is the gw
1252 if lo == hi and local_interface(net_type, gw_cluster_id, gw):
1253 if not local_cluster(net_type, tgt_cluster_id):
1254 panic("target", lo, " not on the local cluster")
1255 srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
1256 # connect to gateway if this node is not the gw
1257 elif (local_cluster(net_type, gw_cluster_id)
1258 and not local_interface(net_type, gw_cluster_id, gw)):
1259 srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
1264 panic("no server for nid", lo)
1267 return Network(srvdb)
1270 if not config.record and is_network_prepared():
1273 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1274 lctl.add_route(net_type, gw, lo, hi)
1275 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1279 def safe_to_clean(self):
1280 return not is_network_prepared()
1283 if is_network_prepared():
1284 # the network is still being used, don't clean it up
1286 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1287 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1290 lctl.disconnect(srv)
1291 except CommandError, e:
1292 print "disconnect failed: ", self.name
1297 lctl.del_route(net_type, gw, lo, hi)
1298 except CommandError, e:
1299 print "del_route failed: ", self.name
1303 class Management(Module):
1304 def __init__(self, db):
1305 Module.__init__(self, 'MGMT', db)
1306 self.add_lustre_module('lvfs', 'lvfs')
1307 self.add_lustre_module('obdclass', 'obdclass')
1308 self.add_lustre_module('ptlrpc', 'ptlrpc')
1309 self.add_lustre_module('mgmt', 'mgmt_svc')
1312 if not config.record and is_prepared(self.name):
1315 lctl.newdev("mgmt", self.name, self.uuid)
1317 def safe_to_clean(self):
1321 if is_prepared(self.name):
1322 Module.cleanup(self)
1324 def correct_level(self, level, op=None):
1327 # This is only needed to load the modules; the LDLM device
1328 # is now created automatically.
1330 def __init__(self,db):
1331 Module.__init__(self, 'LDLM', db)
1332 self.add_lustre_module('lvfs', 'lvfs')
1333 self.add_lustre_module('obdclass', 'obdclass')
1334 self.add_lustre_module('ptlrpc', 'ptlrpc')
1342 def correct_level(self, level, op=None):
1347 def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
1348 Module.__init__(self, 'LOV', db)
1349 if name_override != None:
1350 self.name = "lov_%s" % name_override
1351 self.add_lustre_module('lov', 'lov')
1352 self.mds_uuid = self.db.get_first_ref('mds')
1353 self.stripe_sz = self.db.get_val_int('stripesize', 1048576)
1354 self.stripe_off = self.db.get_val_int('stripeoffset', 0)
1355 self.pattern = self.db.get_val_int('stripepattern', 0)
1356 self.devlist = self.db.get_lov_tgts('lov_tgt')
1357 self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
1359 self.desc_uuid = self.uuid
1360 self.uuid = generate_client_uuid(self.name)
1361 self.fs_name = fs_name
1363 self.config_only = 1
1365 self.config_only = None
1366 mds = self.db.lookup(self.mds_uuid)
1367 self.mds_name = mds.getName()
1368 for (obd_uuid, index, gen, active) in self.devlist:
1371 obd = self.db.lookup(obd_uuid)
1372 osc = get_osc(obd, self.uuid, fs_name)
1374 self.osclist.append((osc, index, gen, active))
1376 panic('osc not found:', obd_uuid)
1382 if not config.record and is_prepared(self.name):
1384 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
1385 self.stripe_off, self.pattern, self.devlist,
1387 lctl.lov_setup(self.name, self.uuid, self.desc_uuid, self.stripe_cnt,
1388 self.stripe_sz, self.stripe_off, self.pattern)
1389 for (osc, index, gen, active) in self.osclist:
1390 target_uuid = osc.target_uuid
1392 # Only ignore connect failures with --force, which
1393 # isn't implemented here yet.
1395 osc.prepare(ignore_connect_failure=0)
1396 except CommandError, e:
1397 print "Error preparing OSC %s\n" % osc.uuid
1399 lctl.lov_add_obd(self.name, self.uuid, target_uuid, index, gen)
1402 for (osc, index, gen, active) in self.osclist:
1403 target_uuid = osc.target_uuid
1405 if is_prepared(self.name):
1406 Module.cleanup(self)
1407 if self.config_only:
1408 panic("Can't clean up config_only LOV ", self.name)
1410 def load_module(self):
1411 if self.config_only:
1412 panic("Can't load modules for config_only LOV ", self.name)
1413 for (osc, index, gen, active) in self.osclist:
1416 Module.load_module(self)
1418 def cleanup_module(self):
1419 if self.config_only:
1420 panic("Can't cleanup modules for config_only LOV ", self.name)
1421 Module.cleanup_module(self)
1422 for (osc, index, gen, active) in self.osclist:
1424 osc.cleanup_module()
1427 def correct_level(self, level, op=None):
1431 def __init__(self, db, uuid, fs_name, name_override = None):
1432 Module.__init__(self, 'LMV', db)
1433 if name_override != None:
1434 self.name = "lmv_%s" % name_override
1435 self.add_lustre_module('lmv', 'lmv')
1436 self.devlist = self.db.get_refs('mds')
1438 self.desc_uuid = self.uuid
1440 self.fs_name = fs_name
1441 for mds_uuid in self.devlist:
1442 mds = self.db.lookup(mds_uuid)
1444 panic("MDS not found!")
1445 mdc = MDC(mds, self.uuid, fs_name)
1447 self.mdclist.append(mdc)
1449 panic('mdc not found:', mds_uuid)
1452 if is_prepared(self.name):
1454 for mdc in self.mdclist:
1456 # Only ignore connect failures with --force, which
1457 # isn't implemented here yet.
1458 mdc.prepare(ignore_connect_failure=0)
1459 except CommandError, e:
1460 print "Error preparing LMV %s\n" % mdc.uuid
1462 lctl.lmv_setup(self.name, self.uuid, self.desc_uuid,
1463 string.join(self.devlist))
1466 for mdc in self.mdclist:
1468 if is_prepared(self.name):
1469 Module.cleanup(self)
1471 def load_module(self):
1472 for mdc in self.mdclist:
1475 Module.load_module(self)
1477 def cleanup_module(self):
1478 Module.cleanup_module(self)
1479 for mdc in self.mdclist:
1480 mdc.cleanup_module()
1483 def correct_level(self, level, op=None):
1486 class MDSDEV(Module):
1487 def __init__(self,db):
1488 Module.__init__(self, 'MDSDEV', db)
1489 self.devpath = self.db.get_val('devpath','')
1490 self.backdevpath = self.db.get_val('backdevpath','')
1491 self.size = self.db.get_val_int('devsize', 0)
1492 self.journal_size = self.db.get_val_int('journalsize', 0)
1493 self.fstype = self.db.get_val('fstype', '')
1494 self.backfstype = self.db.get_val('backfstype', '')
1495 self.nspath = self.db.get_val('nspath', '')
1496 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1497 self.mountfsoptions = self.db.get_val('mountfsoptions', '')
1498 self.cachetype = self.db.get_val('cachetype', '')
1499 # overwrite the original MDSDEV name and uuid with the MDS name and uuid
1500 target_uuid = self.db.get_first_ref('target')
1501 mds = self.db.lookup(target_uuid)
1502 self.name = mds.getName()
1503 self.filesystem_uuids = mds.get_refs('filesystem')
1506 self.master_mds = ""
1507 if not self.filesystem_uuids:
1508 self.lmv_uuid = self.db.get_first_ref('lmv')
1509 if not self.lmv_uuid:
1510 panic("ALERT: can't find lvm uuid")
1512 self.lmv = self.db.lookup(self.lmv_uuid)
1514 self.filesystem_uuids = self.lmv.get_refs('filesystem')
1515 self.master_mds = self.lmv_uuid
1516 # FIXME: if fstype not set, then determine based on kernel version
1517 self.format = self.db.get_val('autoformat', "no")
1518 if mds.get_val('failover', 0):
1519 self.failover_mds = 'f'
1521 self.failover_mds = 'n'
1522 active_uuid = get_active_target(mds)
1524 panic("No target device found:", target_uuid)
1525 if active_uuid == self.uuid:
1529 if self.active and config.group and config.group != mds.get_val('group'):
1532 self.inode_size = self.db.get_val_int('inodesize', 0)
1533 if self.inode_size == 0:
1534 # find the LOV for this MDS
1535 lovconfig_uuid = mds.get_first_ref('lovconfig')
1536 if not lovconfig_uuid:
1537 if not self.lmv_uuid:
1538 panic("No LOV found for lovconfig ", lovconfig.name)
1541 panic("No LMV initialized and not lovconfig_uuid found")
1543 lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
1544 lovconfig = self.lmv.lookup(lovconfig_uuid)
1545 lov_uuid = lovconfig.get_first_ref('lov')
1547 panic("No LOV found for lovconfig ", lovconfig.name)
1549 lovconfig = mds.lookup(lovconfig_uuid)
1550 lov_uuid = lovconfig.get_first_ref('lov')
1552 panic("No LOV found for lovconfig ", lovconfig.name)
1555 lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
1556 lovconfig = self.lmv.lookup(lovconfig_uuid)
1557 lov_uuid = lovconfig.get_first_ref('lov')
1559 lov = LOV(self.db.lookup(lov_uuid), lov_uuid, 'FS_name', config_only = 1)
1561 # default stripe count controls default inode_size
1562 stripe_count = lov.stripe_cnt
1563 if stripe_count > 77:
1564 self.inode_size = 4096
1565 elif stripe_count > 35:
1566 self.inode_size = 2048
1567 elif stripe_count > 13:
1568 self.inode_size = 1024
1569 elif stripe_count > 3:
1570 self.inode_size = 512
1572 self.inode_size = 256
1574 self.target_dev_uuid = self.uuid
1575 self.uuid = target_uuid
1578 client_uuid = generate_client_uuid(self.name)
1579 client_uuid = self.name + "_lmv_" + "UUID"
1580 self.master = LMV(self.db.lookup(self.lmv_uuid), client_uuid, self.name, self.name)
1581 self.master_mds = self.master.name
1584 self.add_lustre_module('mdc', 'mdc')
1585 self.add_lustre_module('osc', 'osc')
1586 self.add_lustre_module('lov', 'lov')
1587 self.add_lustre_module('lmv', 'lmv')
1588 self.add_lustre_module('ost', 'ost')
1589 self.add_lustre_module('mds', 'mds')
1591 if self.fstype == 'smfs':
1592 self.add_lustre_module('smfs', 'smfs')
1594 if self.fstype == 'ldiskfs':
1595 self.add_lustre_module('ldiskfs', 'ldiskfs')
1598 self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
1600 # if fstype is smfs, then we should also take care about backing
1602 if self.fstype == 'smfs':
1603 self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.backfstype))
1605 for options in string.split(self.mountfsoptions, ','):
1606 if options == 'snap':
1607 if not self.fstype == 'smfs':
1608 panic("mountoptions with snap, but fstype is not smfs\n")
1609 self.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.fstype))
1610 self.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.backfstype))
1611 def load_module(self):
1613 Module.load_module(self)
1616 if not config.record and is_prepared(self.name):
1619 debug(self.uuid, "not active")
1622 # run write_conf automatically, if --reformat used
1624 self.info(self.devpath, self.fstype, self.size, self.format)
1628 self.master.prepare()
1629 # never reformat here
1630 blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
1631 self.format, self.journal_size, self.inode_size,
1632 self.mkfsoptions, self.backfstype, self.backdevpath)
1634 if not is_prepared('MDT'):
1635 lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
1637 mountfsoptions = def_mount_options(self.fstype, 'mds')
1639 if config.mountfsoptions:
1641 mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
1643 mountfsoptions = config.mountfsoptions
1644 if self.mountfsoptions:
1645 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1647 if self.mountfsoptions:
1649 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1651 mountfsoptions = self.mountfsoptions
1653 if self.fstype == 'smfs':
1654 realdev = self.fstype
1657 mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
1661 mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
1666 print 'MDS mount options: ' + mountfsoptions
1668 if not self.master_mds:
1669 self.master_mds = 'dumb'
1670 if not self.cachetype:
1671 self.cachetype = 'dumb'
1672 lctl.newdev("mds", self.name, self.uuid,
1673 setup ="%s %s %s %s %s %s" %(realdev, self.fstype,
1674 self.name, mountfsoptions,
1675 self.master_mds, self.cachetype))
1676 except CommandError, e:
1678 panic("MDS is missing the config log. Need to run " +
1679 "lconf --write_conf.")
1683 def write_conf(self):
1685 if not is_prepared(self.name):
1686 self.info(self.devpath, self.fstype, self.format)
1688 blkdev = block_dev(self.devpath, self.size, self.fstype,
1689 config.reformat, self.format, self.journal_size,
1690 self.inode_size, self.mkfsoptions,
1691 self.backfstype, self.backdevpath)
1693 # Even when only writing logs we mount the MDS with the supplied mount
1694 # options, because smfs (if used) will not mount otherwise.
1696 mountfsoptions = def_mount_options(self.fstype, 'mds')
1698 if config.mountfsoptions:
1700 mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
1702 mountfsoptions = config.mountfsoptions
1703 if self.mountfsoptions:
1704 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1706 if self.mountfsoptions:
1708 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1710 mountfsoptions = self.mountfsoptions
1712 if self.fstype == 'smfs':
1713 realdev = self.fstype
1716 mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
1720 mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
1725 print 'MDS mount options: ' + mountfsoptions
1727 # Mount options are passed as the 4th parameter to the config tool, so we
1728 # need to pass something as the 3rd parameter. We do not want that 3rd
1729 # parameter to be treated as a profile name for reading the log during MDS
1730 # setup, so we pass a predefined marker, 'dumb', which the MDS code
1731 # recognizes and skips. A cleaner approach would be to pass an empty
1732 # string and have the config tool translate it to null.
1734 lctl.newdev("mds", self.name, self.uuid,
1735 setup ="%s %s %s %s" %(realdev, self.fstype,
1736 'dumb', mountfsoptions))
1739 # record logs for the MDS lov
1740 for uuid in self.filesystem_uuids:
1741 log("recording clients for filesystem:", uuid)
1742 fs = self.db.lookup(uuid)
1744 # this is ugly; it should be organized more cleanly later.
1745 target_uuid = self.db.get_first_ref('target')
1746 mds = self.db.lookup(target_uuid)
1748 lovconfig_uuid = mds.get_first_ref('lovconfig')
1750 lovconfig = mds.lookup(lovconfig_uuid)
1751 obd_uuid = lovconfig.get_first_ref('lov')
1753 obd_uuid = fs.get_first_ref('obd')
1755 client_uuid = generate_client_uuid(self.name)
1756 client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
1759 lctl.clear_log(self.name, self.name)
1760 lctl.record(self.name, self.name)
1762 lctl.mount_option(self.name, client.get_name(), "")
1764 process_updates(self.db, self.name, self.name, client)
1767 lctl.clear_log(self.name, self.name + '-clean')
1768 lctl.record(self.name, self.name + '-clean')
1770 lctl.del_mount_option(self.name)
1772 process_updates(self.db, self.name, self.name + '-clean', client)
1776 # record logs for each client
1782 config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
1784 config_options = CONFIG_FILE
1786 for node_db in self.db.lookup_class('node'):
1787 client_name = node_db.getName()
1788 for prof_uuid in node_db.get_refs('profile'):
1789 prof_db = node_db.lookup(prof_uuid)
1790 # refactor this into a function to test "clientness"
1792 for ref_class, ref_uuid in prof_db.get_all_refs():
1793 if ref_class in ('mountpoint','echoclient'):
1794 debug("recording", client_name)
1795 old_noexec = config.noexec
1797 ret, out = run (sys.argv[0], noexec_opt,
1798 " -v --record --nomod",
1799 "--record_log", client_name,
1800 "--record_device", self.name,
1801 "--node", client_name,
1804 for s in out: log("record> ", string.strip(s))
1805 ret, out = run (sys.argv[0], noexec_opt,
1806 "--cleanup -v --record --nomod",
1807 "--record_log", client_name + "-clean",
1808 "--record_device", self.name,
1809 "--node", client_name,
1812 for s in out: log("record> ", string.strip(s))
1813 config.noexec = old_noexec
1816 lctl.cleanup(self.name, self.uuid, 0, 0)
1817 except CommandError, e:
1818 log(self.module_name, "cleanup failed: ", self.name)
1821 Module.cleanup(self)
1823 if self.fstype == 'smfs':
1824 clean_loop(self.backdevpath)
1826 clean_loop(self.devpath)
1828 def msd_remaining(self):
1829 out = lctl.device_list()
1831 if string.split(s)[2] in ('mds',):
1834 def safe_to_clean(self):
1837 def safe_to_clean_modules(self):
1838 return not self.msd_remaining()
1842 debug(self.uuid, "not active")
1845 if is_prepared(self.name):
1847 lctl.cleanup(self.name, self.uuid, config.force,
1849 except CommandError, e:
1850 log(self.module_name, "cleanup failed: ", self.name)
1853 Module.cleanup(self)
1856 self.master.cleanup()
1857 if not self.msd_remaining() and is_prepared('MDT'):
1859 lctl.cleanup("MDT", "MDT_UUID", config.force,
1861 except CommandError, e:
1862 print "cleanup failed: ", self.name
1866 if self.fstype == 'smfs':
1867 clean_loop(self.backdevpath)
1869 clean_loop(self.devpath)
1871 def correct_level(self, level, op=None):
1872 #if self.master_mds:
1877 def __init__(self, db):
1878 Module.__init__(self, 'OSD', db)
1879 self.osdtype = self.db.get_val('osdtype')
1880 self.devpath = self.db.get_val('devpath', '')
1881 self.backdevpath = self.db.get_val('backdevpath', '')
1882 self.size = self.db.get_val_int('devsize', 0)
1883 self.journal_size = self.db.get_val_int('journalsize', 0)
1884 self.inode_size = self.db.get_val_int('inodesize', 0)
1885 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1886 self.mountfsoptions = self.db.get_val('mountfsoptions', '')
1887 self.fstype = self.db.get_val('fstype', '')
1888 self.backfstype = self.db.get_val('backfstype', '')
1889 self.nspath = self.db.get_val('nspath', '')
1890 target_uuid = self.db.get_first_ref('target')
1891 ost = self.db.lookup(target_uuid)
1892 self.name = ost.getName()
1893 self.format = self.db.get_val('autoformat', 'yes')
1894 if ost.get_val('failover', 0):
1895 self.failover_ost = 'f'
1897 self.failover_ost = 'n'
1899 active_uuid = get_active_target(ost)
1901 panic("No target device found:", target_uuid)
1902 if active_uuid == self.uuid:
1906 if self.active and config.group and config.group != ost.get_val('group'):
1909 self.target_dev_uuid = self.uuid
1910 self.uuid = target_uuid
1912 self.add_lustre_module('ost', 'ost')
1913 if self.fstype == 'smfs':
1914 self.add_lustre_module('smfs', 'smfs')
1915 # FIXME: should we default to ext3 here?
1916 if self.fstype == 'ldiskfs':
1917 self.add_lustre_module('ldiskfs', 'ldiskfs')
1919 self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
1920 if self.fstype == 'smfs':
1921 self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.backfstype))
1923 for options in string.split(self.mountfsoptions, ','):
1924 if options == 'snap':
1925 if not self.fstype == 'smfs':
1926 panic("mountoptions with snap, but fstype is not smfs\n")
1927 self.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.fstype))
1928 self.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.backfstype))
1930 self.add_lustre_module(self.osdtype, self.osdtype)
1932 def load_module(self):
1934 Module.load_module(self)
1936 # need to check /proc/mounts and /etc/mtab before
1937 # formatting anything.
1938 # FIXME: check if device is already formatted.
1940 if is_prepared(self.name):
1943 debug(self.uuid, "not active")
1945 self.info(self.osdtype, self.devpath, self.size, self.fstype,
1946 self.format, self.journal_size, self.inode_size)
1948 if self.osdtype == 'obdecho':
1951 blkdev = block_dev(self.devpath, self.size, self.fstype,
1952 config.reformat, self.format, self.journal_size,
1953 self.inode_size, self.mkfsoptions, self.backfstype,
1956 mountfsoptions = def_mount_options(self.fstype, 'ost')
1958 if config.mountfsoptions:
1960 mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
1962 mountfsoptions = config.mountfsoptions
1963 if self.mountfsoptions:
1964 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1966 if self.mountfsoptions:
1968 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1970 mountfsoptions = self.mountfsoptions
1972 if self.fstype == 'smfs':
1973 realdev = self.fstype
1976 mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
1980 mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
1985 print 'OSD mount options: ' + mountfsoptions
1987 lctl.newdev(self.osdtype, self.name, self.uuid,
1988 setup ="%s %s %s %s" %(realdev, self.fstype,
1991 if not is_prepared('OSS'):
1992 lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
1994 def osd_remaining(self):
1995 out = lctl.device_list()
1997 if string.split(s)[2] in ('obdfilter', 'obdecho'):
2000 def safe_to_clean(self):
2003 def safe_to_clean_modules(self):
2004 return not self.osd_remaining()
2008 debug(self.uuid, "not active")
2010 if is_prepared(self.name):
2013 lctl.cleanup(self.name, self.uuid, config.force,
2015 except CommandError, e:
2016 log(self.module_name, "cleanup failed: ", self.name)
2019 if not self.osd_remaining() and is_prepared('OSS'):
2021 lctl.cleanup("OSS", "OSS_UUID", config.force,
2023 except CommandError, e:
2024 print "cleanup failed: ", self.name
2027 if not self.osdtype == 'obdecho':
2028 if self.fstype == 'smfs':
2029 clean_loop(self.backdevpath)
2031 clean_loop(self.devpath)
2033 def correct_level(self, level, op=None):
2036 def mgmt_uuid_for_fs(mtpt_name):
2039 mtpt_db = toplustreDB.lookup_name(mtpt_name)
2040 fs_uuid = mtpt_db.get_first_ref('filesystem')
2041 fs = toplustreDB.lookup(fs_uuid)
2044 return fs.get_first_ref('mgmt')
2046 # Generic client module, used by OSC and MDC
2047 class Client(Module):
2048 def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
2050 self.target_name = tgtdb.getName()
2051 self.target_uuid = tgtdb.getUUID()
2055 self.tgt_dev_uuid = get_active_target(tgtdb)
2056 if not self.tgt_dev_uuid:
2057 panic("No target device found for target(1):", self.target_name)
2059 self.kmod = kmod(config.lustre, config.portals)
2063 self.module = module
2064 self.module_name = string.upper(module)
2066 self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
2067 self.target_name, fs_name)
2069 self.name = self_name
2071 self.lookup_server(self.tgt_dev_uuid)
2072 mgmt_uuid = mgmt_uuid_for_fs(fs_name)
2074 self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
2077 self.fs_name = fs_name
2080 self.add_lustre_module(module_dir, module)
2082 def lookup_server(self, srv_uuid):
2083 """ Lookup a server's network information """
2084 self._server_nets = get_ost_net(self.db, srv_uuid)
2085 if len(self._server_nets) == 0:
2086 panic ("Unable to find a server for:", srv_uuid)
2089 def get_servers(self):
2090 return self._server_nets
2092 def prepare(self, ignore_connect_failure = 0):
2093 self.info(self.target_uuid)
2094 if not config.record and is_prepared(self.name):
2097 srv = choose_local_server(self.get_servers())
2101 routes = find_route(self.get_servers())
2102 if len(routes) == 0:
2103 panic ("no route to", self.target_uuid)
2104 for (srv, r) in routes:
2105 lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
2106 except CommandError, e:
2107 if not ignore_connect_failure:
2110 if self.permits_inactive() and (self.target_uuid in config.inactive or self.active == 0):
2111 debug("%s inactive" % self.target_uuid)
2112 inactive_p = "inactive"
2114 debug("%s active" % self.target_uuid)
2116 lctl.newdev(self.module, self.name, self.uuid,
2117 setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
2118 inactive_p, self.mgmt_name))
2121 if is_prepared(self.name):
2122 Module.cleanup(self)
2124 srv = choose_local_server(self.get_servers())
2126 lctl.disconnect(srv)
2128 for (srv, r) in find_route(self.get_servers()):
2129 lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
2130 except CommandError, e:
2131 log(self.module_name, "cleanup failed: ", self.name)
2135 def correct_level(self, level, op=None):
2138 def deactivate(self):
2140 lctl.deactivate(self.name)
2141 except CommandError, e:
2142 log(self.module_name, "deactivate failed: ", self.name)
2147 def __init__(self, db, uuid, fs_name):
2148 Client.__init__(self, db, uuid, 'mdc', fs_name)
2150 def permits_inactive(self):
2154 def __init__(self, db, uuid, fs_name):
2155 Client.__init__(self, db, uuid, 'osc', fs_name)
2157 def permits_inactive(self):
2160 def mgmtcli_name_for_uuid(uuid):
2161 return 'MGMTCLI_%s' % uuid
2163 class ManagementClient(Client):
2164 def __init__(self, db, uuid):
2165 Client.__init__(self, db, uuid, 'mgmt_cli', '',
2166 self_name = mgmtcli_name_for_uuid(db.getUUID()),
2167 module_dir = 'mgmt')
2169 def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
2170 Module.__init__(self, 'VLOV', db)
2171 if name_override != None:
2172 self.name = "lov_%s" % name_override
2173 self.add_lustre_module('lov', 'lov')
2174 self.stripe_sz = 65536
2178 self.desc_uuid = self.uuid
2179 self.uuid = generate_client_uuid(self.name)
2180 self.fs_name = fs_name
2181 self.osc = get_osc(db, self.uuid, fs_name)
2183 panic('osc not found:', self.uuid)
2185 self.config_only = 1
2187 self.config_only = None
2193 if not config.record and is_prepared(self.name):
2195 lctl.lov_setup(self.name, self.uuid, self.desc_uuid, self.stripe_cnt,
2196 self.stripe_sz, self.stripe_off, self.pattern)
2197 target_uuid = self.osc.target_uuid
2200 self.osc.prepare(ignore_connect_failure=0)
2201 except CommandError, e:
2202 print "Error preparing OSC %s\n" % osc.uuid
2204 lctl.lov_add_obd(self.name, self.uuid, target_uuid, 0, 1)
2207 target_uuid = self.osc.target_uuid
2209 if is_prepared(self.name):
2210 Module.cleanup(self)
2211 if self.config_only:
2212 panic("Can't clean up config_only LOV ", self.name)
2214 def load_module(self):
2215 if self.config_only:
2216 panic("Can't load modules for config_only LOV ", self.name)
2217 self.osc.load_module()
2218 Module.load_module(self)
2220 def cleanup_module(self):
2221 if self.config_only:
2222 panic("Can't cleanup modules for config_only LOV ", self.name)
2223 Module.cleanup_module(self)
2224 self.osc.cleanup_module()
2226 def correct_level(self, level, op=None):
2229 class CMOBD(Module):
2230 def __init__(self,db):
2231 Module.__init__(self, 'CMOBD', db)
2232 self.name = self.db.getName();
2233 self.uuid = generate_client_uuid(self.name)
2234 self.master_uuid = self.db.get_first_ref('masterobd')
2235 self.cache_uuid = self.db.get_first_ref('cacheobd')
2236 self.add_lustre_module('cmobd', 'cmobd')
2237 master_obd = self.db.lookup(self.master_uuid)
2239 panic('master obd not found:', self.master_uuid)
2240 cache_obd = self.db.lookup(self.cache_uuid)
2242 panic('cache obd not found:', self.cache_uuid)
2244 if master_obd.get_class() == 'ost':
2245 self.client_uuid = generate_client_uuid(self.name)
2246 self.master= VLOV(master_obd, self.client_uuid, self.name,
2247 "%s_master" % (self.name))
2248 self.master_uuid = self.master.get_uuid()
2250 self.master = get_mdc(db, self.name, self.master_uuid)
2251 # need to check /proc/mounts and /etc/mtab before
2252 # formatting anything.
2253 # FIXME: check if device is already formatted.
2255 self.master.prepare()
2256 if not config.record and is_prepared(self.name):
2258 self.info(self.master_uuid, self.cache_uuid)
2259 lctl.newdev("cmobd", self.name, self.uuid,
2260 setup ="%s %s" %(self.master_uuid,
2264 if is_prepared(self.name):
2265 Module.cleanup(self)
2266 self.master.cleanup()
2268 def load_module(self):
2269 self.master.load_module()
2270 Module.load_module(self)
2272 def cleanup_module(self):
2273 Module.cleanup_module(self)
2274 self.master.cleanup_module()
2276 def correct_level(self, level, op=None):
2280 def __init__(self, db, uuid, name, type, name_override = None):
2281 Module.__init__(self, 'COBD', db)
2282 self.name = self.db.getName();
2283 self.uuid = generate_client_uuid(self.name)
2284 self.real_uuid = self.db.get_first_ref('realobd')
2285 self.cache_uuid = self.db.get_first_ref('cacheobd')
2286 self.add_lustre_module('cobd', 'cobd')
2287 real_obd = self.db.lookup(self.real_uuid)
2289 panic('real obd not found:', self.real_uuid)
2290 cache_obd = self.db.lookup(self.cache_uuid)
2292 panic('cache obd not found:', self.cache_uuid)
2294 self.real = LOV(real_obd, self.real_uuid, name,
2295 "%s_real" % (self.name));
2296 self.cache = LOV(cache_obd, self.cache_uuid, name,
2297 "%s_cache" % (self.name));
2299 self.real = get_mdc(db, name, self.real_uuid)
2300 self.cache = get_mdc(db, name, self.cache_uuid)
2301 # need to check /proc/mounts and /etc/mtab before
2302 # formatting anything.
2303 # FIXME: check if device is already formatted.
2308 def get_real_name(self):
2309 return self.real.name
2310 def get_cache_name(self):
2311 return self.cache.name
2314 self.cache.prepare()
2315 if not config.record and is_prepared(self.name):
2317 self.info(self.real_uuid, self.cache_uuid)
2318 lctl.newdev("cobd", self.name, self.uuid,
2319 setup ="%s %s" %(self.real.name,
2323 if is_prepared(self.name):
2324 Module.cleanup(self)
2326 self.cache.cleanup()
2328 def load_module(self):
2329 self.real.load_module()
2330 Module.load_module(self)
2332 def cleanup_module(self):
2333 Module.cleanup_module(self)
2334 self.real.cleanup_module()
2336 # virtual interface for OSC and LOV
2338 def __init__(self, db, client_uuid, name, name_override = None):
2339 Module.__init__(self, 'VOSC', db)
2340 if db.get_class() == 'lov':
2341 self.osc = LOV(db, client_uuid, name, name_override)
2343 elif db.get_class() == 'cobd':
2344 self.osc = COBD(db, client_uuid, name, 'obd')
2347 self.osc = OSC(db, client_uuid, name)
2350 return self.osc.get_uuid()
2352 return self.osc.get_name()
2357 def load_module(self):
2358 self.osc.load_module()
2359 def cleanup_module(self):
2360 self.osc.cleanup_module()
2361 def correct_level(self, level, op=None):
2362 return self.osc.correct_level(level, op)
2364 # virtual interface for MDC and LMV
2366 def __init__(self, db, client_uuid, name, name_override = None):
2367 Module.__init__(self, 'VMDC', db)
2368 if db.get_class() == 'lmv':
2369 self.mdc = LMV(db, client_uuid, name)
2370 elif db.get_class() == 'cobd':
2371 self.mdc = COBD(db, client_uuid, name, 'mds')
2373 self.mdc = MDC(db, client_uuid, name)
2375 return self.mdc.uuid
2377 return self.mdc.name
2382 def load_module(self):
2383 self.mdc.load_module()
2384 def cleanup_module(self):
2385 self.mdc.cleanup_module()
2386 def correct_level(self, level, op=None):
2387 return self.mdc.correct_level(level, op)
2389 class ECHO_CLIENT(Module):
2390 def __init__(self,db):
2391 Module.__init__(self, 'ECHO_CLIENT', db)
2392 self.add_lustre_module('obdecho', 'obdecho')
2393 self.obd_uuid = self.db.get_first_ref('obd')
2394 obd = self.db.lookup(self.obd_uuid)
2395 self.uuid = generate_client_uuid(self.name)
2396 self.osc = VOSC(obd, self.uuid, self.name)
2399 if not config.record and is_prepared(self.name):
2402 self.osc.prepare() # XXX This is so cheating. -p
2403 self.info(self.obd_uuid)
2405 lctl.newdev("echo_client", self.name, self.uuid,
2406 setup = self.osc.get_name())
2409 if is_prepared(self.name):
2410 Module.cleanup(self)
2413 def load_module(self):
2414 self.osc.load_module()
2415 Module.load_module(self)
2417 def cleanup_module(self):
2418 Module.cleanup_module(self)
2419 self.osc.cleanup_module()
2421 def correct_level(self, level, op=None):
2424 def generate_client_uuid(name):
2425 client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
2427 int(random.random() * 1048576),
2428 int(random.random() * 1048576))
2429 return client_uuid[:36]
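# Produces a pseudo-random uuid such as '3a7f1_client-hostname_0b2c49d1e7'
# (example only), truncated to 36 characters; the random components keep
# repeated client setups from colliding.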
2431 class Mountpoint(Module):
2432 def __init__(self,db):
2433 Module.__init__(self, 'MTPT', db)
2434 self.path = self.db.get_val('path')
2435 self.clientoptions = self.db.get_val('clientoptions', '')
2436 self.fs_uuid = self.db.get_first_ref('filesystem')
2437 fs = self.db.lookup(self.fs_uuid)
2438 self.mds_uuid = fs.get_first_ref('lmv')
2439 if not self.mds_uuid:
2440 self.mds_uuid = fs.get_first_ref('mds')
2441 self.obd_uuid = fs.get_first_ref('obd')
2442 self.mgmt_uuid = fs.get_first_ref('mgmt')
2443 client_uuid = generate_client_uuid(self.name)
2445 ost = self.db.lookup(self.obd_uuid)
2447 panic("no ost: ", self.obd_uuid)
2449 mds = self.db.lookup(self.mds_uuid)
2451 panic("no mds: ", self.mds_uuid)
2453 self.add_lustre_module('mdc', 'mdc')
2454 self.add_lustre_module('lmv', 'lmv')
2455 self.add_lustre_module('llite', 'llite')
2457 self.vosc = VOSC(ost, client_uuid, self.name)
2458 self.vmdc = VMDC(mds, client_uuid, self.name)
2461 self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
2467 if not config.record and fs_is_mounted(self.path):
2468 log(self.path, "already mounted.")
2472 self.mgmtcli.prepare()
2475 vmdc_name = self.vmdc.get_name()
2477 self.info(self.path, self.mds_uuid, self.obd_uuid)
2478 if config.record or config.lctl_dump:
2479 lctl.mount_option(local_node_name, self.vosc.get_name(), vmdc_name)
2482 if config.clientoptions:
2483 if self.clientoptions:
2484 self.clientoptions = self.clientoptions + ',' + \
2485 config.clientoptions
2487 self.clientoptions = config.clientoptions
2488 if self.clientoptions:
2489 self.clientoptions = ',' + self.clientoptions
2490 # Linux kernel will deal with async and not pass it to ll_fill_super,
2491 # so replace it with Lustre async
2492 self.clientoptions = string.replace(self.clientoptions, "async",
2495 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s%s %s %s" % \
2496 (self.vosc.get_name(), vmdc_name, self.clientoptions,
2497 config.config, self.path)
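# For illustration only (hypothetical names): with a VOSC named "lov_fs1", a
# VMDC named "MDC_client1", client options ",flock", --config "fs1" and a
# mountpoint of /mnt/lustre, the command built above comes out as:
#
#   mount -t lustre_lite -o osc=lov_fs1,mdc=MDC_client1,flock fs1 /mnt/lustre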
2498 run("mkdir", self.path)
2503 panic("mount failed:", self.path, ":", string.join(val))
2506 self.info(self.path, self.mds_uuid,self.obd_uuid)
2508 if config.record or config.lctl_dump:
2509 lctl.del_mount_option(local_node_name)
2511 if fs_is_mounted(self.path):
2513 (rc, out) = run("umount", "-f", self.path)
2515 (rc, out) = run("umount", self.path)
2517 raise CommandError('umount', out, rc)
2519 if fs_is_mounted(self.path):
2520 panic("fs is still mounted:", self.path)
2525 self.mgmtcli.cleanup()
2527 def load_module(self):
2529 self.mgmtcli.load_module()
2530 self.vosc.load_module()
2531 Module.load_module(self)
2533 def cleanup_module(self):
2534 Module.cleanup_module(self)
2535 self.vosc.cleanup_module()
2537 self.mgmtcli.cleanup_module()
2539 def correct_level(self, level, op=None):
2542 # ============================================================
2543 # misc query functions
2545 def get_ost_net(self, osd_uuid):
2549 osd = self.lookup(osd_uuid)
2550 node_uuid = osd.get_first_ref('node')
2551 node = self.lookup(node_uuid)
2553 panic("unable to find node for osd_uuid:", osd_uuid,
2554 " node_ref:", node_uuid_)
2555 for net_uuid in node.get_networks():
2556 db = node.lookup(net_uuid)
2557 srv_list.append(Network(db))
2561 # the order of initialization is based on level.
2562 def getServiceLevel(self):
2563 type = self.get_class()
2565 if type in ('network',):
2567 elif type in ('routetbl',):
2569 elif type in ('ldlm',):
2571 elif type in ('mgmt',):
2573 elif type in ('osd', 'cobd'):
2575 elif type in ('mdsdev',):
2577 elif type in ('lmv',):
2579 elif type in ('cmobd',):
2581 elif type in ('mountpoint', 'echoclient'):
2584 panic("Unknown type: ", type)
2586 if ret < config.minlevel or ret > config.maxlevel:
2591 # return list of services in a profile. list is a list of tuples
2592 # [(level, db_object),]
2593 def getServices(self):
2595 for ref_class, ref_uuid in self.get_all_refs():
2596 servdb = self.lookup(ref_uuid)
2598 level = getServiceLevel(servdb)
2600 list.append((level, servdb))
2602 panic('service not found: ' + ref_uuid)
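# The (level, db_object) tuples built here drive ordering elsewhere: doSetup()
# instantiates each service, sorts by (corrected) level and prepares them in
# ascending order, while doCleanup() walks the same list in the opposite
# direction.  A rough sketch of that shape (not a separate code path, just the
# pattern doSetup/doCleanup below follow):
#
#     slist = []
#     for level, svcdb in services:
#         n = newService(svcdb)
#         slist.append((n.level, n))
#     slist.sort()       # ascending for setup; cleanup walks it in reverse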
2608 ############################################################
2610 # FIXME: clean this mess up!
2612 # OSC is no longer in the xml, so we have to fake it.
2613 # this is getting ugly and begging for another refactoring
2614 def get_osc(ost_db, uuid, fs_name):
2615 osc = OSC(ost_db, uuid, fs_name)
2618 def get_mdc(db, fs_name, mds_uuid):
2619 mds_db = db.lookup(mds_uuid);
2621 error("no mds:", mds_uuid)
2622 mdc = MDC(mds_db, mds_uuid, fs_name)
2625 ############################################################
2626 # routing ("rooting")
2627 # list of (nettype, cluster_id, nid)
2630 def find_local_clusters(node_db):
2631 global local_clusters
2632 for netuuid in node_db.get_networks():
2633 net = node_db.lookup(netuuid)
2635 debug("add_local", netuuid)
2636 local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
2638 if acceptors.has_key(srv.port):
2639 panic("duplicate port:", srv.port)
2640 acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
2641 srv.send_mem, srv.recv_mem,
2644 # This node is a gateway.
2646 def node_is_router():
2649 # If there are any routers found in the config, then this will be true
2650 # and all nodes will load kptlrouter.
2652 def node_needs_router():
2653 return needs_router or is_router
2655 # list of (nettype, gw, tgt_cluster_id, lo, hi)
2656 # Currently, these local routes are only added to kptlrouter route
2657 # table if they are needed to connect to a specific server. This
2658 # should be changed so all available routes are loaded, and the
2659 # ptlrouter can make all the decisions.
2662 def find_local_routes(lustre):
2663 """ Scan the lustre config looking for routers . Build list of
2665 global local_routes, needs_router
2667 list = lustre.lookup_class('node')
2669 if router.get_val_int('router', 0):
2671 for (local_type, local_cluster_id, local_nid) in local_clusters:
2673 for netuuid in router.get_networks():
2674 db = router.lookup(netuuid)
2675 if (local_type == db.get_val('nettype') and
2676 local_cluster_id == db.get_val('clusterid')):
2677 gw = db.get_val('nid')
2680 debug("find_local_routes: gw is", gw)
2681 for route in router.get_local_routes(local_type, gw):
2682 local_routes.append(route)
2683 debug("find_local_routes:", local_routes)
2686 def choose_local_server(srv_list):
2687 for srv in srv_list:
2688 if local_cluster(srv.net_type, srv.cluster_id):
2691 def local_cluster(net_type, cluster_id):
2692 for cluster in local_clusters:
2693 if net_type == cluster[0] and cluster_id == cluster[1]:
2697 def local_interface(net_type, cluster_id, nid):
2698 for cluster in local_clusters:
2699 if (net_type == cluster[0] and cluster_id == cluster[1]
2700 and nid == cluster[2]):
2704 def find_route(srv_list):
2706 frm_type = local_clusters[0][0]
2707 for srv in srv_list:
2708 debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
2709 to_type = srv.net_type
2711 cluster_id = srv.cluster_id
2712 debug ('looking for route to', to_type, to)
2713 for r in local_routes:
2714 debug("find_route: ", r)
2715 if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
2716 result.append((srv, r))
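# Worked example (all values hypothetical): suppose find_local_routes() left
#     local_routes = [('tcp', '10.0.0.1', 1, 100, 199)]
# meaning the gateway 10.0.0.1, reachable over this node's tcp cluster, covers
# target cluster_id 1, nids 100..199.  For a server with cluster_id 1 and nid
# 150 the test above (r[2] == cluster_id and r[3] <= nid <= r[4]) matches, so
# find_route() returns [(srv, ('tcp', '10.0.0.1', 1, 100, 199))] and the
# caller can add a portals route through that gateway.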
2719 def get_active_target(db):
2720 target_uuid = db.getUUID()
2721 target_name = db.getName()
2722 node_name = get_select(target_name)
2724 tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
2726 tgt_dev_uuid = db.get_first_ref('active')
2729 def get_server_by_nid_uuid(db, nid_uuid):
2730 for n in db.lookup_class("network"):
2732 if net.nid_uuid == nid_uuid:
2736 ############################################################
2740 type = db.get_class()
2741 debug('Service:', type, db.getName(), db.getUUID())
2746 n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
2747 elif type == 'network':
2749 elif type == 'routetbl':
2753 elif type == 'cobd':
2754 n = COBD(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
2755 elif type == 'cmobd':
2757 elif type == 'mdsdev':
2759 elif type == 'mountpoint':
2761 elif type == 'echoclient':
2763 elif type == 'mgmt':
2768 panic ("unknown service type:", type)
2772 # Prepare the system to run lustre using a particular profile
2773 # in the configuration.
2774 # * load the modules
2775 # * setup networking for the current node
2776 # * make sure partitions are in place and prepared
2777 # * initialize devices with lctl
2778 # Levels are important, and need to be enforced.
2779 def for_each_profile(db, prof_list, operation):
2780 for prof_uuid in prof_list:
2781 prof_db = db.lookup(prof_uuid)
2783 panic("profile:", profile, "not found.")
2784 services = getServices(prof_db)
2787 def magic_get_osc(db, rec, lov):
2789 lov_uuid = lov.get_uuid()
2790 lov_name = lov.osc.fs_name
2792 lov_uuid = rec.getAttribute('lov_uuidref')
2793 # FIXME: better way to find the mountpoint?
2794 filesystems = db.root_node.getElementsByTagName('filesystem')
2796 for fs in filesystems:
2797 ref = fs.getElementsByTagName('obd_ref')
2798 if ref[0].getAttribute('uuidref') == lov_uuid:
2799 fsuuid = fs.getAttribute('uuid')
2803 panic("malformed xml: lov uuid '" + lov_uuid + "' referenced in 'add' record is not used by any filesystems.")
2805 mtpts = db.root_node.getElementsByTagName('mountpoint')
2808 ref = fs.getElementsByTagName('filesystem_ref')
2809 if ref[0].getAttribute('uuidref') == fsuuid:
2810 lov_name = fs.getAttribute('name')
2814 panic("malformed xml: 'add' record references lov uuid '" + lov_uuid + "', which references filesystem uuid '" + fsuuid + "', which does not reference a mountpoint.")
2816 print "lov_uuid: " + lov_uuid + "; lov_name: " + lov_name
2818 ost_uuid = rec.getAttribute('ost_uuidref')
2819 obd = db.lookup(ost_uuid)
2822 panic("malformed xml: 'add' record references ost uuid '" + ost_uuid + "' which cannot be found.")
2824 osc = get_osc(obd, lov_uuid, lov_name)
2826 panic('osc not found:', ost_uuid)
2829 # write logs for update records. sadly, logs of all types -- and updates in
2830 # particular -- are something of an afterthought. lconf needs to be rewritten with
2831 # these as core concepts. so this is a pretty big hack.
2832 def process_update_record(db, update, lov):
2833 for rec in update.childNodes:
2834 if rec.nodeType != rec.ELEMENT_NODE:
2837 log("found "+rec.nodeName+" record in update version " +
2838 str(update.getAttribute('version')))
2840 lov_uuid = rec.getAttribute('lov_uuidref')
2841 ost_uuid = rec.getAttribute('ost_uuidref')
2842 index = rec.getAttribute('index')
2843 gen = rec.getAttribute('generation')
2845 if not lov_uuid or not ost_uuid or not index or not gen:
2846 panic("malformed xml: 'update' record requires lov_uuid, ost_uuid, index, and generation.")
2849 tmplov = db.lookup(lov_uuid)
2851 panic("malformed xml: 'delete' record contains lov UUID '" + lov_uuid + "', which cannot be located.")
2852 lov_name = tmplov.getName()
2854 lov_name = lov.osc.name
2856 # ------------------------------------------------------------- add
2857 if rec.nodeName == 'add':
2859 lctl.lov_del_obd(lov_name, lov_uuid, ost_uuid, index, gen)
2862 osc = magic_get_osc(db, rec, lov)
2865 # Only ignore connect failures with --force, which
2866 # isn't implemented here yet.
2867 osc.prepare(ignore_connect_failure=0)
2868 except CommandError, e:
2869 print "Error preparing OSC %s\n" % osc.uuid
2872 lctl.lov_add_obd(lov_name, lov_uuid, ost_uuid, index, gen)
2874 # ------------------------------------------------------ deactivate
2875 elif rec.nodeName == 'deactivate':
2879 osc = magic_get_osc(db, rec, lov)
2883 except CommandError, e:
2884 print "Error deactivating OSC %s\n" % osc.uuid
2887 # ---------------------------------------------------------- delete
2888 elif rec.nodeName == 'delete':
2892 osc = magic_get_osc(db, rec, lov)
2898 except CommandError, e:
2899 print "Error cleaning up OSC %s\n" % osc.uuid
2902 lctl.lov_del_obd(lov_name, lov_uuid, ost_uuid, index, gen)
2904 def process_updates(db, log_device, log_name, lov = None):
2905 updates = db.root_node.getElementsByTagName('update')
2907 if not u.childNodes:
2908 log("ignoring empty update record (version " +
2909 str(u.getAttribute('version')) + ")")
2912 version = u.getAttribute('version')
2913 real_name = "%s-%s" % (log_name, version)
2914 lctl.clear_log(log_device, real_name)
2915 lctl.record(log_device, real_name)
2917 process_update_record(db, u, lov)
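# For reference, the element and attribute names consumed above imply update
# records of roughly this shape (uuids hypothetical):
#
#   <update version="2">
#     <add lov_uuidref="lov1_UUID" ost_uuidref="ost3_UUID" index="3" generation="2"/>
#   </update>
#
# and each non-empty <update> is replayed into its own config log named
# "<log_name>-<version>".  A small, unused helper sketching how such a record
# would be picked apart with the same minidom calls used above:
def _example_parse_add_record(xml_text):
    # Hypothetical helper for illustration only -- not called by lconf.
    dom = xml.dom.minidom.parseString(xml_text)
    rec = dom.getElementsByTagName('add')[0]
    return (rec.getAttribute('lov_uuidref'), rec.getAttribute('ost_uuidref'),
            rec.getAttribute('index'), rec.getAttribute('generation'))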
2921 def doWriteconf(services):
2925 if s[1].get_class() == 'mdsdev':
2926 n = newService(s[1])
2929 def doSetup(services):
2934 n = newService(s[1])
2936 slist.append((n.level, n))
2939 nl = n[1].correct_level(n[0])
2940 nlist.append((nl, n[1]))
2945 def doModules(services):
2949 n = newService(s[1])
2952 def doCleanup(services):
2957 n = newService(s[1])
2959 slist.append((n.level, n))
2962 nl = n[1].correct_level(n[0])
2963 nlist.append((nl, n[1]))
2967 if n[1].safe_to_clean():
2970 def doUnloadModules(services):
2975 n = newService(s[1])
2976 if n.safe_to_clean_modules():
2981 def doHost(lustreDB, hosts):
2982 global is_router, local_node_name
2985 node_db = lustreDB.lookup_name(h, 'node')
2989 panic('No host entry found.')
2991 local_node_name = node_db.get_val('name', 0)
2992 is_router = node_db.get_val_int('router', 0)
2993 lustre_upcall = node_db.get_val('lustreUpcall', '')
2994 portals_upcall = node_db.get_val('portalsUpcall', '')
2995 timeout = node_db.get_val_int('timeout', 0)
2996 ptldebug = node_db.get_val('ptldebug', '')
2997 subsystem = node_db.get_val('subsystem', '')
2999 find_local_clusters(node_db)
3001 find_local_routes(lustreDB)
3003 # Two step process: (1) load modules, (2) setup lustre
3004 # if not cleaning, load modules first.
3005 prof_list = node_db.get_refs('profile')
3007 if config.write_conf:
3009 for_each_profile(node_db, prof_list, doModules)
3011 for_each_profile(node_db, prof_list, doWriteconf)
3012 for_each_profile(node_db, prof_list, doUnloadModules)
3014 elif config.recover:
3015 if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
3016 raise Lustre.LconfError("--recover requires --tgt_uuid <UUID> " +
3017 "--client_uuid <UUID> --conn_uuid <UUID>")
3018 doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
3020 elif config.cleanup:
3022 # the command line can override this value
3024 # ugly hack, only need to run lctl commands for --dump
3025 if config.lctl_dump or config.record:
3026 for_each_profile(node_db, prof_list, doCleanup)
3029 sys_set_timeout(timeout)
3030 sys_set_ptldebug(ptldebug)
3031 sys_set_subsystem(subsystem)
3032 sys_set_lustre_upcall(lustre_upcall)
3033 sys_set_portals_upcall(portals_upcall)
3035 for_each_profile(node_db, prof_list, doCleanup)
3036 for_each_profile(node_db, prof_list, doUnloadModules)
3040 # ugly hack, only need to run lctl commands for --dump
3041 if config.lctl_dump or config.record:
3042 sys_set_timeout(timeout)
3043 sys_set_lustre_upcall(lustre_upcall)
3044 for_each_profile(node_db, prof_list, doSetup)
3048 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
3049 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
3051 for_each_profile(node_db, prof_list, doModules)
3053 sys_set_debug_path()
3054 sys_set_ptldebug(ptldebug)
3055 sys_set_subsystem(subsystem)
3056 script = config.gdb_script
3057 run(lctl.lctl, ' modules >', script)
3059 log ("The GDB module script is in", script)
3060 # pause, so user has time to break and
3063 sys_set_timeout(timeout)
3064 sys_set_lustre_upcall(lustre_upcall)
3065 sys_set_portals_upcall(portals_upcall)
3067 for_each_profile(node_db, prof_list, doSetup)
3070 def doRecovery(lustreDB, lctl, tgt_uuid, client_uuid, nid_uuid):
3071 tgt = lustreDB.lookup(tgt_uuid)
3073 raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
3074 new_uuid = get_active_target(tgt)
3076 raise Lustre.LconfError("doRecovery: no active target found for: " +
3078 net = choose_local_server(get_ost_net(lustreDB, new_uuid))
3080 raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
3082 log("Reconnecting", tgt_uuid, " to ", net.nid_uuid);
3084 oldnet = get_server_by_nid_uuid(lustreDB, nid_uuid)
3087 lctl.disconnect(oldnet)
3088 except CommandError, e:
3089 log("recover: disconnect", nid_uuid, "failed: ")
3094 except CommandError, e:
3095 log("recover: connect failed")
3098 lctl.recover(client_uuid, net.nid_uuid)
3101 def setupModulePath(cmd, portals_dir = PORTALS_DIR):
3102 base = os.path.dirname(cmd)
3103 if development_mode():
3104 if not config.lustre:
3105 debug('using objdir module paths')
3106 config.lustre = (os.path.join(base, ".."))
3107 # normalize the portals dir, using command line arg if set
3109 portals_dir = config.portals
3110 dir = os.path.join(config.lustre, portals_dir)
3111 config.portals = dir
3112 debug('config.portals', config.portals)
3113 elif config.lustre and config.portals:
3115 # if --lustre and --portals, normalize portals
3116 # can ignore PORTALS_DIR here, since it is probably useless here
3117 config.portals = os.path.join(config.lustre, config.portals)
3118 debug('config.portals B', config.portals)
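# Example (hypothetical paths): an installed lconf run with
# --lustre=/usr/src/lustre --portals=portals ends up with
# config.portals == '/usr/src/lustre/portals'.  In development mode with no
# --lustre given, config.lustre defaults to the parent of the lconf binary and
# config.portals to <lustre>/portals unless --portals overrides it.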
3120 def sysctl(path, val):
3121 debug("+ sysctl", path, val)
3125 fp = open(os.path.join('/proc/sys', path), 'w')
3132 def sys_set_debug_path():
3133 sysctl('portals/debug_path', config.debug_path)
3135 def sys_set_lustre_upcall(upcall):
3136 # the command line overrides the value in the node config
3137 if config.lustre_upcall:
3138 upcall = config.lustre_upcall
3140 upcall = config.upcall
3142 lctl.set_lustre_upcall(upcall)
3144 def sys_set_portals_upcall(upcall):
3146 # the command line overrides the value in the node config
3146 if config.portals_upcall:
3147 upcall = config.portals_upcall
3149 upcall = config.upcall
3151 sysctl('portals/upcall', upcall)
3153 def sys_set_timeout(timeout):
3154 # the command line overrides the value in the node config
3155 if config.timeout and config.timeout > 0:
3156 timeout = config.timeout
3157 if timeout != None and timeout > 0:
3158 lctl.set_timeout(timeout)
3160 def sys_tweak_socknal ():
3161 if config.single_socket:
3162 sysctl("socknal/typed", 0)
3164 def sys_optimize_elan ():
3165 procfiles = ["/proc/elan/config/eventint_punt_loops",
3166 "/proc/qsnet/elan3/config/eventint_punt_loops",
3167 "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]
3169 if os.access(p, os.R_OK):
3170 run ("echo 1 > " + p)
3172 def sys_set_ptldebug(ptldebug):
3174 ptldebug = config.ptldebug
3177 val = eval(ptldebug, ptldebug_names)
3178 val = "0x%x" % (val)
3179 sysctl('portals/debug', val)
3180 except NameError, e:
3183 def sys_set_subsystem(subsystem):
3184 if config.subsystem:
3185 subsystem = config.subsystem
3188 val = eval(subsystem, subsystem_names)
3189 val = "0x%x" % (val)
3190 sysctl('portals/subsystem_debug', val)
3191 except NameError, e:
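# Both sys_set_ptldebug() and sys_set_subsystem() accept symbolic expressions:
# the string is eval()ed against the ptldebug_names / subsystem_names
# dictionaries defined near the top of this file, so any expression over those
# keys (e.g. "flag_a | flag_b") reduces to an integer mask, which is then
# written to /proc as a hex string via sysctl().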
3194 def sys_set_netmem_max(path, max):
3195 debug("setting", path, "to at least", max)
3203 fp = open(path, 'w')
3204 fp.write('%d\n' %(max))
3208 def sys_make_devices():
3209 if not os.access('/dev/portals', os.R_OK):
3210 run('mknod /dev/portals c 10 240')
3211 if not os.access('/dev/obd', os.R_OK):
3212 run('mknod /dev/obd c 10 241')
3215 # Add dir to the global PATH, if not already there.
3216 def add_to_path(new_dir):
3217 syspath = string.split(os.environ['PATH'], ':')
3218 if new_dir in syspath:
3220 os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
3222 def default_debug_path():
3223 path = '/tmp/lustre-log'
3224 if os.path.isdir('/r'):
3229 def default_gdb_script():
3230 script = '/tmp/ogdb'
3231 if os.path.isdir('/r'):
3232 return '/r' + script
3237 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
3238 # ensure basic elements are in the system path
3239 def sanitise_path():
3240 for dir in DEFAULT_PATH:
3243 # global hack for the --select handling
3245 def init_select(args):
3246 # args = [service=nodeA,service2=nodeB service3=nodeC]
3249 list = string.split(arg, ',')
3251 srv, node = string.split(entry, '=')
3252 tgt_select[srv] = node
3254 def get_select(srv):
3255 if tgt_select.has_key(srv):
3256 return tgt_select[srv]
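# Example (hypothetical service and node names): given
#     --select mds1=nodeA,ost1=nodeB
# init_select() fills tgt_select so that get_select('mds1') == 'nodeA' and
# get_select('ost1') == 'nodeB'; get_active_target() then uses this to decide
# which node's target device is the active one for failover.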
3260 FLAG = Lustre.Options.FLAG
3261 PARAM = Lustre.Options.PARAM
3262 INTPARAM = Lustre.Options.INTPARAM
3263 PARAMLIST = Lustre.Options.PARAMLIST
3265 ('verbose,v', "Print system commands as they are run"),
3266 ('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM),
3267 ('config', "Cluster config name used for LDAP query", PARAM),
3268 ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
3269 ('node', "Load config for <nodename>", PARAM),
3270 ('cleanup,d', "Cleans up config. (Shutdown)"),
3271 ('force,f', "Forced unmounting and/or obd detach during cleanup",
3273 ('single_socket', "socknal option: only use one socket instead of bundle",
3275 ('failover',"""Used to shut down without saving state.
3276 This will allow this node to "give up" a service to
3277 another node for failover purposes. This will not
3278 be a clean shutdown.""",
3280 ('gdb', """Prints message after creating gdb module script
3281 and sleeps for 5 seconds."""),
3282 ('noexec,n', """Prints the commands and steps that will be run for a
3283 config without executing them. This can be used to check if a
3284 config file is doing what it should be doing"""),
3285 ('nomod', "Skip load/unload module step."),
3286 ('nosetup', "Skip device setup/cleanup step."),
3287 ('reformat', "Reformat all devices (without question)"),
3288 ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
3289 ('mountfsoptions', "Additional options for mount fs command line", PARAM),
3290 ('clientoptions', "Additional mount options for Lustre clients", PARAM),
3291 ('dump', "Dump the kernel debug log to file before portals is unloaded",
3293 ('write_conf', "Save all the client config information on mds."),
3294 ('record', "Write config information on mds."),
3295 ('record_log', "Name of config record log.", PARAM),
3296 ('record_device', "MDS device name that will record the config commands",
3298 ('minlevel', "Minimum level of services to configure/cleanup",
3300 ('maxlevel', """Maximum level of services to configure/cleanup
3301 Levels are approximately like:
3306 70 - mountpoint, echo_client, osc, mdc, lov""",
3308 ('lustre', """Base directory of lustre sources. This parameter will
3309 cause lconf to load modules from a source tree.""", PARAM),
3310 ('portals', """Portals source directory. If this is a relative path,
3311 then it is assumed to be relative to lustre. """, PARAM),
3312 ('timeout', "Set recovery timeout", INTPARAM),
3313 ('upcall', "Set both portals and lustre upcall script", PARAM),
3314 ('lustre_upcall', "Set lustre upcall script", PARAM),
3315 ('portals_upcall', "Set portals upcall script", PARAM),
3316 ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
3317 ('ptldebug', "Set the portals debug level", PARAM),
3318 ('subsystem', "Set the portals debug subsystem", PARAM),
3319 ('gdb_script', "Full name of the gdb debug script", PARAM, default_gdb_script()),
3320 ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
3321 # Client recovery options
3322 ('recover', "Recover a device"),
3323 ('group', "The group of devices to configure or cleanup", PARAM),
3324 ('tgt_uuid', "The failed target (required for recovery)", PARAM),
3325 ('client_uuid', "The failed client (required for recovery)", PARAM),
3326 ('conn_uuid', "The failed connection (required for recovery)", PARAM),
3328 ('inactive', """The name of an inactive service, to be ignored during
3329 mounting (currently OST-only). Can be repeated.""",
3334 global lctl, config, toplustreDB, CONFIG_FILE
3336 # in the upcall this is set to SIG_IGN
3337 signal.signal(signal.SIGCHLD, signal.SIG_DFL)
3339 cl = Lustre.Options("lconf", "config.xml", lconf_options)
3341 config, args = cl.parse(sys.argv[1:])
3342 except Lustre.OptionError, e:
3346 setupModulePath(sys.argv[0])
3348 host = socket.gethostname()
3350 # the PRNG is normally seeded with time(), which is not so good for starting
3351 # time-synchronized clusters
3352 input = open('/dev/urandom', 'r')
3354 print 'Unable to open /dev/urandom!'
3356 seed = input.read(32)
3362 init_select(config.select)
3365 # allow config to be fetched via HTTP, but only with python2
3366 if sys.version[0] != '1' and args[0].startswith('http://'):
3369 config_file = urllib2.urlopen(args[0])
3370 except (urllib2.URLError, socket.error), err:
3371 if hasattr(err, 'args'):
3373 print "Could not access '%s': %s" %(args[0], err)
3375 elif not os.access(args[0], os.R_OK):
3376 print 'File not found or not readable:', args[0]
3380 config_file = open(args[0], 'r')
3382 dom = xml.dom.minidom.parse(config_file)
3384 panic("%s does not appear to be a config file." % (args[0]))
3385 sys.exit(1) # make sure to die here, even in debug mode.
3387 CONFIG_FILE = args[0]
3388 lustreDB = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
3389 if not config.config:
3390 config.config = os.path.basename(args[0]) # use full path?
3391 if config.config[-4:] == '.xml':
3392 config.config = config.config[:-4]
3393 elif config.ldapurl:
3394 if not config.config:
3395 panic("--ldapurl requires --config name")
3396 dn = "config=%s,fs=lustre" % (config.config)
3397 lustreDB = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
3398 elif config.ptldebug or config.subsystem:
3399 sys_set_ptldebug(None)
3400 sys_set_subsystem(None)
3403 print 'Missing config file or ldap URL.'
3404 print 'see lconf --help for command summary'
3407 toplustreDB = lustreDB
3409 ver = lustreDB.get_version()
3411 panic("No version found in config data, please recreate.")
3412 if ver != Lustre.CONFIG_VERSION:
3413 panic("Config version", ver, "does not match lconf version",
3414 Lustre.CONFIG_VERSION)
3418 node_list.append(config.node)
3421 node_list.append(host)
3422 node_list.append('localhost')
3424 debug("configuring for host: ", node_list)
3427 config.debug_path = config.debug_path + '-' + host
3428 config.gdb_script = config.gdb_script + '-' + host
3430 lctl = LCTLInterface('lctl')
3432 if config.lctl_dump:
3433 lctl.use_save_file(config.lctl_dump)
3436 if not (config.record_device and config.record_log):
3437 panic("When recording, both --record_log and --record_device must be specified.")
3438 lctl.clear_log(config.record_device, config.record_log)
3439 lctl.record(config.record_device, config.record_log)
3441 doHost(lustreDB, node_list)
3443 if not config.record:
3448 process_updates(lustreDB, config.record_device, config.record_log)
3450 if __name__ == "__main__":
3453 except Lustre.LconfError, e:
3455 # traceback.print_exc(file=sys.stdout)
3457 except CommandError, e:
3461 if first_cleanup_error:
3462 sys.exit(first_cleanup_error)