3 # Copyright (C) 2002-2003 Cluster File Systems, Inc.
4 # Authors: Robert Read <rread@clusterfs.com>
5 # Mike Shaver <shaver@clusterfs.com>
6 # This file is part of Lustre, http://www.lustre.org.
8 # Lustre is free software; you can redistribute it and/or
9 # modify it under the terms of version 2 of the GNU General Public
10 # License as published by the Free Software Foundation.
12 # Lustre is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Lustre; if not, write to the Free Software
19 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 # lconf - lustre configuration tool
23 # lconf is the main driver script for starting and stopping
24 # lustre filesystem services.
26 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import sys, getopt, types
29 import string, os, stat, popen2, socket, time, random, fcntl, select
30 import re, exceptions, signal, traceback
31 import xml.dom.minidom
33 if sys.version[0] == '1':
34 from FCNTL import F_GETFL, F_SETFL
35 else:
36 from fcntl import F_GETFL, F_SETFL
38 PYMOD_DIR = "/usr/lib/lustre/python"
40 def development_mode():
41 base = os.path.dirname(sys.argv[0])
42 if os.access(base+"/Makefile", os.R_OK):
46 if development_mode():
47 sys.path.append('../utils')
49 sys.path.append(PYMOD_DIR)
55 DEFAULT_TCPBUF = 8388608
58 # Maximum number of devices to search for.
59 # (the /dev/loop* nodes need to be created beforehand)
60 MAX_LOOP_DEVICES = 256
61 PORTALS_DIR = 'portals'
63 # Needed to call lconf --record
66 # Please keep these in sync with the values in portals/kp30.h
78 "warning" : (1 << 10),
82 "portals" : (1 << 14),
84 "dlmtrace" : (1 << 16),
88 "rpctrace" : (1 << 20),
89 "vfstrace" : (1 << 21),
95 "undefined" : (1 << 0),
105 "portals" : (1 << 10),
106 "socknal" : (1 << 11),
107 "qswnal" : (1 << 12),
108 "pinger" : (1 << 13),
109 "filter" : (1 << 14),
115 "ptlrouter" : (1 << 20),
122 first_cleanup_error = 0
123 def cleanup_error(rc):
124 global first_cleanup_error
125 if not first_cleanup_error:
126 first_cleanup_error = rc
128 # ============================================================
129 # debugging and error funcs
131 def fixme(msg = "this feature"):
132 raise Lustre.LconfError, msg + ' not implemented yet.'
135 msg = string.join(map(str,args))
136 if not config.noexec:
137 raise Lustre.LconfError(msg)
142 msg = string.join(map(str,args))
147 print string.strip(s)
151 msg = string.join(map(str,args))
154 # ack, python's builtin int() does not support '0x123' syntax.
155 # eval can do it, although what a hack!
159 return eval(s, {}, {})
162 except SyntaxError, e:
163 raise ValueError("not a number")
165 raise ValueError("not a number")
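# A minimal alternative sketch, assuming a Python where int() accepts
# an explicit base (base 0 auto-detects '0x...' and '0...' prefixes);
# it avoids the eval hack above:
def my_int_noeval(s):
    try:
        return int(s, 0)    # handles '123', '0x123', and '0777'
    except (TypeError, ValueError):
        raise ValueError("not a number")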
167 # ============================================================
168 # locally defined exceptions
169 class CommandError (exceptions.Exception):
170 def __init__(self, cmd_name, cmd_err, rc=None):
171 self.cmd_name = cmd_name
172 self.cmd_err = cmd_err
177 if type(self.cmd_err) == types.StringType:
179 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
181 print "! %s: %s" % (self.cmd_name, self.cmd_err)
182 elif type(self.cmd_err) == types.ListType:
184 print "! %s (error %d):" % (self.cmd_name, self.rc)
186 print "! %s:" % (self.cmd_name)
187 for s in self.cmd_err:
188 print "> %s" %(string.strip(s))
193 # ============================================================
194 # handle daemons, like the acceptor
196 """ Manage starting and stopping a daemon. Assumes daemon manages
197 it's own pid file. """
199 def __init__(self, cmd):
205 log(self.command, "already running.")
207 self.path = find_prog(self.command)
209 panic(self.command, "not found.")
210 ret, out = runcmd(self.path +' '+ self.command_line())
212 raise CommandError(self.path, out, ret)
216 pid = self.read_pidfile()
218 log ("killing process", pid)
220 #time.sleep(1) # let daemon die
222 log("unable to kill", self.command, e)
224 log("unable to kill", self.command)
227 pid = self.read_pidfile()
237 def read_pidfile(self):
239 fp = open(self.pidfile(), 'r')
246 def clean_pidfile(self):
247 """ Remove a stale pidfile """
248 log("removing stale pidfile:", self.pidfile())
250 os.unlink(self.pidfile())
252 log(self.pidfile(), e)
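# Illustrative sketch (not part of lconf proper): a DaemonHandler
# subclass only needs to supply pidfile() and command_line(), as
# AcceptorHandler does below. The "exampled" daemon name is
# hypothetical.
class ExampleDaemonHandler(DaemonHandler):
    def __init__(self):
        DaemonHandler.__init__(self, "exampled")
    def pidfile(self):
        return "/var/run/%s.pid" % self.command
    def command_line(self):
        # no extra arguments for this hypothetical daemon
        return ''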
254 class AcceptorHandler(DaemonHandler):
255 def __init__(self, port, net_type, send_mem, recv_mem, irq_aff):
256 DaemonHandler.__init__(self, "acceptor")
259 self.send_mem = send_mem
260 self.recv_mem = recv_mem
263 self.flags = self.flags + ' -i'
266 return "/var/run/%s-%d.pid" % (self.command, self.port)
268 def command_line(self):
269 return string.join(map(str,('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port)))
273 # start the acceptors
275 if config.lctl_dump or config.record:
277 for port in acceptors.keys():
278 daemon = acceptors[port]
279 if not daemon.running():
282 def run_one_acceptor(port):
283 if config.lctl_dump or config.record:
285 if acceptors.has_key(port):
286 daemon = acceptors[port]
287 if not daemon.running():
290 panic("run_one_acceptor: No acceptor defined for port:", port)
292 def stop_acceptor(port):
293 if acceptors.has_key(port):
294 daemon = acceptors[port]
299 # ============================================================
300 # handle lctl interface
301 class LCTLInterface:
303 Manage communication with lctl
306 def __init__(self, cmd):
308 Initialize by locating the lctl binary.
310 self.lctl = find_prog(cmd)
312 self.record_device = ''
315 debug('! lctl not found')
318 raise CommandError('lctl', "unable to find lctl binary.")
320 def use_save_file(self, file):
321 self.save_file = file
323 def record(self, dev_name, logname):
324 log("Recording log", logname, "on", dev_name)
325 self.record_device = dev_name
326 self.record_log = logname
328 def end_record(self):
329 log("End recording log", self.record_log, "on", self.record_device)
330 self.record_device = None
331 self.record_log = None
333 def set_nonblock(self, fd):
334 fl = fcntl.fcntl(fd, F_GETFL)
335 fcntl.fcntl(fd, F_SETFL, fl | os.O_NDELAY)
340 the cmds are written to stdin of lctl
341 lctl doesn't return errors when run in script mode, so
342 stderr is checked instead
343 TODO: modify the command line to accept multiple commands, or
344 create complex command line options
348 cmds = '\n dump ' + self.save_file + '\n' + cmds
349 elif self.record_device:
353 %s""" % (self.record_device, self.record_log, cmds)
355 debug("+", cmd_line, cmds)
356 if config.noexec: return (0, [])
358 child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
359 child.tochild.write(cmds + "\n")
360 child.tochild.close()
361 # print "LCTL:", cmds
363 # From the "Python Cookbook" (O'Reilly)
364 outfile = child.fromchild
365 outfd = outfile.fileno()
366 self.set_nonblock(outfd)
367 errfile = child.childerr
368 errfd = errfile.fileno()
369 self.set_nonblock(errfd)
371 outdata = errdata = ''
374 ready = select.select([outfd,errfd],[],[]) # Wait for input
375 if outfd in ready[0]:
376 outchunk = outfile.read()
377 if outchunk == '': outeof = 1
378 outdata = outdata + outchunk
379 if errfd in ready[0]:
380 errchunk = errfile.read()
381 if errchunk == '': erreof = 1
382 errdata = errdata + errchunk
383 if outeof and erreof: break
384 # end of "borrowed" code
387 if os.WIFEXITED(ret):
388 rc = os.WEXITSTATUS(ret)
391 if rc or len(errdata):
392 raise CommandError(self.lctl, errdata, rc)
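# Example of the script-mode protocol used by the helpers below: each
# helper formats a newline-separated command string terminated by
# 'quit', which run() feeds to lctl's stdin. For instance (with a
# hypothetical device name), clear_log('mds1', 'mds1-clean') sends
# roughly:
#
#   device $mds1
#   clear_log mds1-clean
#   quit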
395 def runcmd(self, *args):
397 run lctl using the command line
399 cmd = string.join(map(str,args))
400 debug("+", self.lctl, cmd)
401 rc, out = run(self.lctl, cmd)
403 raise CommandError(self.lctl, out, rc)
407 def clear_log(self, dev, log):
408 """ clear an existing log """
413 quit """ % (dev, log)
416 def network(self, net, nid):
421 quit """ % (net, nid)
424 # create a new connection
425 def add_uuid(self, net_type, uuid, nid):
426 cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type)
429 def add_autoconn(self, net_type, send_mem, recv_mem, nid, hostaddr,
431 if net_type in ('tcp',) and not config.lctl_dump:
436 add_autoconn %s %s %d %s
440 nid, hostaddr, port, flags )
443 def connect(self, srv):
444 self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
445 if srv.net_type in ('tcp',) and not config.lctl_dump:
449 self.add_autoconn(srv.net_type, srv.send_mem, srv.recv_mem,
450 srv.nid, srv.hostaddr, srv.port, flags)
453 def recover(self, dev_name, new_conn):
456 recover %s""" %(dev_name, new_conn)
459 # add a route to a range
460 def add_route(self, net, gw, lo, hi):
468 except CommandError, e:
472 def del_route(self, net, gw, lo, hi):
477 quit """ % (net, gw, lo, hi)
480 # add a route to a host
481 def add_route_host(self, net, uuid, gw, tgt):
482 self.add_uuid(net, uuid, tgt)
490 except CommandError, e:
494 # delete a route to a host
495 def del_route_host(self, net, uuid, gw, tgt):
501 quit """ % (net, gw, tgt)
505 def del_autoconn(self, net_type, nid, hostaddr):
506 if net_type in ('tcp',) and not config.lctl_dump:
515 # disconnect one connection
516 def disconnect(self, srv):
517 self.del_uuid(srv.nid_uuid)
518 if srv.net_type in ('tcp',) and not config.lctl_dump:
519 self.del_autoconn(srv.net_type, srv.nid, srv.hostaddr)
521 def del_uuid(self, uuid):
529 def disconnectAll(self, net):
537 def attach(self, type, name, uuid):
540 quit""" % (type, name, uuid)
543 def setup(self, name, setup = ""):
547 quit""" % (name, setup)
551 # create a new device with lctl
552 def newdev(self, type, name, uuid, setup = ""):
553 self.attach(type, name, uuid);
555 self.setup(name, setup)
556 except CommandError, e:
557 self.cleanup(name, uuid, 0)
562 def cleanup(self, name, uuid, force, failover = 0):
563 if failover: force = 1
569 quit""" % (name, ('', 'force')[force],
570 ('', 'failover')[failover])
574 def lov_setup(self, name, uuid, desc_uuid, stripe_cnt,
575 stripe_sz, stripe_off, pattern):
578 lov_setup %s %d %d %d %s
579 quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off, pattern)
582 # add an OBD to a LOV
583 def lov_add_obd(self, name, uuid, obd_uuid, index, gen):
585 lov_modify_tgts add %s %s %s %s
586 quit""" % (name, obd_uuid, index, gen)
590 def lmv_setup(self, name, uuid, desc_uuid, devlist):
594 quit""" % (name, uuid, desc_uuid, devlist)
597 # delete an OBD from a LOV
598 def lov_del_obd(self, name, uuid, obd_uuid, index, gen):
600 lov_modify_tgts del %s %s %s %s
601 quit""" % (name, obd_uuid, index, gen)
605 def deactivate(self, name):
613 def dump(self, dump_file):
616 quit""" % (dump_file)
619 # get list of devices
620 def device_list(self):
621 devices = '/proc/fs/lustre/devices'
623 if os.access(devices, os.R_OK):
625 fp = open(devices, 'r')
633 def lustre_version(self):
634 rc, out = self.runcmd('version')
638 def mount_option(self, profile, osc, mdc):
640 mount_option %s %s %s
641 quit""" % (profile, osc, mdc)
644 # delete mount options
645 def del_mount_option(self, profile):
651 def set_timeout(self, timeout):
657 def set_lustre_upcall(self, upcall):
662 # ============================================================
663 # Various system-level functions
664 # (ideally moved to their own module)
666 # Run a command and return the output and status.
667 # stderr is merged into stdout via '2>&1', so error output
668 # is captured along with regular output
671 if config.noexec: return (0, [])
672 f = os.popen(cmd + ' 2>&1')
682 cmd = string.join(map(str,args))
685 # Run a command in the background.
686 def run_daemon(*args):
687 cmd = string.join(map(str,args))
689 if config.noexec: return 0
690 f = os.popen(cmd + ' 2>&1')
698 # Determine the full path to use for an external command;
699 # searches dirname(argv[0]) first, then PATH
701 syspath = string.split(os.environ['PATH'], ':')
702 cmdpath = os.path.dirname(sys.argv[0])
703 syspath.insert(0, cmdpath);
705 syspath.insert(0, os.path.join(config.portals, 'utils/'))
707 prog = os.path.join(d,cmd)
708 if os.access(prog, os.X_OK):
712 # Recursively look for file starting at base dir
713 def do_find_file(base, mod):
714 fullname = os.path.join(base, mod)
715 if os.access(fullname, os.R_OK):
717 for d in os.listdir(base):
718 dir = os.path.join(base,d)
719 if os.path.isdir(dir):
720 module = do_find_file(dir, mod)
724 def find_module(src_dir, dev_dir, modname):
725 modbase = src_dir +'/'+ dev_dir +'/'+ modname
726 for modext in '.ko', '.o':
727 module = modbase + modext
729 if os.access(module, os.R_OK):
735 # is the path a block device?
742 return stat.S_ISBLK(s[stat.ST_MODE])
744 # build fs according to type
746 def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
752 panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
754 # devsize is in 1k blocks, and the fs block count is in 4k blocks
755 block_cnt = devsize/4
757 if fstype in ('ext3', 'extN', 'ldiskfs'):
758 # ext3 journal size is in megabytes
761 if not is_block(dev):
762 ret, out = runcmd("ls -l %s" %dev)
763 devsize = int(string.split(out[0])[4]) / 1024
765 ret, out = runcmd("sfdisk -s %s" %dev)
766 devsize = int(out[0])
767 if devsize > 1024 * 1024:
768 jsize = ((devsize / 102400) * 4)
771 if jsize: jopt = "-J size=%d" %(jsize,)
772 if isize: iopt = "-I %d" %(isize,)
773 mkfs = 'mkfs.ext2 -j -b 4096 '
774 if not isblock or config.force:
776 elif fstype == 'reiserfs':
777 # reiserfs journal size is in blocks
778 if jsize: jopt = "--journal_size %d" %(jsize,)
779 mkfs = 'mkreiserfs -ff'
781 panic('unsupported fs type: ', fstype)
783 if config.mkfsoptions != None:
784 mkfs = mkfs + ' ' + config.mkfsoptions
785 if mkfsoptions != None:
786 mkfs = mkfs + ' ' + mkfsoptions
787 (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt)
789 panic("Unable to build fs:", dev, string.join(out))
790 # enable hash tree indexing on the fs
791 if fstype in ('ext3', 'extN', 'ldiskfs'):
792 htree = 'echo "feature FEATURE_C5" | debugfs -w'
793 (ret, out) = run (htree, dev)
795 panic("Unable to enable htree:", dev)
797 # some systems use /dev/loopN, some /dev/loop/N
801 if not os.access(loop + str(0), os.R_OK):
803 if not os.access(loop + str(0), os.R_OK):
804 panic ("can't access loop devices")
807 # find loop device assigned to the file
808 def find_assigned_loop(file):
810 for n in xrange(0, MAX_LOOP_DEVICES):
812 if os.access(dev, os.R_OK):
813 (stat, out) = run('losetup', dev)
814 if out and stat == 0:
815 m = re.search(r'\((.*)\)', out[0])
816 if m and file == m.group(1):
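# For reference, `losetup /dev/loop0` output looks roughly like:
#   /dev/loop0: [0302]:12345 (/tmp/ost1-file)
# so the parenthesized group extracted above is the backing file path
# (the device and file names here are hypothetical).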
822 # create file if necessary and assign the first free loop device
823 def init_loop(file, size, fstype, journal_size, inode_size,
824 mkfsoptions, reformat, autoformat, backfstype, backfile):
827 realfstype = backfstype
828 if is_block(backfile):
829 if reformat or (need_format(realfstype, backfile) and autoformat == 'yes'):
830 mkfs(realfile, size, realfstype, journal_size, inode_size, mkfsoptions, isblock=0)
836 dev = find_assigned_loop(realfile)
838 print 'WARNING file:', realfile, 'already mapped to', dev
841 if reformat or not os.access(realfile, os.R_OK | os.W_OK):
843 panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (realfile, size))
844 (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, realfile))
846 panic("Unable to create backing store:", realfile)
848 mkfs(realfile, size, realfstype, journal_size, inode_size,
849 mkfsoptions, isblock=0)
852 # find next free loop
853 for n in xrange(0, MAX_LOOP_DEVICES):
855 if os.access(dev, os.R_OK):
856 (stat, out) = run('losetup', dev)
858 run('losetup', dev, realfile)
861 print "out of loop devices"
863 print "out of loop devices"
866 # undo loop assignment
867 def clean_loop(file):
868 dev = find_assigned_loop(file)
870 ret, out = run('losetup -d', dev)
872 log('unable to clean loop device:', dev, 'for file:', file)
875 # determine if dev is formatted as a <fstype> filesystem
876 def need_format(fstype, dev):
877 # FIXME don't know how to implement this
880 # initialize a block device if needed
881 def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
882 inode_size, mkfsoptions, backfstype, backdev):
886 if fstype == 'smfs' or not is_block(dev):
887 dev = init_loop(dev, size, fstype, journal_size, inode_size,
888 mkfsoptions, reformat, autoformat, backfstype, backdev)
889 elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
890 mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
893 # panic("device:", dev,
894 # "not prepared, and autoformat is not set.\n",
895 # "Rerun with --reformat option to format ALL filesystems")
900 """lookup IP address for an interface"""
901 rc, out = run("/sbin/ifconfig", iface)
904 addr = string.split(out[1])[1]
905 ip = string.split(addr, ':')[1]
908 def def_mount_options(fstype, target):
909 """returns deafult mount options for passed fstype and target (mds, ost)"""
910 if fstype == 'ext3' or fstype == 'ldiskfs':
911 mountfsoptions = "errors=remount-ro"
912 if target == 'ost' and sys_get_branch() == '2.4':
913 mountfsoptions = "%s,asyncdel" % (mountfsoptions)
914 return mountfsoptions
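# Usage sketch for def_mount_options(): for ext3/ldiskfs targets it
# returns "errors=remount-ro", plus ",asyncdel" for OSTs on a 2.4
# kernel, e.g.:
#
#   def_mount_options('ext3', 'ost')
#   # -> 'errors=remount-ro,asyncdel' on 2.4 kernels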
917 def sys_get_elan_position_file():
918 procfiles = ["/proc/elan/device0/position",
919 "/proc/qsnet/elan4/device0/position",
920 "/proc/qsnet/elan3/device0/position"]
922 if os.access(p, os.R_OK):
926 def sys_get_local_nid(net_type, wildcard, cluster_id):
927 """Return the local nid."""
929 if sys_get_elan_position_file():
930 local = sys_get_local_address('elan', '*', cluster_id)
932 local = sys_get_local_address(net_type, wildcard, cluster_id)
935 def sys_get_local_address(net_type, wildcard, cluster_id):
936 """Return the local address for the network type."""
938 if net_type in ('tcp',):
940 iface, star = string.split(wildcard, ':')
941 local = if2addr(iface)
943 panic ("unable to determine ip for:", wildcard)
945 host = socket.gethostname()
946 local = socket.gethostbyname(host)
947 elif net_type == 'elan':
948 # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()'
949 f = sys_get_elan_position_file()
951 panic ("unable to determine local Elan ID")
954 lines = fp.readlines()
962 nid = my_int(cluster_id) + my_int(elan_id)
964 except ValueError, e:
968 elif net_type == 'gm':
969 fixme("automatic local address for GM")
973 def sys_get_branch():
974 """Returns kernel release"""
976 fp = open('/proc/sys/kernel/osrelease')
977 lines = fp.readlines()
981 version = string.split(l)
982 a = string.split(version[0], '.')
983 return a[0] + '.' + a[1]
989 def mod_loaded(modname):
990 """Check if a module is already loaded. Look in /proc/modules for it."""
992 fp = open('/proc/modules')
993 lines = fp.readlines()
995 # please forgive my tired fingers for this one
996 ret = filter(lambda word, mod=modname: word == mod,
997 map(lambda line: string.split(line)[0], lines))
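# The filter/map pair above is roughly equivalent to this more legible
# loop (a sketch; /proc/modules lists the module name in the first
# column of each line):
#
#   for line in lines:
#       if string.split(line)[0] == modname:
#           return 1
#   return 0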
1002 # XXX: instead of device_list, ask for $name and see what we get
1003 def is_prepared(name):
1004 """Return true if a device exists for the name"""
1005 if config.lctl_dump:
1007 if (config.noexec or config.record) and config.cleanup:
1010 # expect this format:
1011 # 1 UP ldlm ldlm ldlm_UUID 2
1012 out = lctl.device_list()
1014 if name == string.split(s)[3]:
1016 except CommandError, e:
1020 def is_network_prepared():
1021 """If the any device exists, then assume that all networking
1022 has been configured"""
1023 out = lctl.device_list()
1026 def fs_is_mounted(path):
1027 """Return true if path is a mounted lustre filesystem"""
1029 fp = open('/proc/mounts')
1030 lines = fp.readlines()
1034 if a[1] == path and a[2] == 'lustre_lite':
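# /proc/mounts lines have the form:
#   <device> <mountpoint> <fstype> <options> <dump> <pass>
# so a[1] is the mount point and a[2] is the filesystem type tested
# above.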
1042 """Manage kernel modules"""
1043 def __init__(self, lustre_dir, portals_dir):
1044 self.lustre_dir = lustre_dir
1045 self.portals_dir = portals_dir
1046 self.kmodule_list = []
1048 def add_portals_module(self, dev_dir, modname):
1049 """Append a module to list of modules to load."""
1050 self.kmodule_list.append((self.portals_dir, dev_dir, modname))
1052 def add_lustre_module(self, dev_dir, modname):
1053 """Append a module to list of modules to load."""
1054 self.kmodule_list.append((self.lustre_dir, dev_dir, modname))
1056 def load_module(self):
1057 """Load all the modules in the list in the order they appear."""
1058 for src_dir, dev_dir, mod in self.kmodule_list:
1059 if mod_loaded(mod) and not config.noexec:
1061 log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
1063 module = find_module(src_dir, dev_dir, mod)
1065 panic('module not found:', mod)
1066 (rc, out) = run('/sbin/insmod', module)
1068 raise CommandError('insmod', out, rc)
1070 (rc, out) = run('/sbin/modprobe', mod)
1072 raise CommandError('modprobe', out, rc)
1074 def cleanup_module(self):
1075 """Unload the modules in the list in reverse order."""
1076 rev = self.kmodule_list
1078 for src_dir, dev_dir, mod in rev:
1079 if not mod_loaded(mod) and not config.noexec:
1082 if mod == 'portals' and config.dump:
1083 lctl.dump(config.dump)
1084 log('unloading module:', mod)
1085 (rc, out) = run('/sbin/rmmod', mod)
1087 log('! unable to unload module:', mod)
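# Usage sketch for kmod (the source paths are hypothetical):
#
#   km = kmod('/usr/src/lustre', '/usr/src/portals')
#   km.add_portals_module("libcfs", 'libcfs')
#   km.add_lustre_module('obdclass', 'obdclass')
#   km.load_module()      # insmod/modprobe each module, in order
#   ...
#   km.cleanup_module()   # rmmod each module, in reverse order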
1090 # ============================================================
1091 # Classes to prepare and cleanup the various objects
1094 """ Base class for the rest of the modules. The default cleanup method is
1095 defined here, as well as some utilitiy funcs.
1097 def __init__(self, module_name, db):
1099 self.module_name = module_name
1100 self.name = self.db.getName()
1101 self.uuid = self.db.getUUID()
1104 self.kmod = kmod(config.lustre, config.portals)
1106 def info(self, *args):
1107 msg = string.join(map(str,args))
1108 print self.module_name + ":", self.name, self.uuid, msg
1111 """ default cleanup, used for most modules """
1114 lctl.cleanup(self.name, self.uuid, config.force)
1115 except CommandError, e:
1116 log(self.module_name, "cleanup failed: ", self.name)
1120 def add_portals_module(self, dev_dir, modname):
1121 """Append a module to list of modules to load."""
1122 self.kmod.add_portals_module(dev_dir, modname)
1124 def add_lustre_module(self, dev_dir, modname):
1125 """Append a module to list of modules to load."""
1126 self.kmod.add_lustre_module(dev_dir, modname)
1128 def load_module(self):
1129 """Load all the modules in the list in the order they appear."""
1130 self.kmod.load_module()
1132 def cleanup_module(self):
1133 """Unload the modules in the list in reverse order."""
1134 if self.safe_to_clean():
1135 self.kmod.cleanup_module()
1137 def safe_to_clean(self):
1140 def safe_to_clean_modules(self):
1141 return self.safe_to_clean()
1143 class Network(Module):
1144 def __init__(self,db):
1145 Module.__init__(self, 'NETWORK', db)
1146 self.net_type = self.db.get_val('nettype')
1147 self.nid = self.db.get_val('nid', '*')
1148 self.cluster_id = self.db.get_val('clusterid', "0")
1149 self.port = self.db.get_val_int('port', 0)
1150 self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
1151 self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
1152 self.irq_affinity = self.db.get_val_int('irqaffinity', 0)
1155 self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
1157 panic("unable to set nid for", self.net_type, self.nid, cluster_id)
1158 self.generic_nid = 1
1159 debug("nid:", self.nid)
1161 self.generic_nid = 0
1163 self.nid_uuid = self.nid_to_uuid(self.nid)
1165 self.hostaddr = self.db.get_val('hostaddr', self.nid)
1166 if '*' in self.hostaddr:
1167 self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id)
1168 if not self.hostaddr:
1169 panic("unable to set hostaddr for", self.net_type, self.hostaddr, self.cluster_id)
1170 debug("hostaddr:", self.hostaddr)
1172 self.add_portals_module("libcfs", 'libcfs')
1173 self.add_portals_module("portals", 'portals')
1174 if node_needs_router():
1175 self.add_portals_module("router", 'kptlrouter')
1176 if self.net_type == 'tcp':
1177 self.add_portals_module("knals/socknal", 'ksocknal')
1178 if self.net_type == 'elan':
1179 self.add_portals_module("knals/qswnal", 'kqswnal')
1180 if self.net_type == 'gm':
1181 self.add_portals_module("knals/gmnal", 'kgmnal')
1183 def nid_to_uuid(self, nid):
1184 return "NID_%s_UUID" %(nid,)
1187 if not config.record and is_network_prepared():
1189 self.info(self.net_type, self.nid, self.port)
1190 if not (config.record and self.generic_nid):
1191 lctl.network(self.net_type, self.nid)
1192 if self.net_type == 'tcp':
1194 if self.net_type == 'elan':
1196 if self.port and node_is_router():
1197 run_one_acceptor(self.port)
1198 self.connect_peer_gateways()
1200 def connect_peer_gateways(self):
1201 for router in self.db.lookup_class('node'):
1202 if router.get_val_int('router', 0):
1203 for netuuid in router.get_networks():
1204 net = self.db.lookup(netuuid)
1206 if (gw.cluster_id == self.cluster_id and
1207 gw.net_type == self.net_type):
1208 if gw.nid != self.nid:
1211 def disconnect_peer_gateways(self):
1212 for router in self.db.lookup_class('node'):
1213 if router.get_val_int('router', 0):
1214 for netuuid in router.get_networks():
1215 net = self.db.lookup(netuuid)
1217 if (gw.cluster_id == self.cluster_id and
1218 gw.net_type == self.net_type):
1219 if gw.nid != self.nid:
1222 except CommandError, e:
1223 print "disconnect failed: ", self.name
1227 def safe_to_clean(self):
1228 return not is_network_prepared()
1231 self.info(self.net_type, self.nid, self.port)
1233 stop_acceptor(self.port)
1234 if node_is_router():
1235 self.disconnect_peer_gateways()
1237 def correct_level(self, level, op=None):
1240 class RouteTable(Module):
1241 def __init__(self,db):
1242 Module.__init__(self, 'ROUTES', db)
1244 def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
1246 # only setup connections for tcp NALs
1248 if not net_type in ('tcp',):
1251 # connect to the target if the route is to a single node and this node is the gw
1252 if lo == hi and local_interface(net_type, gw_cluster_id, gw):
1253 if not local_cluster(net_type, tgt_cluster_id):
1254 panic("target", lo, " not on the local cluster")
1255 srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
1256 # connect to gateway if this node is not the gw
1257 elif (local_cluster(net_type, gw_cluster_id)
1258 and not local_interface(net_type, gw_cluster_id, gw)):
1259 srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
1264 panic("no server for nid", lo)
1267 return Network(srvdb)
1270 if not config.record and is_network_prepared():
1273 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1274 lctl.add_route(net_type, gw, lo, hi)
1275 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1279 def safe_to_clean(self):
1280 return not is_network_prepared()
1283 if is_network_prepared():
1284 # the network is still being used, don't clean it up
1286 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1287 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1290 lctl.disconnect(srv)
1291 except CommandError, e:
1292 print "disconnect failed: ", self.name
1297 lctl.del_route(net_type, gw, lo, hi)
1298 except CommandError, e:
1299 print "del_route failed: ", self.name
1303 class Management(Module):
1304 def __init__(self, db):
1305 Module.__init__(self, 'MGMT', db)
1306 self.add_lustre_module('lvfs', 'lvfs')
1307 self.add_lustre_module('obdclass', 'obdclass')
1308 self.add_lustre_module('ptlrpc', 'ptlrpc')
1309 self.add_lustre_module('mgmt', 'mgmt_svc')
1312 if not config.record and is_prepared(self.name):
1315 lctl.newdev("mgmt", self.name, self.uuid)
1317 def safe_to_clean(self):
1321 if is_prepared(self.name):
1322 Module.cleanup(self)
1324 def correct_level(self, level, op=None):
1327 # This is only needed to load the modules; the LDLM device
1328 # is now created automatically.
1329 class LDLM(Module):
1330 def __init__(self,db):
1331 Module.__init__(self, 'LDLM', db)
1332 self.add_lustre_module('lvfs', 'lvfs')
1333 self.add_lustre_module('obdclass', 'obdclass')
1334 self.add_lustre_module('ptlrpc', 'ptlrpc')
1342 def correct_level(self, level, op=None):
1346 class LOV(Module):
1347 def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
1348 Module.__init__(self, 'LOV', db)
1349 if name_override != None:
1350 self.name = "lov_%s" % name_override
1351 self.add_lustre_module('lov', 'lov')
1352 self.mds_uuid = self.db.get_first_ref('mds')
1353 self.stripe_sz = self.db.get_val_int('stripesize', 1048576)
1354 self.stripe_off = self.db.get_val_int('stripeoffset', 0)
1355 self.pattern = self.db.get_val_int('stripepattern', 0)
1356 self.devlist = self.db.get_lov_tgts('lov_tgt')
1357 self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
1359 self.desc_uuid = self.uuid
1360 self.uuid = generate_client_uuid(self.name)
1361 self.fs_name = fs_name
1363 self.config_only = 1
1365 self.config_only = None
1366 mds = self.db.lookup(self.mds_uuid)
1367 self.mds_name = mds.getName()
1368 for (obd_uuid, index, gen, active) in self.devlist:
1371 obd = self.db.lookup(obd_uuid)
1372 osc = get_osc(obd, self.uuid, fs_name)
1374 self.osclist.append((osc, index, gen, active))
1376 panic('osc not found:', obd_uuid)
1382 if not config.record and is_prepared(self.name):
1384 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
1385 self.stripe_off, self.pattern, self.devlist,
1387 lctl.lov_setup(self.name, self.uuid, self.desc_uuid, self.stripe_cnt,
1388 self.stripe_sz, self.stripe_off, self.pattern)
1389 for (osc, index, gen, active) in self.osclist:
1390 target_uuid = osc.target_uuid
1392 # Only ignore connect failures with --force, which
1393 # isn't implemented here yet.
1395 osc.prepare(ignore_connect_failure=0)
1396 except CommandError, e:
1397 print "Error preparing OSC %s\n" % osc.uuid
1399 lctl.lov_add_obd(self.name, self.uuid, target_uuid, index, gen)
1402 for (osc, index, gen, active) in self.osclist:
1403 target_uuid = osc.target_uuid
1405 if is_prepared(self.name):
1406 Module.cleanup(self)
1407 if self.config_only:
1408 panic("Can't clean up config_only LOV ", self.name)
1410 def load_module(self):
1411 if self.config_only:
1412 panic("Can't load modules for config_only LOV ", self.name)
1413 for (osc, index, gen, active) in self.osclist:
1416 Module.load_module(self)
1418 def cleanup_module(self):
1419 if self.config_only:
1420 panic("Can't cleanup modules for config_only LOV ", self.name)
1421 Module.cleanup_module(self)
1422 for (osc, index, gen, active) in self.osclist:
1424 osc.cleanup_module()
1427 def correct_level(self, level, op=None):
1430 class LMV(Module):
1431 def __init__(self, db, uuid, fs_name, name_override = None):
1432 Module.__init__(self, 'LMV', db)
1433 if name_override != None:
1434 self.name = "lmv_%s" % name_override
1435 self.add_lustre_module('lmv', 'lmv')
1436 self.devlist = self.db.get_refs('mds')
1438 self.desc_uuid = self.uuid
1440 self.fs_name = fs_name
1441 for mds_uuid in self.devlist:
1442 mds = self.db.lookup(mds_uuid)
1444 panic("MDS not found!")
1445 mdc = MDC(mds, self.uuid, fs_name)
1447 self.mdclist.append(mdc)
1449 panic('mdc not found:', mds_uuid)
1452 if is_prepared(self.name):
1454 for mdc in self.mdclist:
1456 # Only ignore connect failures with --force, which
1457 # isn't implemented here yet.
1458 mdc.prepare(ignore_connect_failure=0)
1459 except CommandError, e:
1460 print "Error preparing LMV %s\n" % mdc.uuid
1462 lctl.lmv_setup(self.name, self.uuid, self.desc_uuid,
1463 string.join(self.devlist))
1466 for mdc in self.mdclist:
1468 if is_prepared(self.name):
1469 Module.cleanup(self)
1471 def load_module(self):
1472 for mdc in self.mdclist:
1475 Module.load_module(self)
1477 def cleanup_module(self):
1478 Module.cleanup_module(self)
1479 for mdc in self.mdclist:
1480 mdc.cleanup_module()
1483 def correct_level(self, level, op=None):
1486 class MDSDEV(Module):
1487 def __init__(self,db):
1488 Module.__init__(self, 'MDSDEV', db)
1489 self.devpath = self.db.get_val('devpath','')
1490 self.backdevpath = self.db.get_val('backdevpath','')
1491 self.size = self.db.get_val_int('devsize', 0)
1492 self.journal_size = self.db.get_val_int('journalsize', 0)
1493 self.fstype = self.db.get_val('fstype', '')
1494 self.backfstype = self.db.get_val('backfstype', '')
1495 self.nspath = self.db.get_val('nspath', '')
1496 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1497 self.mountfsoptions = self.db.get_val('mountfsoptions', '')
1498 self.cachetype = self.db.get_val('cachetype', '')
1499 # overwrite the original MDSDEV name and uuid with the MDS name and uuid
1500 target_uuid = self.db.get_first_ref('target')
1501 mds = self.db.lookup(target_uuid)
1502 self.name = mds.getName()
1503 self.filesystem_uuids = mds.get_refs('filesystem')
1506 self.master_mds = ""
1507 if not self.filesystem_uuids:
1508 self.lmv_uuid = self.db.get_first_ref('lmv')
1509 if not self.lmv_uuid:
1510 panic("ALERT: can't find lvm uuid")
1512 self.lmv = self.db.lookup(self.lmv_uuid)
1514 self.filesystem_uuids = self.lmv.get_refs('filesystem')
1515 self.master_mds = self.lmv_uuid
1516 # FIXME: if fstype not set, then determine based on kernel version
1517 self.format = self.db.get_val('autoformat', "no")
1518 if mds.get_val('failover', 0):
1519 self.failover_mds = 'f'
1521 self.failover_mds = 'n'
1522 active_uuid = get_active_target(mds)
1524 panic("No target device found:", target_uuid)
1525 if active_uuid == self.uuid:
1529 if self.active and config.group and config.group != mds.get_val('group'):
1532 self.inode_size = self.db.get_val_int('inodesize', 0)
1533 if self.inode_size == 0:
1534 # find the LOV for this MDS
1535 lovconfig_uuid = mds.get_first_ref('lovconfig')
1536 if not lovconfig_uuid:
1537 if not self.lmv_uuid:
1538 panic("No LOV found for lovconfig ", lovconfig.name)
1541 panic("No LMV initialized and not lovconfig_uuid found")
1543 lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
1544 lovconfig = self.lmv.lookup(lovconfig_uuid)
1545 lov_uuid = lovconfig.get_first_ref('lov')
1547 panic("No LOV found for lovconfig ", lovconfig.name)
1549 lovconfig = mds.lookup(lovconfig_uuid)
1550 lov_uuid = lovconfig.get_first_ref('lov')
1552 panic("No LOV found for lovconfig ", lovconfig.name)
1555 lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
1556 lovconfig = self.lmv.lookup(lovconfig_uuid)
1557 lov_uuid = lovconfig.get_first_ref('lov')
1559 lov = LOV(self.db.lookup(lov_uuid), lov_uuid, 'FS_name', config_only = 1)
1561 # default stripe count controls default inode_size
1562 stripe_count = lov.stripe_cnt
1563 if stripe_count > 77:
1564 self.inode_size = 4096
1565 elif stripe_count > 35:
1566 self.inode_size = 2048
1567 elif stripe_count > 13:
1568 self.inode_size = 1024
1569 elif stripe_count > 3:
1570 self.inode_size = 512
1572 self.inode_size = 256
1574 self.target_dev_uuid = self.uuid
1575 self.uuid = target_uuid
1578 client_uuid = generate_client_uuid(self.name)
1579 client_uuid = self.name + "_lmv_" + "UUID"
1580 self.master = LMV(self.db.lookup(self.lmv_uuid), client_uuid, self.name, self.name)
1581 self.master_mds = self.master.name
1584 self.add_lustre_module('mdc', 'mdc')
1585 self.add_lustre_module('osc', 'osc')
1586 self.add_lustre_module('lov', 'lov')
1587 self.add_lustre_module('lmv', 'lmv')
1588 self.add_lustre_module('ost', 'ost')
1589 self.add_lustre_module('mds', 'mds')
1591 if self.fstype == 'smfs':
1592 self.add_lustre_module('smfs', 'smfs')
1594 if self.fstype == 'ldiskfs':
1595 self.add_lustre_module('ldiskfs', 'ldiskfs')
1598 self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
1600 # if fstype is smfs, then we should also take care of the backing store fs
1602 if self.fstype == 'smfs':
1603 self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.backfstype))
1605 def load_module(self):
1607 Module.load_module(self)
1610 if not config.record and is_prepared(self.name):
1613 debug(self.uuid, "not active")
1616 # run write_conf automatically, if --reformat used
1618 self.info(self.devpath, self.fstype, self.size, self.format)
1622 self.master.prepare()
1623 # never reformat here
1624 blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
1625 self.format, self.journal_size, self.inode_size,
1626 self.mkfsoptions, self.backfstype, self.backdevpath)
1628 if not is_prepared('MDT'):
1629 lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
1631 mountfsoptions = def_mount_options(self.fstype, 'mds')
1633 if config.mountfsoptions:
1635 mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
1637 mountfsoptions = config.mountfsoptions
1638 if self.mountfsoptions:
1639 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1641 if self.mountfsoptions:
1643 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1645 mountfsoptions = self.mountfsoptions
1647 if self.fstype == 'smfs':
1648 realdev = self.fstype
1651 mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
1655 mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
1660 print 'MDS mount options: ' + mountfsoptions
1662 if not self.master_mds:
1663 self.master_mds = 'dumb'
1664 if not self.cachetype:
1665 self.cachetype = 'dumb'
1666 lctl.newdev("mds", self.name, self.uuid,
1667 setup ="%s %s %s %s %s %s" %(realdev, self.fstype,
1668 self.name, mountfsoptions,
1669 self.master_mds, self.cachetype))
1670 except CommandError, e:
1672 panic("MDS is missing the config log. Need to run " +
1673 "lconf --write_conf.")
1677 def write_conf(self):
1679 if not is_prepared(self.name):
1680 self.info(self.devpath, self.fstype, self.format)
1682 blkdev = block_dev(self.devpath, self.size, self.fstype,
1683 config.reformat, self.format, self.journal_size,
1684 self.inode_size, self.mkfsoptions,
1685 self.backfstype, self.backdevpath)
1687 # Even for writing logs we mount the mds with the supplied mount options,
1688 # because it would not mount smfs (if used) otherwise.
1690 mountfsoptions = def_mount_options(self.fstype, 'mds')
1692 if config.mountfsoptions:
1694 mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
1696 mountfsoptions = config.mountfsoptions
1697 if self.mountfsoptions:
1698 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1700 if self.mountfsoptions:
1702 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1704 mountfsoptions = self.mountfsoptions
1706 if self.fstype == 'smfs':
1707 realdev = self.fstype
1710 mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
1714 mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
1719 print 'MDS mount options: ' + mountfsoptions
1721 # As mount options are passed as the 4th param to the config tool, we
1722 # need to pass something as the 3rd param. But we do not want that 3rd
1723 # param to be counted as a profile name for reading the log on MDS
1724 # setup, so we pass a predefined marker like 'dumb', which is checked
1725 # in the MDS code and skipped. A nicer way would probably be to pass
1726 # an empty string and have the config tool check for it
1728 lctl.newdev("mds", self.name, self.uuid,
1729 setup ="%s %s %s %s" %(realdev, self.fstype,
1730 'dumb', mountfsoptions))
1733 # record logs for the MDS lov
1734 for uuid in self.filesystem_uuids:
1735 log("recording clients for filesystem:", uuid)
1736 fs = self.db.lookup(uuid)
1738 # this is ugly; it should be organized more nicely later.
1739 target_uuid = self.db.get_first_ref('target')
1740 mds = self.db.lookup(target_uuid)
1742 lovconfig_uuid = mds.get_first_ref('lovconfig')
1744 lovconfig = mds.lookup(lovconfig_uuid)
1745 obd_uuid = lovconfig.get_first_ref('lov')
1747 obd_uuid = fs.get_first_ref('obd')
1749 client_uuid = generate_client_uuid(self.name)
1750 client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
1753 lctl.clear_log(self.name, self.name)
1754 lctl.record(self.name, self.name)
1756 lctl.mount_option(self.name, client.get_name(), "")
1758 process_updates(self.db, self.name, self.name, client)
1761 lctl.clear_log(self.name, self.name + '-clean')
1762 lctl.record(self.name, self.name + '-clean')
1764 lctl.del_mount_option(self.name)
1766 process_updates(self.db, self.name, self.name + '-clean', client)
1770 # record logs for each client
1776 config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
1778 config_options = CONFIG_FILE
1780 for node_db in self.db.lookup_class('node'):
1781 client_name = node_db.getName()
1782 for prof_uuid in node_db.get_refs('profile'):
1783 prof_db = node_db.lookup(prof_uuid)
1784 # refactor this into a function to test "clientness"
1786 for ref_class, ref_uuid in prof_db.get_all_refs():
1787 if ref_class in ('mountpoint','echoclient'):
1788 debug("recording", client_name)
1789 old_noexec = config.noexec
1791 ret, out = run (sys.argv[0], noexec_opt,
1792 " -v --record --nomod",
1793 "--record_log", client_name,
1794 "--record_device", self.name,
1795 "--node", client_name,
1798 for s in out: log("record> ", string.strip(s))
1799 ret, out = run (sys.argv[0], noexec_opt,
1800 "--cleanup -v --record --nomod",
1801 "--record_log", client_name + "-clean",
1802 "--record_device", self.name,
1803 "--node", client_name,
1806 for s in out: log("record> ", string.strip(s))
1807 config.noexec = old_noexec
1810 lctl.cleanup(self.name, self.uuid, 0, 0)
1811 except CommandError, e:
1812 log(self.module_name, "cleanup failed: ", self.name)
1815 Module.cleanup(self)
1817 if self.fstype == 'smfs':
1818 clean_loop(self.backdevpath)
1820 clean_loop(self.devpath)
1822 def msd_remaining(self):
1823 out = lctl.device_list()
1825 if string.split(s)[2] in ('mds',):
1828 def safe_to_clean(self):
1831 def safe_to_clean_modules(self):
1832 return not self.msd_remaining()
1836 debug(self.uuid, "not active")
1839 if is_prepared(self.name):
1841 lctl.cleanup(self.name, self.uuid, config.force,
1843 except CommandError, e:
1844 log(self.module_name, "cleanup failed: ", self.name)
1847 Module.cleanup(self)
1850 self.master.cleanup()
1851 if not self.msd_remaining() and is_prepared('MDT'):
1853 lctl.cleanup("MDT", "MDT_UUID", config.force,
1855 except CommandError, e:
1856 print "cleanup failed: ", self.name
1860 if self.fstype == 'smfs':
1861 clean_loop(self.backdevpath)
1863 clean_loop(self.devpath)
1865 def correct_level(self, level, op=None):
1866 #if self.master_mds:
1870 class OSD(Module):
1871 def __init__(self, db):
1872 Module.__init__(self, 'OSD', db)
1873 self.osdtype = self.db.get_val('osdtype')
1874 self.devpath = self.db.get_val('devpath', '')
1875 self.backdevpath = self.db.get_val('backdevpath', '')
1876 self.size = self.db.get_val_int('devsize', 0)
1877 self.journal_size = self.db.get_val_int('journalsize', 0)
1878 self.inode_size = self.db.get_val_int('inodesize', 0)
1879 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1880 self.mountfsoptions = self.db.get_val('mountfsoptions', '')
1881 self.fstype = self.db.get_val('fstype', '')
1882 self.backfstype = self.db.get_val('backfstype', '')
1883 self.nspath = self.db.get_val('nspath', '')
1884 target_uuid = self.db.get_first_ref('target')
1885 ost = self.db.lookup(target_uuid)
1886 self.name = ost.getName()
1887 self.format = self.db.get_val('autoformat', 'yes')
1888 if ost.get_val('failover', 0):
1889 self.failover_ost = 'f'
1891 self.failover_ost = 'n'
1893 active_uuid = get_active_target(ost)
1895 panic("No target device found:", target_uuid)
1896 if active_uuid == self.uuid:
1900 if self.active and config.group and config.group != ost.get_val('group'):
1903 self.target_dev_uuid = self.uuid
1904 self.uuid = target_uuid
1906 self.add_lustre_module('ost', 'ost')
1907 if self.fstype == 'smfs':
1908 self.add_lustre_module('smfs', 'smfs')
1909 # FIXME: should we default to ext3 here?
1910 if self.fstype == 'ldiskfs':
1911 self.add_lustre_module('ldiskfs', 'ldiskfs')
1913 self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
1914 if self.fstype == 'smfs':
1915 self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.backfstype))
1917 self.add_lustre_module(self.osdtype, self.osdtype)
1919 def load_module(self):
1921 Module.load_module(self)
1923 # need to check /proc/mounts and /etc/mtab before
1924 # formatting anything.
1925 # FIXME: check if device is already formatted.
1927 if is_prepared(self.name):
1930 debug(self.uuid, "not active")
1932 self.info(self.osdtype, self.devpath, self.size, self.fstype,
1933 self.format, self.journal_size, self.inode_size)
1935 if self.osdtype == 'obdecho':
1938 blkdev = block_dev(self.devpath, self.size, self.fstype,
1939 config.reformat, self.format, self.journal_size,
1940 self.inode_size, self.mkfsoptions, self.backfstype,
1943 mountfsoptions = def_mount_options(self.fstype, 'ost')
1945 if config.mountfsoptions:
1947 mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
1949 mountfsoptions = config.mountfsoptions
1950 if self.mountfsoptions:
1951 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1953 if self.mountfsoptions:
1955 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1957 mountfsoptions = self.mountfsoptions
1959 if self.fstype == 'smfs':
1960 realdev = self.fstype
1963 mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
1967 mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
1972 print 'OSD mount options: ' + mountfsoptions
1974 lctl.newdev(self.osdtype, self.name, self.uuid,
1975 setup ="%s %s %s %s" %(realdev, self.fstype,
1978 if not is_prepared('OSS'):
1979 lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
1981 def osd_remaining(self):
1982 out = lctl.device_list()
1984 if string.split(s)[2] in ('obdfilter', 'obdecho'):
1987 def safe_to_clean(self):
1990 def safe_to_clean_modules(self):
1991 return not self.osd_remaining()
1995 debug(self.uuid, "not active")
1997 if is_prepared(self.name):
2000 lctl.cleanup(self.name, self.uuid, config.force,
2002 except CommandError, e:
2003 log(self.module_name, "cleanup failed: ", self.name)
2006 if not self.osd_remaining() and is_prepared('OSS'):
2008 lctl.cleanup("OSS", "OSS_UUID", config.force,
2010 except CommandError, e:
2011 print "cleanup failed: ", self.name
2014 if not self.osdtype == 'obdecho':
2015 if self.fstype == 'smfs':
2016 clean_loop(self.backdevpath)
2018 clean_loop(self.devpath)
2020 def correct_level(self, level, op=None):
2023 def mgmt_uuid_for_fs(mtpt_name):
2026 mtpt_db = toplustreDB.lookup_name(mtpt_name)
2027 fs_uuid = mtpt_db.get_first_ref('filesystem')
2028 fs = toplustreDB.lookup(fs_uuid)
2031 return fs.get_first_ref('mgmt')
2033 # Generic client module, used by OSC and MDC
2034 class Client(Module):
2035 def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
2037 self.target_name = tgtdb.getName()
2038 self.target_uuid = tgtdb.getUUID()
2042 self.tgt_dev_uuid = get_active_target(tgtdb)
2043 if not self.tgt_dev_uuid:
2044 panic("No target device found for target(1):", self.target_name)
2046 self.kmod = kmod(config.lustre, config.portals)
2050 self.module = module
2051 self.module_name = string.upper(module)
2053 self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
2054 self.target_name, fs_name)
2056 self.name = self_name
2058 self.lookup_server(self.tgt_dev_uuid)
2059 mgmt_uuid = mgmt_uuid_for_fs(fs_name)
2061 self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
2064 self.fs_name = fs_name
2067 self.add_lustre_module(module_dir, module)
2069 def lookup_server(self, srv_uuid):
2070 """ Lookup a server's network information """
2071 self._server_nets = get_ost_net(self.db, srv_uuid)
2072 if len(self._server_nets) == 0:
2073 panic ("Unable to find a server for:", srv_uuid)
2076 def get_servers(self):
2077 return self._server_nets
2079 def prepare(self, ignore_connect_failure = 0):
2080 self.info(self.target_uuid)
2081 if not config.record and is_prepared(self.name):
2084 srv = choose_local_server(self.get_servers())
2088 routes = find_route(self.get_servers())
2089 if len(routes) == 0:
2090 panic ("no route to", self.target_uuid)
2091 for (srv, r) in routes:
2092 lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
2093 except CommandError, e:
2094 if not ignore_connect_failure:
2097 if self.permits_inactive() and (self.target_uuid in config.inactive or self.active == 0):
2098 debug("%s inactive" % self.target_uuid)
2099 inactive_p = "inactive"
2101 debug("%s active" % self.target_uuid)
2103 lctl.newdev(self.module, self.name, self.uuid,
2104 setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
2105 inactive_p, self.mgmt_name))
2108 if is_prepared(self.name):
2109 Module.cleanup(self)
2111 srv = choose_local_server(self.get_servers())
2113 lctl.disconnect(srv)
2115 for (srv, r) in find_route(self.get_servers()):
2116 lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
2117 except CommandError, e:
2118 log(self.module_name, "cleanup failed: ", self.name)
2122 def correct_level(self, level, op=None):
2125 def deactivate(self):
2127 lctl.deactivate(self.name)
2128 except CommandError, e:
2129 log(self.module_name, "deactivate failed: ", self.name)
2133 class MDC(Client):
2134 def __init__(self, db, uuid, fs_name):
2135 Client.__init__(self, db, uuid, 'mdc', fs_name)
2137 def permits_inactive(self):
2140 class OSC(Client):
2141 def __init__(self, db, uuid, fs_name):
2142 Client.__init__(self, db, uuid, 'osc', fs_name)
2144 def permits_inactive(self):
2147 def mgmtcli_name_for_uuid(uuid):
2148 return 'MGMTCLI_%s' % uuid
2150 class ManagementClient(Client):
2151 def __init__(self, db, uuid):
2152 Client.__init__(self, db, uuid, 'mgmt_cli', '',
2153 self_name = mgmtcli_name_for_uuid(db.getUUID()),
2154 module_dir = 'mgmt')
2155 class VLOV(Module):
2156 def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
2157 Module.__init__(self, 'VLOV', db)
2158 if name_override != None:
2159 self.name = "lov_%s" % name_override
2160 self.add_lustre_module('lov', 'lov')
2161 self.stripe_sz = 65536
2165 self.desc_uuid = self.uuid
2166 self.uuid = generate_client_uuid(self.name)
2167 self.fs_name = fs_name
2168 self.osc = get_osc(db, self.uuid, fs_name)
2170 panic('osc not found:', self.uuid)
2172 self.config_only = 1
2174 self.config_only = None
2180 if not config.record and is_prepared(self.name):
2182 lctl.lov_setup(self.name, self.uuid, self.desc_uuid, self.stripe_cnt,
2183 self.stripe_sz, self.stripe_off, self.pattern)
2184 target_uuid = self.osc.target_uuid
2187 self.osc.prepare(ignore_connect_failure=0)
2188 except CommandError, e:
2189 print "Error preparing OSC %s\n" % osc.uuid
2191 lctl.lov_add_obd(self.name, self.uuid, target_uuid, 0, 1)
2194 target_uuid = self.osc.target_uuid
2196 if is_prepared(self.name):
2197 Module.cleanup(self)
2198 if self.config_only:
2199 panic("Can't clean up config_only LOV ", self.name)
2201 def load_module(self):
2202 if self.config_only:
2203 panic("Can't load modules for config_only LOV ", self.name)
2204 self.osc.load_module()
2205 Module.load_module(self)
2207 def cleanup_module(self):
2208 if self.config_only:
2209 panic("Can't cleanup modules for config_only LOV ", self.name)
2210 Module.cleanup_module(self)
2211 self.osc.cleanup_module()
2213 def correct_level(self, level, op=None):
2216 class CMOBD(Module):
2217 def __init__(self,db):
2218 Module.__init__(self, 'CMOBD', db)
2219 self.name = self.db.getName();
2220 self.uuid = generate_client_uuid(self.name)
2221 self.master_uuid = self.db.get_first_ref('masterobd')
2222 self.cache_uuid = self.db.get_first_ref('cacheobd')
2223 self.add_lustre_module('cmobd', 'cmobd')
2224 master_obd = self.db.lookup(self.master_uuid)
2226 panic('master obd not found:', self.master_uuid)
2227 cache_obd = self.db.lookup(self.cache_uuid)
2229 panic('cache obd not found:', self.cache_uuid)
2231 if master_obd.get_class() == 'ost':
2232 self.client_uuid = generate_client_uuid(self.name)
2233 self.master= VLOV(master_obd, self.client_uuid, self.name,
2234 "%s_master" % (self.name))
2235 self.master_uuid = self.master.get_uuid()
2237 self.master = get_mdc(db, self.name, self.master_uuid)
2238 # need to check /proc/mounts and /etc/mtab before
2239 # formatting anything.
2240 # FIXME: check if device is already formatted.
2242 self.master.prepare()
2243 if not config.record and is_prepared(self.name):
2245 self.info(self.master_uuid, self.cache_uuid)
2246 lctl.newdev("cmobd", self.name, self.uuid,
2247 setup ="%s %s" %(self.master_uuid,
2251 if is_prepared(self.name):
2252 Module.cleanup(self)
2253 self.master.cleanup()
2255 def load_module(self):
2256 self.master.load_module()
2257 Module.load_module(self)
2259 def cleanup_module(self):
2260 Module.cleanup_module(self)
2261 self.master.cleanup_module()
2263 def correct_level(self, level, op=None):
2266 class COBD(Module):
2267 def __init__(self, db, uuid, name, type, name_override = None):
2268 Module.__init__(self, 'COBD', db)
2269 self.name = self.db.getName();
2270 self.uuid = generate_client_uuid(self.name)
2271 self.real_uuid = self.db.get_first_ref('realobd')
2272 self.cache_uuid = self.db.get_first_ref('cacheobd')
2273 self.add_lustre_module('cobd', 'cobd')
2274 real_obd = self.db.lookup(self.real_uuid)
2276 panic('real obd not found:', self.real_uuid)
2277 cache_obd = self.db.lookup(self.cache_uuid)
2279 panic('cache obd not found:', self.cache_uuid)
2281 self.real = LOV(real_obd, self.real_uuid, name,
2282 "%s_real" % (self.name));
2283 self.cache = LOV(cache_obd, self.cache_uuid, name,
2284 "%s_cache" % (self.name));
2286 self.real = get_mdc(db, name, self.real_uuid)
2287 self.cache = get_mdc(db, name, self.cache_uuid)
2288 # need to check /proc/mounts and /etc/mtab before
2289 # formatting anything.
2290 # FIXME: check if device is already formatted.
2295 def get_real_name(self):
2296 return self.real.name
2297 def get_cache_name(self):
2298 return self.cache.name
2301 self.cache.prepare()
2302 if not config.record and is_prepared(self.name):
2304 self.info(self.real_uuid, self.cache_uuid)
2305 lctl.newdev("cobd", self.name, self.uuid,
2306 setup ="%s %s" %(self.real.name,
2310 if is_prepared(self.name):
2311 Module.cleanup(self)
2313 self.cache.cleanup()
2315 def load_module(self):
2316 self.real.load_module()
2317 Module.load_module(self)
2319 def cleanup_module(self):
2320 Module.cleanup_module(self)
2321 self.real.cleanup_module()
2323 # virtual interface for OSC and LOV
2324 class VOSC(Module):
2325 def __init__(self, db, client_uuid, name, name_override = None):
2326 Module.__init__(self, 'VOSC', db)
2327 if db.get_class() == 'lov':
2328 self.osc = LOV(db, client_uuid, name, name_override)
2330 elif db.get_class() == 'cobd':
2331 self.osc = COBD(db, client_uuid, name, 'obd')
2334 self.osc = OSC(db, client_uuid, name)
2337 return self.osc.get_uuid()
2339 return self.osc.get_name()
2344 def load_module(self):
2345 self.osc.load_module()
2346 def cleanup_module(self):
2347 self.osc.cleanup_module()
2348 def correct_level(self, level, op=None):
2349 return self.osc.correct_level(level, op)
2351 # virtual interface for MDC and LMV
2352 class VMDC(Module):
2353 def __init__(self, db, client_uuid, name, name_override = None):
2354 Module.__init__(self, 'VMDC', db)
2355 if db.get_class() == 'lmv':
2356 self.mdc = LMV(db, client_uuid, name)
2357 elif db.get_class() == 'cobd':
2358 self.mdc = COBD(db, client_uuid, name, 'mds')
2360 self.mdc = MDC(db, client_uuid, name)
2362 return self.mdc.uuid
2364 return self.mdc.name
2369 def load_module(self):
2370 self.mdc.load_module()
2371 def cleanup_module(self):
2372 self.mdc.cleanup_module()
2373 def correct_level(self, level, op=None):
2374 return self.mdc.correct_level(level, op)
2376 class ECHO_CLIENT(Module):
2377 def __init__(self,db):
2378 Module.__init__(self, 'ECHO_CLIENT', db)
2379 self.add_lustre_module('obdecho', 'obdecho')
2380 self.obd_uuid = self.db.get_first_ref('obd')
2381 obd = self.db.lookup(self.obd_uuid)
2382 self.uuid = generate_client_uuid(self.name)
2383 self.osc = VOSC(obd, self.uuid, self.name)
2386 if not config.record and is_prepared(self.name):
2389 self.osc.prepare() # XXX This is so cheating. -p
2390 self.info(self.obd_uuid)
2392 lctl.newdev("echo_client", self.name, self.uuid,
2393 setup = self.osc.get_name())
2396 if is_prepared(self.name):
2397 Module.cleanup(self)
2400 def load_module(self):
2401 self.osc.load_module()
2402 Module.load_module(self)
2404 def cleanup_module(self):
2405 Module.cleanup_module(self)
2406 self.osc.cleanup_module()
2408 def correct_level(self, level, op=None):
2411 def generate_client_uuid(name):
2412 client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
2414 int(random.random() * 1048576),
2415 int(random.random() * 1048576))
2416 return client_uuid[:36]
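# e.g. generate_client_uuid('lustre1') yields something like
# '0a1b2_lustre1_0c3d4e5f67': three random hex fields around the
# name (itself truncated to 19 chars), clipped to the 36-character
# uuid limit.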

class Mountpoint(Module):
    def __init__(self, db):
        Module.__init__(self, 'MTPT', db)
        self.path = self.db.get_val('path')
        self.fs_uuid = self.db.get_first_ref('filesystem')
        fs = self.db.lookup(self.fs_uuid)
        self.mds_uuid = fs.get_first_ref('lmv')
        if not self.mds_uuid:
            self.mds_uuid = fs.get_first_ref('mds')
        self.obd_uuid = fs.get_first_ref('obd')
        self.mgmt_uuid = fs.get_first_ref('mgmt')
        client_uuid = generate_client_uuid(self.name)

        ost = self.db.lookup(self.obd_uuid)
        if not ost:
            panic("no ost: ", self.obd_uuid)

        mds = self.db.lookup(self.mds_uuid)
        if not mds:
            panic("no mds: ", self.mds_uuid)

        self.add_lustre_module('mdc', 'mdc')
        self.add_lustre_module('lmv', 'lmv')
        self.add_lustre_module('llite', 'llite')

        self.vosc = VOSC(ost, client_uuid, self.name)
        self.vmdc = VMDC(mds, client_uuid, self.name)

        if self.mgmt_uuid:
            self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
                                            client_uuid)
        else:
            self.mgmtcli = None

    def prepare(self):
        if not config.record and fs_is_mounted(self.path):
            log(self.path, "already mounted.")
            return
        if self.mgmtcli:
            self.mgmtcli.prepare()
        self.vosc.prepare()
        self.vmdc.prepare()
        vmdc_name = self.vmdc.get_name()

        self.info(self.path, self.mds_uuid, self.obd_uuid)
        if config.record or config.lctl_dump:
            lctl.mount_option(local_node_name, self.vosc.get_name(), vmdc_name)
            return
        cmd = "mount -t lustre_lite -o osc=%s,mdc=%s %s %s" % \
              (self.vosc.get_name(), vmdc_name, config.config, self.path)
        run("mkdir", self.path)
        ret, val = run(cmd)
        if ret:
            panic("mount failed:", self.path, ":", string.join(val))

    def cleanup(self):
        self.info(self.path, self.mds_uuid, self.obd_uuid)

        if config.record or config.lctl_dump:
            lctl.del_mount_option(local_node_name)
        else:
            if fs_is_mounted(self.path):
                if config.force:
                    (rc, out) = run("umount", "-f", self.path)
                else:
                    (rc, out) = run("umount", self.path)
                if rc:
                    raise CommandError('umount', out, rc)

            if fs_is_mounted(self.path):
                panic("fs is still mounted:", self.path)

        self.vmdc.cleanup()
        self.vosc.cleanup()
        if self.mgmtcli:
            self.mgmtcli.cleanup()

    def load_module(self):
        if self.mgmtcli:
            self.mgmtcli.load_module()
        self.vosc.load_module()
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        self.vosc.cleanup_module()
        if self.mgmtcli:
            self.mgmtcli.cleanup_module()

    def correct_level(self, level, op=None):
        return level

# ============================================================
# misc query functions

def get_ost_net(self, osd_uuid):
    srv_list = []
    if not osd_uuid:
        return srv_list
    osd = self.lookup(osd_uuid)
    node_uuid = osd.get_first_ref('node')
    node = self.lookup(node_uuid)
    if not node:
        panic("unable to find node for osd_uuid:", osd_uuid,
              " node_ref:", node_uuid)
    for net_uuid in node.get_networks():
        db = node.lookup(net_uuid)
        srv_list.append(Network(db))
    return srv_list

# the order of initialization is based on level.
def getServiceLevel(self):
    type = self.get_class()
    ret = 0
    if type in ('network',):
        ret = 5
    elif type in ('routetbl',):
        ret = 6
    elif type in ('ldlm',):
        ret = 20
    elif type in ('mgmt',):
        ret = 25
    elif type in ('osd', 'cobd'):
        ret = 30
    elif type in ('mdsdev',):
        ret = 40
    elif type in ('lmv',):
        ret = 45
    elif type in ('cmobd',):
        ret = 50
    elif type in ('mountpoint', 'echoclient'):
        ret = 70
    else:
        panic("Unknown type: ", type)

    if ret < config.minlevel or ret > config.maxlevel:
        ret = 0
    return ret
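
# Example (illustrative): a profile holding both a 'network' service and a
# 'mountpoint' sorts the network to a lower level, so doSetup() below brings
# the network up before anything tries to mount, while doCleanup() walks the
# same list in the opposite order.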

# return list of services in a profile. list is a list of tuples
# [(level, db_object),]
def getServices(self):
    list = []
    for ref_class, ref_uuid in self.get_all_refs():
        servdb = self.lookup(ref_uuid)
        if servdb:
            level = getServiceLevel(servdb)
            if level > 0:
                list.append((level, servdb))
        else:
            panic('service not found: ' + ref_uuid)

    list.sort()
    return list

############################################################
# FIXME: clean this mess up!
#
# OSC is no longer in the xml, so we have to fake it.
# this is getting ugly and begging for another refactoring
def get_osc(ost_db, uuid, fs_name):
    osc = OSC(ost_db, uuid, fs_name)
    return osc

def get_mdc(db, fs_name, mds_uuid):
    mds_db = db.lookup(mds_uuid)
    if not mds_db:
        error("no mds:", mds_uuid)
    mdc = MDC(mds_db, mds_uuid, fs_name)
    return mdc

############################################################
# routing ("rooting")

# list of (nettype, cluster_id, nid)
local_clusters = []

def find_local_clusters(node_db):
    global local_clusters
    for netuuid in node_db.get_networks():
        net = node_db.lookup(netuuid)
        srv = Network(net)
        debug("add_local", netuuid)
        local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
        if srv.port > 0:
            if acceptors.has_key(srv.port):
                panic("duplicate port:", srv.port)
            acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
                                                  srv.send_mem, srv.recv_mem,
                                                  srv.irq_affinity)

# This node is a gateway.
is_router = 0
def node_is_router():
    return is_router

# If there are any routers found in the config, then this will be true
# and all nodes will load kptlrouter.
needs_router = 0
def node_needs_router():
    return needs_router or is_router

# list of (nettype, gw, tgt_cluster_id, lo, hi)
# Currently, these local routes are only added to the kptlrouter route
# table if they are needed to connect to a specific server.  This
# should be changed so all available routes are loaded, and the
# ptlrouter can make all the decisions.
local_routes = []

def find_local_routes(lustre):
    """ Scan the lustre config looking for routers.  Build a list of
    routes. """
    global local_routes, needs_router
    local_routes = []
    list = lustre.lookup_class('node')
    for router in list:
        if router.get_val_int('router', 0):
            needs_router = 1
            for (local_type, local_cluster_id, local_nid) in local_clusters:
                gw = None
                for netuuid in router.get_networks():
                    db = router.lookup(netuuid)
                    if (local_type == db.get_val('nettype') and
                        local_cluster_id == db.get_val('clusterid')):
                        gw = db.get_val('nid')
                        break
                if gw:
                    debug("find_local_routes: gw is", gw)
                    for route in router.get_local_routes(local_type, gw):
                        local_routes.append(route)
    debug("find_local_routes:", local_routes)
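
# Per the tuple comment above, an entry in local_routes looks like
# (illustrative values): ('tcp', <gw nid>, <tgt cluster id>, <lo nid>, <hi nid>),
# i.e. "targets with nids in [lo, hi] in that cluster are reached via gw".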

def choose_local_server(srv_list):
    for srv in srv_list:
        if local_cluster(srv.net_type, srv.cluster_id):
            return srv

def local_cluster(net_type, cluster_id):
    for cluster in local_clusters:
        if net_type == cluster[0] and cluster_id == cluster[1]:
            return 1
    return 0

def local_interface(net_type, cluster_id, nid):
    for cluster in local_clusters:
        if (net_type == cluster[0] and cluster_id == cluster[1]
            and nid == cluster[2]):
            return 1
    return 0

def find_route(srv_list):
    result = []
    frm_type = local_clusters[0][0]
    for srv in srv_list:
        debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
        to_type = srv.net_type
        to = srv.nid
        cluster_id = srv.cluster_id
        debug('looking for route to', to_type, to)
        for r in local_routes:
            debug("find_route: ", r)
            if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
                result.append((srv, r))
    return result
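
# find_route() matches a server against the [lo, hi] nid range of each route:
# e.g. (illustrative) a srv with cluster_id 2 and nid 5 pairs with any route r
# where r[2] == 2 and r[3] <= 5 <= r[4].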

def get_active_target(db):
    target_uuid = db.getUUID()
    target_name = db.getName()
    node_name = get_select(target_name)
    if node_name:
        tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
    else:
        tgt_dev_uuid = db.get_first_ref('active')
    return tgt_dev_uuid

def get_server_by_nid_uuid(db, nid_uuid):
    for n in db.lookup_class("network"):
        net = Network(n)
        if net.nid_uuid == nid_uuid:
            return net
    return None

############################################################
# lconf level logic
# Start a service.
def newService(db):
    type = db.get_class()
    debug('Service:', type, db.getName(), db.getUUID())
    n = None
    if type == 'ldlm':
        n = LDLM(db)
    elif type == 'lov':
        n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
    elif type == 'network':
        n = Network(db)
    elif type == 'routetbl':
        n = RouteTable(db)
    elif type == 'osd':
        n = OSD(db)
    elif type == 'cobd':
        n = COBD(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
    elif type == 'cmobd':
        n = CMOBD(db)
    elif type == 'mdsdev':
        n = MDSDEV(db)
    elif type == 'mountpoint':
        n = Mountpoint(db)
    elif type == 'echoclient':
        n = ECHO_CLIENT(db)
    elif type == 'mgmt':
        n = Management(db)
    else:
        panic("unknown service type:", type)
    return n

#
# Prepare the system to run lustre using a particular profile
# in the configuration.
#  * load the modules
#  * setup networking for the current node
#  * make sure partitions are in place and prepared
#  * initialize devices with lctl
# Levels are important, and need to be enforced.
def for_each_profile(db, prof_list, operation):
    for prof_uuid in prof_list:
        prof_db = db.lookup(prof_uuid)
        if not prof_db:
            panic("profile:", prof_uuid, "not found.")
        services = getServices(prof_db)
        operation(services)

def magic_get_osc(db, rec, lov):
    if lov:
        lov_uuid = lov.get_uuid()
        lov_name = lov.osc.fs_name
    else:
        lov_uuid = rec.getAttribute('lov_uuidref')
        # FIXME: better way to find the mountpoint?
        filesystems = db.root_node.getElementsByTagName('filesystem')
        fsuuid = None
        for fs in filesystems:
            ref = fs.getElementsByTagName('obd_ref')
            if ref[0].getAttribute('uuidref') == lov_uuid:
                fsuuid = fs.getAttribute('uuid')
                break

        if not fsuuid:
            panic("malformed xml: lov uuid '" + lov_uuid + "' referenced in 'add' record is not used by any filesystems.")

        mtpts = db.root_node.getElementsByTagName('mountpoint')
        lov_name = None
        for fs in mtpts:
            ref = fs.getElementsByTagName('filesystem_ref')
            if ref[0].getAttribute('uuidref') == fsuuid:
                lov_name = fs.getAttribute('name')
                break

        if not lov_name:
            panic("malformed xml: 'add' record references lov uuid '" + lov_uuid + "', which references filesystem uuid '" + fsuuid + "', which does not reference a mountpoint.")

    print "lov_uuid: " + lov_uuid + "; lov_name: " + lov_name

    ost_uuid = rec.getAttribute('ost_uuidref')
    obd = db.lookup(ost_uuid)
    if not obd:
        panic("malformed xml: 'add' record references ost uuid '" + ost_uuid + "' which cannot be found.")

    osc = get_osc(obd, lov_uuid, lov_name)
    if not osc:
        panic('osc not found:', ost_uuid)
    return osc
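
# The fallback branch above chases XML references; a sketch with invented
# uuids (element and attribute names are the ones queried in the code):
#   <filesystem uuid="FS_UUID"> <obd_ref uuidref="LOV_UUID"/> </filesystem>
#   <mountpoint name="mnt-lustre"> <filesystem_ref uuidref="FS_UUID"/> </mountpoint>
# i.e. LOV_UUID -> owning filesystem -> mountpoint, whose name becomes lov_name.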

# write logs for update records.  sadly, logs of all types -- and updates in
# particular -- are something of an afterthought.  lconf needs to be rewritten
# with these as core concepts.  so this is a pretty big hack.
def process_update_record(db, update, lov):
    for rec in update.childNodes:
        if rec.nodeType != rec.ELEMENT_NODE:
            continue

        log("found " + rec.nodeName + " record in update version " +
            str(update.getAttribute('version')))

        lov_uuid = rec.getAttribute('lov_uuidref')
        ost_uuid = rec.getAttribute('ost_uuidref')
        index = rec.getAttribute('index')
        gen = rec.getAttribute('generation')

        if not lov_uuid or not ost_uuid or not index or not gen:
            panic("malformed xml: 'update' record requires lov_uuid, ost_uuid, index, and generation.")

        if not lov:
            tmplov = db.lookup(lov_uuid)
            if not tmplov:
                panic("malformed xml: 'delete' record contains lov UUID '" + lov_uuid + "', which cannot be located.")
            lov_name = tmplov.getName()
        else:
            lov_name = lov.osc.name

        # ------------------------------------------------------------- add
        if rec.nodeName == 'add':
            if config.cleanup:
                lctl.lov_del_obd(lov_name, lov_uuid, ost_uuid, index, gen)
                continue

            osc = magic_get_osc(db, rec, lov)

            try:
                # Only ignore connect failures with --force, which
                # isn't implemented here yet.
                osc.prepare(ignore_connect_failure=0)
            except CommandError, e:
                print "Error preparing OSC %s\n" % osc.uuid
                raise e

            lctl.lov_add_obd(lov_name, lov_uuid, ost_uuid, index, gen)

        # ------------------------------------------------------ deactivate
        elif rec.nodeName == 'deactivate':
            if config.cleanup:
                continue

            osc = magic_get_osc(db, rec, lov)

            try:
                osc.deactivate()
            except CommandError, e:
                print "Error deactivating OSC %s\n" % osc.uuid
                raise e

        # ---------------------------------------------------------- delete
        elif rec.nodeName == 'delete':
            if config.cleanup:
                continue

            osc = magic_get_osc(db, rec, lov)

            try:
                osc.cleanup()
            except CommandError, e:
                print "Error cleaning up OSC %s\n" % osc.uuid
                raise e

            lctl.lov_del_obd(lov_name, lov_uuid, ost_uuid, index, gen)
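
# A sketch of the XML this loop consumes (record and attribute names are the
# ones read above; the values are invented):
#   <update version="2">
#     <add lov_uuidref="LOV_UUID" ost_uuidref="OST_UUID" index="1" generation="2"/>
#   </update>
# 'deactivate' and 'delete' records carry the same four attributes.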

def process_updates(db, log_device, log_name, lov = None):
    updates = db.root_node.getElementsByTagName('update')
    for u in updates:
        if not u.childNodes:
            log("ignoring empty update record (version " +
                str(u.getAttribute('version')) + ")")
            continue

        version = u.getAttribute('version')
        real_name = "%s-%s" % (log_name, version)
        lctl.clear_log(log_device, real_name)
        lctl.record(log_device, real_name)

        process_update_record(db, u, lov)

        lctl.end_record()
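
# Each non-empty update gets its own config log, named "<log_name>-<version>";
# e.g. (illustrative) log_name 'mds1' and version '2' record into 'mds1-2'.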

def doWriteconf(services):
    if config.nosetup:
        return
    for s in services:
        if s[1].get_class() == 'mdsdev':
            n = newService(s[1])
            n.write_conf()

def doSetup(services):
    if config.nosetup:
        return
    slist = []
    for s in services:
        n = newService(s[1])
        n.level = s[0]
        slist.append((n.level, n))
    nlist = []
    for n in slist:
        nl = n[1].correct_level(n[0])
        nlist.append((nl, n[1]))
    nlist.sort()
    for n in nlist:
        n[1].prepare()

def doModules(services):
    if config.nomod:
        return
    for s in services:
        n = newService(s[1])
        n.load_module()

def doCleanup(services):
    if config.nosetup:
        return
    slist = []
    for s in services:
        n = newService(s[1])
        n.level = s[0]
        slist.append((n.level, n))
    nlist = []
    for n in slist:
        nl = n[1].correct_level(n[0])
        nlist.append((nl, n[1]))
    nlist.sort()
    nlist.reverse()
    for n in nlist:
        if n[1].safe_to_clean():
            n[1].cleanup()

def doUnloadModules(services):
    if config.nomod:
        return
    services.reverse()
    for s in services:
        n = newService(s[1])
        if n.safe_to_clean_modules():
            n.cleanup_module()

def doHost(lustreDB, hosts):
    global is_router, local_node_name
    node_db = None
    for h in hosts:
        node_db = lustreDB.lookup_name(h, 'node')
        if node_db:
            break
    if not node_db:
        panic('No host entry found.')

    local_node_name = node_db.get_val('name', 0)
    is_router = node_db.get_val_int('router', 0)
    lustre_upcall = node_db.get_val('lustreUpcall', '')
    portals_upcall = node_db.get_val('portalsUpcall', '')
    timeout = node_db.get_val_int('timeout', 0)
    ptldebug = node_db.get_val('ptldebug', '')
    subsystem = node_db.get_val('subsystem', '')

    find_local_clusters(node_db)
    if not is_router:
        find_local_routes(lustreDB)

    # Two step process: (1) load modules, (2) setup lustre
    # if not cleaning, load modules first.
    prof_list = node_db.get_refs('profile')

    if config.write_conf:
        for_each_profile(node_db, prof_list, doModules)
        for_each_profile(node_db, prof_list, doWriteconf)
        for_each_profile(node_db, prof_list, doUnloadModules)

    elif config.recover:
        if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
            raise Lustre.LconfError("--recover requires --tgt_uuid <UUID> " +
                                    "--client_uuid <UUID> --conn_uuid <UUID>")
        doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
                   config.conn_uuid)

    elif config.cleanup:
        if config.force:
            # the command line can override this value
            timeout = 5
        # ugly hack, only need to run lctl commands for --dump
        if config.lctl_dump or config.record:
            for_each_profile(node_db, prof_list, doCleanup)
            return

        sys_set_timeout(timeout)
        sys_set_ptldebug(ptldebug)
        sys_set_subsystem(subsystem)
        sys_set_lustre_upcall(lustre_upcall)
        sys_set_portals_upcall(portals_upcall)

        for_each_profile(node_db, prof_list, doCleanup)
        for_each_profile(node_db, prof_list, doUnloadModules)

    else:
        # ugly hack, only need to run lctl commands for --dump
        if config.lctl_dump or config.record:
            sys_set_timeout(timeout)
            sys_set_lustre_upcall(lustre_upcall)
            for_each_profile(node_db, prof_list, doSetup)
            return

        sys_make_devices()
        sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
        sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)

        for_each_profile(node_db, prof_list, doModules)

        sys_set_debug_path()
        sys_set_ptldebug(ptldebug)
        sys_set_subsystem(subsystem)
        script = config.gdb_script
        run(lctl.lctl, ' modules >', script)
        if config.gdb:
            log("The GDB module script is in", script)
            # pause, so user has time to break and
            # load the script
            time.sleep(5)
        sys_set_timeout(timeout)
        sys_set_lustre_upcall(lustre_upcall)
        sys_set_portals_upcall(portals_upcall)

        for_each_profile(node_db, prof_list, doSetup)

def doRecovery(lustreDB, lctl, tgt_uuid, client_uuid, nid_uuid):
    tgt = lustreDB.lookup(tgt_uuid)
    if not tgt:
        raise Lustre.LconfError("doRecovery: " + tgt_uuid + " not found.")
    new_uuid = get_active_target(tgt)
    if not new_uuid:
        raise Lustre.LconfError("doRecovery: no active target found for: " +
                                tgt_uuid)
    net = choose_local_server(get_ost_net(lustreDB, new_uuid))
    if not net:
        raise Lustre.LconfError("Unable to find a connection to: " + new_uuid)

    log("Reconnecting", tgt_uuid, " to ", net.nid_uuid)

    try:
        oldnet = get_server_by_nid_uuid(lustreDB, nid_uuid)
        if oldnet:
            lctl.disconnect(oldnet)
    except CommandError, e:
        log("recover: disconnect", nid_uuid, "failed: ")
        e.dump()

    try:
        lctl.connect(net)
    except CommandError, e:
        log("recover: connect failed")
        e.dump()

    lctl.recover(client_uuid, net.nid_uuid)
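
# Typical invocation (uuids invented for illustration):
#   lconf --recover --tgt_uuid <TGT_UUID> --client_uuid <CLIENT_UUID> \
#         --conn_uuid <CONN_UUID> config.xml
# i.e. disconnect the failed nid, connect to the active failover server, and
# ask the client to recover.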

def setupModulePath(cmd, portals_dir = PORTALS_DIR):
    base = os.path.dirname(cmd)
    if development_mode():
        if not config.lustre:
            debug('using objdir module paths')
            config.lustre = (os.path.join(base, ".."))
        # normalize the portals dir, using the command line arg if set
        if config.portals:
            portals_dir = config.portals
        dir = os.path.join(config.lustre, portals_dir)
        config.portals = dir
        debug('config.portals', config.portals)
    elif config.lustre and config.portals:
        # if --lustre and --portals are given, normalize portals;
        # PORTALS_DIR can be ignored here, since it is probably useless.
        config.portals = os.path.join(config.lustre, config.portals)
        debug('config.portals B', config.portals)

def sysctl(path, val):
    debug("+ sysctl", path, val)
    if config.noexec:
        return
    try:
        fp = open(os.path.join('/proc/sys', path), 'w')
        fp.write(str(val))
        fp.close()
    except IOError, e:
        panic(str(e))
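
# e.g. sysctl('portals/debug', '0xffffffff') writes the mask to
# /proc/sys/portals/debug (value shown is illustrative).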

def sys_set_debug_path():
    sysctl('portals/debug_path', config.debug_path)

def sys_set_lustre_upcall(upcall):
    # the command line overrides the value in the node config
    if config.lustre_upcall:
        upcall = config.lustre_upcall
    elif config.upcall:
        upcall = config.upcall
    if upcall:
        lctl.set_lustre_upcall(upcall)

def sys_set_portals_upcall(upcall):
    # the command line overrides the value in the node config
    if config.portals_upcall:
        upcall = config.portals_upcall
    elif config.upcall:
        upcall = config.upcall
    if upcall:
        sysctl('portals/upcall', upcall)

def sys_set_timeout(timeout):
    # the command line overrides the value in the node config
    if config.timeout and config.timeout > 0:
        timeout = config.timeout
    if timeout != None and timeout > 0:
        lctl.set_timeout(timeout)

def sys_tweak_socknal():
    if config.single_socket:
        sysctl("socknal/typed", 0)

def sys_optimize_elan():
    procfiles = ["/proc/elan/config/eventint_punt_loops",
                 "/proc/qsnet/elan3/config/eventint_punt_loops",
                 "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]
    for p in procfiles:
        if os.access(p, os.R_OK):
            run("echo 1 > " + p)

def sys_set_ptldebug(ptldebug):
    if config.ptldebug:
        ptldebug = config.ptldebug
    if ptldebug:
        try:
            val = eval(ptldebug, ptldebug_names)
            val = "0x%x" % (val)
            sysctl('portals/debug', val)
        except NameError, e:
            panic(str(e))

def sys_set_subsystem(subsystem):
    if config.subsystem:
        subsystem = config.subsystem
    if subsystem:
        try:
            val = eval(subsystem, subsystem_names)
            val = "0x%x" % (val)
            sysctl('portals/subsystem_debug', val)
        except NameError, e:
            panic(str(e))
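
# The debug masks are python expressions evaluated against the name tables
# defined near the top of this file; e.g. (illustrative)
# --ptldebug "dlmtrace|rpctrace" ORs the two named bits into a single hex
# mask before it is written via sysctl().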

def sys_set_netmem_max(path, max):
    debug("setting", path, "to at least", max)
    if config.noexec:
        return
    fp = open(path)
    line = fp.readline()
    fp.close()
    cur = int(line)
    if max > cur:
        fp = open(path, 'w')
        fp.write('%d\n' % (max))
        fp.close()

def sys_make_devices():
    if not os.access('/dev/portals', os.R_OK):
        run('mknod /dev/portals c 10 240')
    if not os.access('/dev/obd', os.R_OK):
        run('mknod /dev/obd c 10 241')

# Add dir to the global PATH, if not already there.
def add_to_path(new_dir):
    syspath = string.split(os.environ['PATH'], ':')
    if new_dir in syspath:
        return
    os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir

def default_debug_path():
    path = '/tmp/lustre-log'
    if os.path.isdir('/r'):
        return '/r' + path
    else:
        return path

def default_gdb_script():
    script = '/tmp/ogdb'
    if os.path.isdir('/r'):
        return '/r' + script
    else:
        return script

DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
# ensure basic elements are in the system path
def sanitise_path():
    for dir in DEFAULT_PATH:
        add_to_path(dir)

# global hack for the --select handling
tgt_select = {}
def init_select(args):
    # args = [service=nodeA,service2=nodeB service3=nodeC]
    global tgt_select
    for arg in args:
        list = string.split(arg, ',')
        for entry in list:
            srv, node = string.split(entry, '=')
            tgt_select[srv] = node

def get_select(srv):
    if tgt_select.has_key(srv):
        return tgt_select[srv]
    return None
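
# e.g. --select mds1=nodeA,ost1=nodeB leaves tgt_select == {'mds1': 'nodeA',
# 'ost1': 'nodeB'}, so get_select('mds1') returns 'nodeA' (service and node
# names are illustrative).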

FLAG = Lustre.Options.FLAG
PARAM = Lustre.Options.PARAM
INTPARAM = Lustre.Options.INTPARAM
PARAMLIST = Lustre.Options.PARAMLIST

lconf_options = [
    ('verbose,v', "Print system commands as they are run"),
    ('ldapurl', "LDAP server URL, e.g. ldap://localhost", PARAM),
    ('config', "Cluster config name used for LDAP query", PARAM),
    ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
    ('node', "Load config for <nodename>", PARAM),
    ('cleanup,d', "Cleans up config. (Shutdown)"),
    ('force,f', "Forced unmounting and/or obd detach during cleanup",
     FLAG, 0),
    ('single_socket', "socknal option: only use one socket instead of bundle",
     FLAG, 0),
    ('failover', """Used to shut down without saving state.
                    This will allow this node to "give up" a service to
                    another node for failover purposes. This will not
                    be a clean shutdown.""",
     FLAG, 0),
    ('gdb', """Prints message after creating gdb module script
               and sleeps for 5 seconds."""),
    ('noexec,n', """Prints the commands and steps that will be run for a
                    config without executing them. This can be used to check
                    whether a config file is doing what it should."""),
    ('nomod', "Skip load/unload module step."),
    ('nosetup', "Skip device setup/cleanup step."),
    ('reformat', "Reformat all devices (without question)"),
    ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
    ('mountfsoptions', "Additional options for mount fs command line", PARAM),
    ('dump', "Dump the kernel debug log to file before portals is unloaded",
     PARAM),
    ('write_conf', "Save all the client config information on mds."),
    ('record', "Write config information on mds."),
    ('record_log', "Name of config record log.", PARAM),
    ('record_device', "MDS device name that will record the config commands",
     PARAM),
    ('minlevel', "Minimum level of services to configure/cleanup",
     INTPARAM, 0),
    ('maxlevel', """Maximum level of services to configure/cleanup
                    Levels are approximately like:
                            70 - mountpoint, echo_client, osc, mdc, lov""",
     INTPARAM, 100),
    ('lustre', """Base directory of lustre sources. This parameter will
                  cause lconf to load modules from a source tree.""", PARAM),
    ('portals', """Portals source directory.  If this is a relative path,
                   then it is assumed to be relative to lustre.""", PARAM),
    ('timeout', "Set recovery timeout", INTPARAM),
    ('upcall', "Set both portals and lustre upcall script", PARAM),
    ('lustre_upcall', "Set lustre upcall script", PARAM),
    ('portals_upcall', "Set portals upcall script", PARAM),
    ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
    ('ptldebug', "Set the portals debug level", PARAM),
    ('subsystem', "Set the portals debug subsystem", PARAM),
    ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
    ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
    # Client recovery options
    ('recover', "Recover a device"),
    ('group', "The group of devices to configure or cleanup", PARAM),
    ('tgt_uuid', "The failed target (required for recovery)", PARAM),
    ('client_uuid', "The failed client (required for recovery)", PARAM),
    ('conn_uuid', "The failed connection (required for recovery)", PARAM),
    ('inactive', """The name of an inactive service, to be ignored during
                    mounting (currently OST-only).  Can be repeated.""",
     PARAMLIST),
    ]
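
# Typical command lines (illustrative):
#   lconf --reformat --node nodeA config.xml    # format devices and start
#   lconf --cleanup --node nodeA config.xml     # shut everything down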

def main():
    global lctl, config, toplustreDB, CONFIG_FILE

    # in the upcall this is set to SIG_IGN
    signal.signal(signal.SIGCHLD, signal.SIG_DFL)

    cl = Lustre.Options("lconf", "config.xml", lconf_options)
    try:
        config, args = cl.parse(sys.argv[1:])
    except Lustre.OptionError, e:
        print e
        sys.exit(1)

    setupModulePath(sys.argv[0])

    host = socket.gethostname()

    # the PRNG is normally seeded with time(), which is not so good for
    # starting time-synchronized clusters
    input = open('/dev/urandom', 'r')
    if not input:
        print 'Unable to open /dev/urandom!'
        sys.exit(1)
    seed = input.read(32)
    input.close()
    random.seed(seed)

    sanitise_path()

    init_select(config.select)

    if len(args) > 0:
        # allow config to be fetched via HTTP, but only with python2
        if sys.version[0] != '1' and args[0].startswith('http://'):
            import urllib2
            try:
                config_file = urllib2.urlopen(args[0])
            except (urllib2.URLError, socket.error), err:
                if hasattr(err, 'args'):
                    err = err.args[1]
                print "Could not access '%s': %s" % (args[0], err)
                sys.exit(1)
        elif not os.access(args[0], os.R_OK):
            print 'File not found or readable:', args[0]
            sys.exit(1)
        else:
            # regular file
            config_file = open(args[0], 'r')
        try:
            dom = xml.dom.minidom.parse(config_file)
        except Exception:
            panic("%s does not appear to be a config file." % (args[0]))
            sys.exit(1) # make sure to die here, even in debug mode.
        CONFIG_FILE = args[0]
        lustreDB = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
        if not config.config:
            config.config = os.path.basename(args[0])  # use full path?
            if config.config[-4:] == '.xml':
                config.config = config.config[:-4]
    elif config.ldapurl:
        if not config.config:
            panic("--ldapurl requires --config name")
        dn = "config=%s,fs=lustre" % (config.config)
        lustreDB = Lustre.LustreDB_LDAP('', {}, base=dn, url=config.ldapurl)
    elif config.ptldebug or config.subsystem:
        sys_set_ptldebug(None)
        sys_set_subsystem(None)
        sys.exit(0)
    else:
        print 'Missing config file or ldap URL.'
        print 'see lconf --help for command summary'
        sys.exit(1)

    toplustreDB = lustreDB

    ver = lustreDB.get_version()
    if not ver:
        panic("No version found in config data, please recreate.")
    if ver != Lustre.CONFIG_VERSION:
        panic("Config version", ver, "does not match lconf version",
              Lustre.CONFIG_VERSION)

    node_list = []
    if config.node:
        node_list.append(config.node)
    else:
        if len(host) > 0:
            node_list.append(host)
        node_list.append('localhost')

    debug("configuring for host: ", node_list)

    if len(host) > 0:
        config.debug_path = config.debug_path + '-' + host
        config.gdb_script = config.gdb_script + '-' + host

    lctl = LCTLInterface('lctl')

    if config.lctl_dump:
        lctl.use_save_file(config.lctl_dump)

    if config.record:
        if not (config.record_device and config.record_log):
            panic("When recording, both --record_log and --record_device must be specified.")
        lctl.clear_log(config.record_device, config.record_log)
        lctl.record(config.record_device, config.record_log)

    doHost(lustreDB, node_list)

    if not config.record:
        return

    lctl.end_record()

    process_updates(lustreDB, config.record_device, config.record_log)

if __name__ == "__main__":
    try:
        main()
    except Lustre.LconfError, e:
        print e
#        traceback.print_exc(file=sys.stdout)
        sys.exit(1)
    except CommandError, e:
        e.dump()
        sys.exit(e.rc)

    if first_cleanup_error:
        sys.exit(first_cleanup_error)