3 # Copyright (C) 2002-2003 Cluster File Systems, Inc.
4 # Authors: Robert Read <rread@clusterfs.com>
5 # Mike Shaver <shaver@clusterfs.com>
6 # This file is part of Lustre, http://www.lustre.org.
8 # Lustre is free software; you can redistribute it and/or
9 # modify it under the terms of version 2 of the GNU General Public
10 # License as published by the Free Software Foundation.
12 # Lustre is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Lustre; if not, write to the Free Software
19 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 # lconf - lustre configuration tool
23 # lconf is the main driver script for starting and stopping
24 # lustre filesystem services.
26 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import sys, getopt, types
29 import string, os, stat, popen2, socket, time, random, fcntl, select
30 import re, exceptions, signal, traceback
31 import xml.dom.minidom
33 if sys.version[0] == '1':
34 from FCNTL import F_GETFL, F_SETFL
36 from fcntl import F_GETFL, F_SETFL
PYMOD_DIR = "/usr/lib/lustre/python"

def development_mode():
    # True when lconf is run from a build tree (a Makefile sits next to the
    # script); used below to prefer in-tree python modules.
    base = os.path.dirname(sys.argv[0])
    if os.access(base+"/Makefile", os.R_OK):
        # [gap: source lines 43-45 missing - presumably the return values]

if development_mode():
    sys.path.append('../utils')
# [gap: source line 48 missing - presumably "else:"]
sys.path.append(PYMOD_DIR)

# [gap: source lines 50-54 missing]
# Default socket buffer size (bytes) handed to the TCP acceptor.
DEFAULT_TCPBUF = 8388608

# Maximum number of devices to search for.
# (the /dev/loop* nodes need to be created beforehand)
MAX_LOOP_DEVICES = 256
PORTALS_DIR = 'portals'

# Needed to call lconf --record

# Please keep these in sync with the values in portals/kp30.h
# [gap: most entries of the debug-mask table are missing from this capture;
#  presumably this is the ptldebug bit-name dict - TODO confirm]
    "warning" : (1 << 10),
    "portals" : (1 << 14),
    "dlmtrace" : (1 << 16),
    "rpctrace" : (1 << 20),
    "vfstrace" : (1 << 21),

# [gap: presumably the subsystem bit-name dict starts here - TODO confirm]
    "undefined" : (1 << 0),
    "portals" : (1 << 10),
    "socknal" : (1 << 11),
    "qswnal" : (1 << 12),
    "pinger" : (1 << 13),
    "filter" : (1 << 14),
    "ptlrouter" : (1 << 20),
# Latch for the FIRST non-zero cleanup status seen; later failures are still
# reported but must not mask the original cause.
first_cleanup_error = 0

def cleanup_error(rc):
    """Record rc as the overall cleanup status unless one is already set."""
    global first_cleanup_error
    # 'x or rc' keeps an already-latched non-zero value, otherwise takes rc.
    first_cleanup_error = first_cleanup_error or rc
# ============================================================
# debugging and error funcs

def fixme(msg = "this feature"):
    # Abort with an LconfError for not-yet-implemented code paths (py2 raise).
    raise Lustre.LconfError, msg + ' not implemented yet.'

# [gap: lines 133-134 missing - presumably "def panic(*args):"]
    msg = string.join(map(str,args))
    if not config.noexec:
        raise Lustre.LconfError(msg)
    # [gap: lines 138-141 missing - presumably the --noexec print path and
    #  "def log(*args):"]
    msg = string.join(map(str,args))
    # [gap: lines 143-146 missing - presumably print, and a helper looping
    #  over message lines]
    print string.strip(s)
    # [gap: lines 148-150 missing - presumably "def debug(*args):" guarded
    #  by a verbosity flag - TODO confirm]
    msg = string.join(map(str,args))

# ack, python's builtin int() does not support '0x123' syntax.
# eval can do it, although what a hack!
# [gap: lines 156-158 missing - presumably "def my_int(s):" with a try:]
        # SECURITY NOTE(review): eval() of the input string is the documented
        # hack; empty globals/locals only partially restrict it.
        return eval(s, {}, {})
    # [gap: lines 160-161 missing]
    except SyntaxError, e:
        raise ValueError("not a number")
    # [gap: line 164 missing - presumably "except NameError, e:"]
        raise ValueError("not a number")
# ============================================================
# locally defined exceptions
class CommandError (exceptions.Exception):
    # Raised when an external command (lctl, mkfs, ...) fails; carries the
    # command name, its output, and (optionally) its return code.
    def __init__(self, cmd_name, cmd_err, rc=None):
        self.cmd_name = cmd_name
        self.cmd_err = cmd_err
        # [gap: lines 173-176 missing - presumably "self.rc = rc", then
        #  "def dump(self):" and an "if self.rc:" guard - TODO confirm]
        if type(self.cmd_err) == types.StringType:
            # [gap: line 178 missing - presumably "if self.rc:"]
            print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
            # [gap: line 180 missing - presumably "else:"]
            print "! %s: %s" % (self.cmd_name, self.cmd_err)
        elif type(self.cmd_err) == types.ListType:
            # [gap: line 183 missing - presumably "if self.rc:"]
            print "! %s (error %d):" % (self.cmd_name, self.rc)
            # [gap: line 185 missing - presumably "else:"]
            print "! %s:" % (self.cmd_name)
            for s in self.cmd_err:
                print "> %s" %(string.strip(s))
# ============================================================
# handle daemons, like the acceptor
# [gap: line 195 missing - presumably "class DaemonHandler:"]
    """ Manage starting and stopping a daemon. Assumes daemon manages
    it's own pid file. """

    def __init__(self, cmd):
        # [gap: lines 200-204 missing - presumably stores cmd and defines
        #  the start() method whose body follows]
            log(self.command, "already running.")
        # [gap: line 206 missing]
        self.path = find_prog(self.command)
        # [gap: line 208 missing - presumably "if not self.path:"]
            panic(self.command, "not found.")
        ret, out = runcmd(self.path +' '+ self.command_line())
        # [gap: line 211 missing - presumably "if ret:"]
            raise CommandError(self.path, out, ret)

    # [gap: lines 213-215 missing - presumably "def stop(self):" with a
    #  running() check]
            pid = self.read_pidfile()
            # [gap: line 217 missing - presumably "try:"]
                log ("killing process", pid)
                # [gap: line 219 missing - presumably os.kill of pid]
                #time.sleep(1) # let daemon die
            # [gap: line 221 missing - presumably "except OSError, e:"]
                log("unable to kill", self.command, e)
            # [gap: line 223 missing - presumably "if self.running():"]
                log("unable to kill", self.command)

    # [gap: lines 225-226 missing - presumably "def running(self):"]
        pid = self.read_pidfile()
        # [gap: lines 228-236 missing - liveness check on pid - TODO confirm]

    def read_pidfile(self):
        # [gap: line 238 missing - presumably "try:"]
            fp = open(self.pidfile(), 'r')
            # [gap: lines 240-245 missing - read pid, close, return; except]

    def clean_pidfile(self):
        """ Remove a stale pidfile """
        log("removing stale pidfile:", self.pidfile())
        # [gap: line 249 missing - presumably "try:"]
            os.unlink(self.pidfile())
        # [gap: line 251 missing - presumably "except OSError, e:"]
            log(self.pidfile(), e)
class AcceptorHandler(DaemonHandler):
    # Drives the portals TCP "acceptor" daemon for one listening port.
    def __init__(self, port, net_type, send_mem, recv_mem, irq_aff):
        DaemonHandler.__init__(self, "acceptor")
        # [gap: lines 257-258 missing - presumably self.port/self.flags init]
        self.send_mem = send_mem
        self.recv_mem = recv_mem
        # [gap: lines 261-262 missing - presumably "if irq_aff:" guard]
        self.flags = self.flags + ' -i'

    # [gap: line 265 missing - presumably "def pidfile(self):"]
        return "/var/run/%s-%d.pid" % (self.command, self.port)

    def command_line(self):
        # py2 string.join(seq) joins the pieces with single spaces.
        return string.join(map(str,('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port)))

# [gap: lines 270-272 missing - presumably the acceptors registry dict plus
#  "def run_acceptors():" - TODO confirm]
# start the acceptors
    if config.lctl_dump or config.record:
        # [gap: line 276 missing - presumably "return"]
    for port in acceptors.keys():
        daemon = acceptors[port]
        if not daemon.running():
            # [gap: lines 280-281 missing - presumably daemon.start()]

def run_one_acceptor(port):
    if config.lctl_dump or config.record:
        # [gap: line 284 missing - presumably "return"]
    if acceptors.has_key(port):
        daemon = acceptors[port]
        if not daemon.running():
            # [gap: lines 288-289 missing - presumably daemon.start(); else:]
        panic("run_one_acceptor: No acceptor defined for port:", port)

def stop_acceptor(port):
    if acceptors.has_key(port):
        daemon = acceptors[port]
        # [gap: lines 295-297 missing - presumably daemon.stop()]
300 # handle lctl interface
303 Manage communication with lctl
306 def __init__(self, cmd):
308 Initialize close by finding the lctl binary.
310 self.lctl = find_prog(cmd)
312 self.record_device = ''
315 debug('! lctl not found')
318 raise CommandError('lctl', "unable to find lctl binary.")
320 def use_save_file(self, file):
321 self.save_file = file
323 def record(self, dev_name, logname):
324 log("Recording log", logname, "on", dev_name)
325 self.record_device = dev_name
326 self.record_log = logname
328 def end_record(self):
329 log("End recording log", self.record_log, "on", self.record_device)
330 self.record_device = None
331 self.record_log = None
333 def set_nonblock(self, fd):
334 fl = fcntl.fcntl(fd, F_GETFL)
335 fcntl.fcntl(fd, F_SETFL, fl | os.O_NDELAY)
340 the cmds are written to stdin of lctl
341 lctl doesn't return errors when run in script mode, so
343 should modify command line to accept multiple commands, or
344 create complex command line options
348 cmds = '\n dump ' + self.save_file + '\n' + cmds
349 elif self.record_device:
353 %s""" % (self.record_device, self.record_log, cmds)
355 debug("+", cmd_line, cmds)
356 if config.noexec: return (0, [])
358 child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
359 child.tochild.write(cmds + "\n")
360 child.tochild.close()
361 # print "LCTL:", cmds
363 # From "Python Cookbook" from O'Reilly
364 outfile = child.fromchild
365 outfd = outfile.fileno()
366 self.set_nonblock(outfd)
367 errfile = child.childerr
368 errfd = errfile.fileno()
369 self.set_nonblock(errfd)
371 outdata = errdata = ''
374 ready = select.select([outfd,errfd],[],[]) # Wait for input
375 if outfd in ready[0]:
376 outchunk = outfile.read()
377 if outchunk == '': outeof = 1
378 outdata = outdata + outchunk
379 if errfd in ready[0]:
380 errchunk = errfile.read()
381 if errchunk == '': erreof = 1
382 errdata = errdata + errchunk
383 if outeof and erreof: break
384 # end of "borrowed" code
387 if os.WIFEXITED(ret):
388 rc = os.WEXITSTATUS(ret)
391 if rc or len(errdata):
392 raise CommandError(self.lctl, errdata, rc)
395 def runcmd(self, *args):
397 run lctl using the command line
399 cmd = string.join(map(str,args))
400 debug("+", self.lctl, cmd)
401 rc, out = run(self.lctl, cmd)
403 raise CommandError(self.lctl, out, rc)
    def clear_log(self, dev, log):
        """ clear an existing log """
        # [gap: lines 409-412 missing - the lctl heredoc for clear_log]
        quit """ % (dev, log)

    def network(self, net, nid):
        # [gap: lines 417-420 missing - heredoc bringing the network up]
        quit """ % (net, nid)

    # create a new connection
    def add_uuid(self, net_type, uuid, nid):
        cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type)
        # [gap: lines 427-428 missing - presumably self.run(cmds)]

    def add_autoconn(self, net_type, send_mem, recv_mem, nid, hostaddr,
        # [gap: lines 430-435 missing - remaining parameters and heredoc head]
        if net_type in ('tcp',) and not config.lctl_dump:
            add_autoconn %s %s %d %s
            # [gap: lines 437-439 missing]
            nid, hostaddr, port, flags )
            # [gap: lines 441-442 missing]

    def connect(self, srv):
        self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
        if srv.net_type in ('tcp',) and not config.lctl_dump:
            # [gap: lines 446-448 missing - presumably flags computation]
            self.add_autoconn(srv.net_type, srv.send_mem, srv.recv_mem,
                              srv.nid, srv.hostaddr, srv.port, flags)

    def recover(self, dev_name, new_conn):
        # [gap: lines 454-455 missing - heredoc head]
        recover %s""" %(dev_name, new_conn)
        # [gap: lines 457-458 missing]

    # add a route to a range
    def add_route(self, net, gw, lo, hi):
        # [gap: lines 461-467 missing - heredoc and a try: block]
        except CommandError, e:
            # [gap: lines 469-471 missing]

    def del_route(self, net, gw, lo, hi):
        # [gap: lines 473-476 missing - heredoc head]
        quit """ % (net, gw, lo, hi)

    # add a route to a host
    def add_route_host(self, net, uuid, gw, tgt):
        self.add_uuid(net, uuid, tgt)
        # [gap: lines 483-489 missing - heredoc and a try: block]
        except CommandError, e:
            # [gap: lines 491-493 missing]

    # add a route to a range
    # NOTE(review): comment above looks copy-pasted - this method DELETES a
    # host route.
    def del_route_host(self, net, uuid, gw, tgt):
        # [gap: lines 496-500 missing - heredoc head]
        quit """ % (net, gw, tgt)

    def del_autoconn(self, net_type, nid, hostaddr):
        if net_type in ('tcp',) and not config.lctl_dump:
            # [gap: lines 507-514 missing - heredoc]

    # disconnect one connection
    def disconnect(self, srv):
        self.del_uuid(srv.nid_uuid)
        if srv.net_type in ('tcp',) and not config.lctl_dump:
            self.del_autoconn(srv.net_type, srv.nid, srv.hostaddr)

    def del_uuid(self, uuid):
        # [gap: lines 522-528 missing]

    def disconnectAll(self, net):
        # [gap: lines 530-536 missing]

    def attach(self, type, name, uuid):
        # [gap: lines 538-539 missing - heredoc head]
        quit""" % (type, name, uuid)
        # [gap: lines 541-542 missing]

    def setup(self, name, setup = ""):
        # [gap: lines 544-546 missing - heredoc head]
        quit""" % (name, setup)

    # create a new device with lctl
    def newdev(self, type, name, uuid, setup = ""):
        self.attach(type, name, uuid);
        # [gap: line 554 missing - presumably "try:"]
            self.setup(name, setup)
        except CommandError, e:
            # On setup failure, detach the half-created device again.
            self.cleanup(name, uuid, 0)
            # [gap: lines 558-561 missing - presumably re-raise]

    def cleanup(self, name, uuid, force, failover = 0):
        if failover: force = 1
        # [gap: lines 564-568 missing - heredoc head]
        quit""" % (name, ('', 'force')[force],
                   ('', 'failover')[failover])
        # [gap: lines 571-573 missing]

    def lov_setup(self, name, uuid, desc_uuid, stripe_cnt,
                  stripe_sz, stripe_off, pattern):
        # [gap: lines 576-577 missing - heredoc head]
        lov_setup %s %d %d %d %s
        quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off, pattern)

    # add an OBD to a LOV
    def lov_add_obd(self, name, uuid, obd_uuid, index, gen):
        # [gap: line 584 missing - heredoc head; note 'uuid' is unused in the
        #  visible format arguments]
        lov_modify_tgts add %s %s %s %s
        quit""" % (name, obd_uuid, index, gen)
        # [gap: lines 587-589 missing]

    def lmv_setup(self, name, uuid, desc_uuid, devlist):
        # [gap: lines 591-593 missing - heredoc head]
        quit""" % (name, uuid, desc_uuid, devlist)

    # delete an OBD from a LOV
    def lov_del_obd(self, name, uuid, obd_uuid, index, gen):
        # [gap: line 599 missing - heredoc head]
        lov_modify_tgts del %s %s %s %s
        quit""" % (name, obd_uuid, index, gen)
        # [gap: lines 602-604 missing]

    def deactivate(self, name):
        # [gap: lines 606-612 missing]

    def dump(self, dump_file):
        # [gap: lines 614-615 missing - heredoc head]
        quit""" % (dump_file)

    # get list of devices
    def device_list(self):
        devices = '/proc/fs/lustre/devices'
        # [gap: line 622 missing - presumably result list init]
        if os.access(devices, os.R_OK):
            # [gap: line 624 missing - presumably "try:"]
            fp = open(devices, 'r')
            # [gap: lines 626-632 missing - read lines, close, return]

    def lustre_version(self):
        rc, out = self.runcmd('version')
        # [gap: lines 635-637 missing - presumably parse and return version]

    def mount_option(self, profile, osc, mdc):
        # [gap: line 639 missing - heredoc head]
        mount_option %s %s %s
        quit""" % (profile, osc, mdc)
        # [gap: lines 642-643 missing]

    # delete mount options
    def del_mount_option(self, profile):
        # [gap: lines 646-650 missing]

    def set_timeout(self, timeout):
        # [gap: lines 652-656 missing]

    def set_lustre_upcall(self, upcall):
        # [gap: lines 658-661 missing]
# ============================================================
# Various system-level functions
# (ideally moved to their own module)

# Run a command and return the output and status.
# stderr is sent to /dev/null, could use popen3 to
# save it if necessary
# [gap: lines 669-670 missing - presumably "def runcmd(cmd):"]
    if config.noexec: return (0, [])
    f = os.popen(cmd + ' 2>&1')
    # [gap: lines 673-681 missing - read output, close, return (ret, out)]

# [gap: presumably "def run(*args):" wrapping runcmd with joined args]
    cmd = string.join(map(str,args))
    # [gap: lines 683-684 missing]

# Run a command in the background.
def run_daemon(*args):
    cmd = string.join(map(str,args))
    # [gap: line 688 missing]
    if config.noexec: return 0
    f = os.popen(cmd + ' 2>&1')
    # [gap: lines 691-697 missing]

# Determine full path to use for an external command
# searches dirname(argv[0]) first, then PATH
# [gap: line 700 missing - presumably "def find_prog(cmd):"]
    syspath = string.split(os.environ['PATH'], ':')
    cmdpath = os.path.dirname(sys.argv[0])
    syspath.insert(0, cmdpath);
    # [gap: line 704 missing - presumably "if config.portals:"]
        syspath.insert(0, os.path.join(config.portals, 'utils/'))
    # [gap: line 706 missing - presumably "for d in syspath:"]
        prog = os.path.join(d,cmd)
        if os.access(prog, os.X_OK):
            # [gap: lines 709-711 missing - return prog / not-found result]

# Recursively look for file starting at base dir
def do_find_file(base, mod):
    fullname = os.path.join(base, mod)
    if os.access(fullname, os.R_OK):
        # [gap: line 716 missing - presumably "return fullname"]
    for d in os.listdir(base):
        dir = os.path.join(base,d)
        if os.path.isdir(dir):
            module = do_find_file(dir, mod)
            # [gap: lines 721-723 missing - presumably return module if found]

def find_module(src_dir, dev_dir, modname):
    # Try .ko (2.6 kernels) then .o (2.4 kernels) in the expected directory.
    modbase = src_dir +'/'+ dev_dir +'/'+ modname
    for modext in '.ko', '.o':
        module = modbase + modext
        # [gap: line 728 missing]
        if os.access(module, os.R_OK):
            # [gap: lines 730-734 missing - presumably return module, with a
            #  do_find_file() fallback - TODO confirm]

# is the path a block device?
# [gap: lines 736-741 missing - presumably "def is_block(dev):" doing an
#  os.stat inside a try:]
    return stat.S_ISBLK(s[stat.ST_MODE])
# build fs according to type
# [gap: line 745 missing]
def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
    # [gap: lines 747-751 missing - presumably option defaults and the
    #  minimum-size sanity check guarding the panic below]
        panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
        # [gap: line 753 missing - remainder of the panic() call]
    # devsize is in 1k, and fs block count is in 4k
    block_cnt = devsize/4
    # [gap: line 756 missing]
    if fstype in ('ext3', 'extN', 'ldiskfs'):
        # ext3 journal size is in megabytes
        # [gap: lines 759-760 missing - presumably "if jsize == 0:" guard]
        if not is_block(dev):
            # Regular file: take its size from "ls -l" (field 5, bytes -> KB).
            ret, out = runcmd("ls -l %s" %dev)
            devsize = int(string.split(out[0])[4]) / 1024
        # [gap: line 764 missing - presumably "else:" (real block device)]
            ret, out = runcmd("sfdisk -s %s" %dev)
            devsize = int(out[0])
        if devsize > 1024 * 1024:
            # [gap: lines 769-770 missing around this journal heuristic]
            jsize = ((devsize / 102400) * 4)
        if jsize: jopt = "-J size=%d" %(jsize,)
        if isize: iopt = "-I %d" %(isize,)
        mkfs = 'mkfs.ext2 -j -b 4096 '
        if not isblock or config.force:
            # [gap: line 775 missing - presumably force flag appended]
    elif fstype == 'reiserfs':
        # reiserfs journal size is in blocks
        if jsize: jopt = "--journal_size %d" %(jsize,)
        mkfs = 'mkreiserfs -ff'
    # [gap: line 780 missing - presumably "else:"]
        panic('unsupported fs type: ', fstype)
    # [gap: line 782 missing]
    if config.mkfsoptions != None:
        mkfs = mkfs + ' ' + config.mkfsoptions
    if mkfsoptions != None:
        mkfs = mkfs + ' ' + mkfsoptions
    (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt)
    # [gap: line 788 missing - presumably "if ret:"]
        panic("Unable to build fs:", dev, string.join(out))
    # enable hash tree indexing on fsswe
    if fstype in ('ext3', 'extN', 'ldiskfs'):
        htree = 'echo "feature FEATURE_C5" | debugfs -w'
        (ret, out) = run (htree, dev)
        # [gap: line 794 missing - presumably "if ret:"]
            panic("Unable to enable htree:", dev)

# some systems use /dev/loopN, some /dev/loop/N
# [gap: lines 798-800 missing - presumably "def loop_base():" trying the
#  first naming scheme]
    if not os.access(loop + str(0), os.R_OK):
        # [gap: line 802 missing - presumably switch to the other scheme]
        if not os.access(loop + str(0), os.R_OK):
            panic ("can't access loop devices")
    # [gap: lines 805-806 missing - presumably "return loop"]

# find loop device assigned to the file
def find_assigned_loop(file):
    # [gap: line 809 missing - presumably loop = loop_base()]
    for n in xrange(0, MAX_LOOP_DEVICES):
        # [gap: line 811 missing - presumably dev = loop + str(n)]
        if os.access(dev, os.R_OK):
            (stat, out) = run('losetup', dev)
            if out and stat == 0:
                # losetup output ends with "(backing-file)" - match inside ().
                m = re.search(r'\((.*)\)', out[0])
                if m and file == m.group(1):
                    # [gap: lines 817-821 missing - presumably return dev and
                    #  a not-found return at the end]
# create file if necessary and assign the first free loop device
def init_loop(file, size, fstype, journal_size, inode_size,
              mkfsoptions, reformat, autoformat, backfstype, backfile):
    # [gap: lines 825-826 missing - presumably realfile/realfstype defaults]
        realfstype = backfstype
        if is_block(backfile):
            if reformat or (need_format(realfstype, backfile) and autoformat == 'yes'):
                mkfs(realfile, size, realfstype, journal_size, inode_size, mkfsoptions, isblock=0)
            # [gap: lines 831-835 missing - presumably return the backing dev]

    dev = find_assigned_loop(realfile)
    # [gap: line 837 missing - presumably "if dev:"]
        print 'WARNING file:', realfile, 'already mapped to', dev
        # [gap: lines 839-840 missing - presumably return dev]

    if reformat or not os.access(realfile, os.R_OK | os.W_OK):
        # [gap: line 842 missing - presumably the minimum-size check]
            panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (realfile, size))
        # Sparse file: seek to 'size' KB without writing data blocks.
        (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, realfile))
        # [gap: line 845 missing - presumably "if ret:"]
            panic("Unable to create backing store:", realfile)
        # [gap: line 847 missing]
        mkfs(realfile, size, realfstype, journal_size, inode_size,
             mkfsoptions, isblock=0)

    # [gap: lines 850-851 missing - presumably loop = loop_base()]
    # find next free loop
    for n in xrange(0, MAX_LOOP_DEVICES):
        # [gap: line 854 missing - presumably dev = loop + str(n)]
        if os.access(dev, os.R_OK):
            (stat, out) = run('losetup', dev)
            # [gap: line 857 missing - presumably "if stat:" (device free)]
                run('losetup', dev, realfile)
                # [gap: lines 859-860 missing - presumably return dev]
            print "out of loop devices"
            # [gap: line 862 missing - presumably an early return]
    print "out of loop devices"
    # [gap: lines 864-865 missing]

# undo loop assignment
def clean_loop(file):
    dev = find_assigned_loop(file)
    # [gap: line 869 missing - presumably "if dev:"]
        ret, out = run('losetup -d', dev)
        # [gap: line 871 missing - presumably "if ret:"]
            log('unable to clean loop device:', dev, 'for file:', file)
            # [gap: lines 873-874 missing]

# determine if dev is formatted as a <fstype> filesystem
def need_format(fstype, dev):
    # FIXME don't know how to implement this
    # [gap: lines 878-879 missing - presumably "return 0"]

# initialize a block device if needed
def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
              inode_size, mkfsoptions, backfstype, backdev):
    # [gap: lines 883-885 missing - presumably noexec/lctl_dump early-out]
    if fstype == 'smfs' or not is_block(dev):
        dev = init_loop(dev, size, fstype, journal_size, inode_size,
                        mkfsoptions, reformat, autoformat, backfstype, backdev)
    elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
        mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
        # [gap: lines 891-892 missing - rest of the mkfs() call]
    # panic("device:", dev,
    # "not prepared, and autoformat is not set.\n",
    # "Rerun with --reformat option to format ALL filesystems")
    # [gap: lines 896-898 missing - presumably "return dev"]

# [gap: line 899 missing - presumably "def if2addr(iface):"]
    """lookup IP address for an interface"""
    rc, out = run("/sbin/ifconfig", iface)
    # [gap: lines 902-903 missing - presumably error check]
    # Second output line looks like "inet addr:10.0.0.1 ..." - take the text
    # after the colon of its second field.
    addr = string.split(out[1])[1]
    ip = string.split(addr, ':')[1]
    # [gap: lines 906-907 missing - presumably "return ip"]
def def_mount_options(fstype, target):
    """Return default mount options for the passed fstype and target
    ('mds' or 'ost').

    ext3/ldiskfs always gets errors=remount-ro; on 2.4 kernels an OST
    additionally gets asyncdel.  Any other fstype has no default options
    and yields '' (previously 'mountfsoptions' was left unbound on that
    path, raising UnboundLocalError at the return).
    """
    mountfsoptions = ""
    if fstype == 'ext3' or fstype == 'ldiskfs':
        mountfsoptions = "errors=remount-ro"
        if target == 'ost' and sys_get_branch() == '2.4':
            mountfsoptions = "%s,asyncdel" % (mountfsoptions)
    return mountfsoptions
def sys_get_elan_position_file():
    # Return the first readable Quadrics Elan position file, if any.
    procfiles = ["/proc/elan/device0/position",
                 "/proc/qsnet/elan4/device0/position",
                 "/proc/qsnet/elan3/device0/position"]
    # [gap: line 921 missing - presumably "for p in procfiles:"]
        if os.access(p, os.R_OK):
            # [gap: lines 923-925 missing - presumably return p / empty]

def sys_get_local_nid(net_type, wildcard, cluster_id):
    """Return the local nid."""
    # [gap: line 928 missing]
    if sys_get_elan_position_file():
        # An Elan position file implies an Elan identity for this node.
        local = sys_get_local_address('elan', '*', cluster_id)
    # [gap: line 931 missing - presumably "else:"]
        local = sys_get_local_address(net_type, wildcard, cluster_id)
    # [gap: lines 933-934 missing - presumably "return local"]

def sys_get_local_address(net_type, wildcard, cluster_id):
    """Return the local address for the network type."""
    # [gap: line 937 missing - presumably local default]
    if net_type in ('tcp',):
        # [gap: line 939 missing - presumably a wildcard-with-interface test]
            iface, star = string.split(wildcard, ':')
            local = if2addr(iface)
            # [gap: line 942 missing - presumably "if not local:"]
                panic ("unable to determine ip for:", wildcard)
        # [gap: line 944 missing - presumably "else:"]
            host = socket.gethostname()
            local = socket.gethostbyname(host)
    elif net_type == 'elan':
        # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()'
        f = sys_get_elan_position_file()
        # [gap: line 950 missing - presumably "if not f:"]
            panic ("unable to determine local Elan ID")
        # [gap: lines 952-953 missing - presumably try: fp = open(f)]
            lines = fp.readlines()
            # [gap: lines 955-961 missing - parse NodeId into elan_id]
            nid = my_int(cluster_id) + my_int(elan_id)
            # [gap: line 963 missing]
        except ValueError, e:
            # [gap: lines 965-967 missing]
    elif net_type == 'gm':
        fixme("automatic local address for GM")
    # [gap: lines 970-972 missing - presumably "return local"]

def sys_get_branch():
    """Returns kernel release"""
    # [gap: line 975 missing - presumably "try:"]
        fp = open('/proc/sys/kernel/osrelease')
        lines = fp.readlines()
        # [gap: lines 978-980 missing - fp.close() and the line loop]
            version = string.split(l)
            # e.g. "2.4.20-xx" -> "2.4"
            a = string.split(version[0], '.')
            return a[0] + '.' + a[1]
    # [gap: lines 984-988 missing - presumably cleanup/except]

def mod_loaded(modname):
    """Check if a module is already loaded. Look in /proc/modules for it."""
    # [gap: line 991 missing - presumably "try:"]
        fp = open('/proc/modules')
        lines = fp.readlines()
        # [gap: line 994 missing - presumably fp.close()]
        # please forgive my tired fingers for this one
        ret = filter(lambda word, mod=modname: word == mod,
                     map(lambda line: string.split(line)[0], lines))
        # [gap: lines 998-1001 missing - return ret / except]

# XXX: instead of device_list, ask for $name and see what we get
def is_prepared(name):
    """Return true if a device exists for the name"""
    if config.lctl_dump:
        # [gap: line 1006 missing - presumably an early return]
    if (config.noexec or config.record) and config.cleanup:
        # [gap: lines 1008-1009 missing - presumably early return / try:]
    # expect this format:
    # 1 UP ldlm ldlm ldlm_UUID 2
    out = lctl.device_list()
    # [gap: line 1013 missing - presumably "for s in out:"]
        if name == string.split(s)[3]:
            # [gap: line 1015 missing - presumably "return 1"]
    except CommandError, e:
        # [gap: lines 1017-1019 missing - presumably fall through to 0]

def is_network_prepared():
    """If the any device exists, then assume that all networking
    has been configured"""
    out = lctl.device_list()
    # [gap: lines 1024-1025 missing - presumably truth test on out]

def fs_is_mounted(path):
    """Return true if path is a mounted lustre filesystem"""
    # [gap: line 1028 missing - presumably "try:"]
        fp = open('/proc/mounts')
        lines = fp.readlines()
        # [gap: lines 1031-1033 missing - fp.close() and split loop]
            if a[1] == path and a[2] == 'lustre_lite':
                # [gap: lines 1035-1041 missing - return 1 / except / 0]
1042 """Manage kernel modules"""
1043 def __init__(self, lustre_dir, portals_dir):
1044 self.lustre_dir = lustre_dir
1045 self.portals_dir = portals_dir
1046 self.kmodule_list = []
def add_portals_module(self, dev_dir, modname):
    """Queue a kernel module from the portals tree for later loading."""
    entry = (self.portals_dir, dev_dir, modname)
    self.kmodule_list.append(entry)
def add_lustre_module(self, dev_dir, modname):
    """Queue a kernel module from the lustre tree for later loading."""
    entry = (self.lustre_dir, dev_dir, modname)
    self.kmodule_list.append(entry)
    def load_module(self):
        """Load all the modules in the list in the order they appear."""
        for src_dir, dev_dir, mod in self.kmodule_list:
            if mod_loaded(mod) and not config.noexec:
                # [gap: line 1060 missing - presumably "continue"]
            log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
            # [gap: line 1062 missing - presumably "if src_dir:"]
                module = find_module(src_dir, dev_dir, mod)
                # [gap: line 1064 missing - presumably "if not module:"]
                    panic('module not found:', mod)
                (rc, out) = run('/sbin/insmod', module)
                # [gap: line 1067 missing - presumably "if rc:"]
                    raise CommandError('insmod', out, rc)
            # [gap: line 1069 missing - presumably "else:" (modprobe path)]
                (rc, out) = run('/sbin/modprobe', mod)
                # [gap: line 1071 missing - presumably "if rc:"]
                    raise CommandError('modprobe', out, rc)

    def cleanup_module(self):
        """Unload the modules in the list in reverse order."""
        rev = self.kmodule_list
        # [gap: line 1077 missing - presumably rev.reverse(); NOTE(review):
        #  that would reverse kmodule_list IN PLACE since rev is not a copy -
        #  TODO confirm against the full source]
        for src_dir, dev_dir, mod in rev:
            if not mod_loaded(mod) and not config.noexec:
                # [gap: lines 1080-1081 missing - presumably "continue"]
            # debug hack: dump the portals debug log before it goes away
            if mod == 'portals' and config.dump:
                lctl.dump(config.dump)
            log('unloading module:', mod)
            (rc, out) = run('/sbin/rmmod', mod)
            # [gap: line 1086 missing - presumably "if rc:"]
                log('! unable to unload module:', mod)
# ============================================================
# Classes to prepare and cleanup the various objects

# [gap: line 1093 missing - presumably "class Module:"]
    """ Base class for the rest of the modules. The default cleanup method is
    defined here, as well as some utilitiy funcs.
    """
    def __init__(self, module_name, db):
        # [gap: line 1098 missing - presumably self.db = db]
        self.module_name = module_name
        self.name = self.db.getName()
        self.uuid = self.db.getUUID()
        # [gap: lines 1102-1103 missing]
        self.kmod = kmod(config.lustre, config.portals)

    def info(self, *args):
        # Print a one-line status message prefixed with this module's identity.
        msg = string.join(map(str,args))
        print self.module_name + ":", self.name, self.uuid, msg

    # [gap: lines 1109-1110 missing - presumably "def cleanup(self):"]
        """ default cleanup, used for most modules """
        # [gap: lines 1112-1113 missing - presumably self.info() and "try:"]
            lctl.cleanup(self.name, self.uuid, config.force)
        except CommandError, e:
            log(self.module_name, "cleanup failed: ", self.name)
            # [gap: lines 1117-1119 missing - presumably e.dump() and
            #  cleanup_error(e.rc) - TODO confirm]

    def add_portals_module(self, dev_dir, modname):
        """Append a module to list of modules to load."""
        self.kmod.add_portals_module(dev_dir, modname)

    def add_lustre_module(self, dev_dir, modname):
        """Append a module to list of modules to load."""
        self.kmod.add_lustre_module(dev_dir, modname)

    def load_module(self):
        """Load all the modules in the list in the order they appear."""
        self.kmod.load_module()
def cleanup_module(self):
    """Unload this module's kernel modules, but only when cleanup is safe."""
    if not self.safe_to_clean():
        return
    self.kmod.cleanup_module()
    def safe_to_clean(self):
        # [gap: lines 1138-1139 missing - presumably "return 1" (default:
        #  always safe; subclasses override)]

    def safe_to_clean_modules(self):
        # By default module unload safety mirrors device cleanup safety.
        return self.safe_to_clean()
class Network(Module):
    def __init__(self,db):
        Module.__init__(self, 'NETWORK', db)
        self.net_type = self.db.get_val('nettype')
        self.nid = self.db.get_val('nid', '*')
        self.cluster_id = self.db.get_val('clusterid', "0")
        self.port = self.db.get_val_int('port', 0)
        self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
        self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
        self.irq_affinity = self.db.get_val_int('irqaffinity', 0)

        # [gap: lines 1153-1154 missing - presumably "if '*' in self.nid:"]
            self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
            # [gap: line 1156 missing - presumably "if not self.nid:"]
                # NOTE(review): 'cluster_id' here is unqualified - looks like
                # it should be self.cluster_id; would NameError if reached.
                panic("unable to set nid for", self.net_type, self.nid, cluster_id)
            self.generic_nid = 1
            debug("nid:", self.nid)
        # [gap: line 1160 missing - presumably "else:"]
            self.generic_nid = 0

        self.nid_uuid = self.nid_to_uuid(self.nid)
        # [gap: line 1164 missing]
        self.hostaddr = self.db.get_val('hostaddr', self.nid)
        if '*' in self.hostaddr:
            self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id)
            if not self.hostaddr:
                panic("unable to set hostaddr for", self.net_type, self.hostaddr, self.cluster_id)
            debug("hostaddr:", self.hostaddr)

        self.add_portals_module("libcfs", 'libcfs')
        self.add_portals_module("portals", 'portals')
        if node_needs_router():
            self.add_portals_module("router", 'kptlrouter')
        if self.net_type == 'tcp':
            self.add_portals_module("knals/socknal", 'ksocknal')
        if self.net_type == 'elan':
            self.add_portals_module("knals/qswnal", 'kqswnal')
        if self.net_type == 'gm':
            self.add_portals_module("knals/gmnal", 'kgmnal')
def nid_to_uuid(self, nid):
    """Derive the canonical UUID name for a network id."""
    return "NID_" + str(nid) + "_UUID"
    # [gap: line 1186 missing - presumably "def prepare(self):"]
        if not config.record and is_network_prepared():
            # [gap: line 1188 missing - presumably "return"]
        self.info(self.net_type, self.nid, self.port)
        if not (config.record and self.generic_nid):
            lctl.network(self.net_type, self.nid)
        if self.net_type == 'tcp':
            # [gap: line 1193 missing]
        if self.net_type == 'elan':
            # [gap: line 1195 missing]
        if self.port and node_is_router():
            run_one_acceptor(self.port)
            self.connect_peer_gateways()

    def connect_peer_gateways(self):
        # Connect this router to every peer gateway on the same net/cluster.
        for router in self.db.lookup_class('node'):
            if router.get_val_int('router', 0):
                for netuuid in router.get_networks():
                    net = self.db.lookup(netuuid)
                    # [gap: line 1205 missing - presumably gw = Network(net)]
                    if (gw.cluster_id == self.cluster_id and
                        gw.net_type == self.net_type):
                        if gw.nid != self.nid:
                            # [gap: lines 1209-1210 missing - connect to gw]

    def disconnect_peer_gateways(self):
        for router in self.db.lookup_class('node'):
            if router.get_val_int('router', 0):
                for netuuid in router.get_networks():
                    net = self.db.lookup(netuuid)
                    # [gap: line 1216 missing - presumably gw = Network(net)]
                    if (gw.cluster_id == self.cluster_id and
                        gw.net_type == self.net_type):
                        if gw.nid != self.nid:
                            # [gap: lines 1220-1221 missing - "try:" and the
                            #  disconnect call]
                            except CommandError, e:
                                print "disconnect failed: ", self.name
                                # [gap: lines 1224-1226 missing]

    def safe_to_clean(self):
        return not is_network_prepared()

    # [gap: lines 1229-1230 missing - presumably "def cleanup(self):"]
        self.info(self.net_type, self.nid, self.port)
        # [gap: line 1232 missing - presumably "if self.port:"]
            stop_acceptor(self.port)
        if node_is_router():
            self.disconnect_peer_gateways()

    def correct_level(self, level, op=None):
        # [gap: lines 1238-1239 missing - presumably "return level"]

class RouteTable(Module):
    def __init__(self,db):
        Module.__init__(self, 'ROUTES', db)

    def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
        # [gap: line 1245 missing - remaining parameters (lo, hi)]
        # only setup connections for tcp NALs
        # [gap: line 1247 missing]
        if not net_type in ('tcp',):
            # [gap: lines 1249-1250 missing - presumably "return None"]

        # connect to target if route is to single node and this node is the gw
        if lo == hi and local_interface(net_type, gw_cluster_id, gw):
            if not local_cluster(net_type, tgt_cluster_id):
                panic("target", lo, " not on the local cluster")
            srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
        # connect to gateway if this node is not the gw
        elif (local_cluster(net_type, gw_cluster_id)
              and not local_interface(net_type, gw_cluster_id, gw)):
            srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
        # [gap: lines 1260-1263 missing - presumably an else: return and a
        #  "if not srvdb:" guard]
            panic("no server for nid", lo)
            # [gap: lines 1265-1266 missing]
        return Network(srvdb)

    # [gap: lines 1268-1269 missing - presumably "def prepare(self):"]
        if not config.record and is_network_prepared():
            # [gap: lines 1271-1272 missing - presumably return; self.info()]
        for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
            lctl.add_route(net_type, gw, lo, hi)
            srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
            # [gap: lines 1276-1278 missing - presumably connect to srv]

    def safe_to_clean(self):
        return not is_network_prepared()

    # [gap: lines 1281-1282 missing - presumably "def cleanup(self):"]
        if is_network_prepared():
            # the network is still being used, don't clean it up
            # [gap: line 1285 missing - presumably "return"]
        for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
            srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
            # [gap: lines 1288-1289 missing - presumably "if srv:" and "try:"]
                lctl.disconnect(srv)
            except CommandError, e:
                print "disconnect failed: ", self.name
                # [gap: lines 1293-1296 missing - presumably error dump and
                #  another "try:"]
                lctl.del_route(net_type, gw, lo, hi)
            except CommandError, e:
                print "del_route failed: ", self.name
                # [gap: lines 1300-1302 missing]
class Management(Module):
    def __init__(self, db):
        Module.__init__(self, 'MGMT', db)
        self.add_lustre_module('lvfs', 'lvfs')
        self.add_lustre_module('obdclass', 'obdclass')
        self.add_lustre_module('ptlrpc', 'ptlrpc')
        self.add_lustre_module('mgmt', 'mgmt_svc')

    # [gap: lines 1310-1311 missing - presumably "def prepare(self):"]
        if not config.record and is_prepared(self.name):
            # [gap: lines 1313-1314 missing - presumably return; self.info()]
        lctl.newdev("mgmt", self.name, self.uuid)

    def safe_to_clean(self):
        # [gap: lines 1318-1320 missing - presumably "return 1" followed by
        #  "def cleanup(self):" owning the lines below]
        if is_prepared(self.name):
            Module.cleanup(self)

    def correct_level(self, level, op=None):
        # [gap: lines 1325-1326 missing - presumably "return level"]

# This is only needed to load the modules; the LDLM device
# is now created automatically.
# [gap: line 1329 missing - presumably "class LDLM(Module):"]
    def __init__(self,db):
        Module.__init__(self, 'LDLM', db)
        self.add_lustre_module('lvfs', 'lvfs')
        self.add_lustre_module('obdclass', 'obdclass')
        self.add_lustre_module('ptlrpc', 'ptlrpc')
        # [gap: lines 1335-1341 missing - presumably prepare/cleanup methods]

    def correct_level(self, level, op=None):
        # [gap: lines 1343-1346 missing - presumably "return level"]
    # [enclosing class header missing from this capture - presumably
    #  "class LOV(Module):"]
    def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
        Module.__init__(self, 'LOV', db)
        if name_override != None:
            self.name = "lov_%s" % name_override
        self.add_lustre_module('lov', 'lov')
        self.mds_uuid = self.db.get_first_ref('mds')
        self.stripe_sz = self.db.get_val_int('stripesize', 1048576)
        self.stripe_off = self.db.get_val_int('stripeoffset', 0)
        self.pattern = self.db.get_val_int('stripepattern', 0)
        self.devlist = self.db.get_lov_tgts('lov_tgt')
        self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
        # [gap: line 1358 missing - presumably osclist init]
        self.desc_uuid = self.uuid
        self.uuid = generate_client_uuid(self.name)
        self.fs_name = fs_name
        # [gap: line 1362 missing - presumably "if config_only:"]
            self.config_only = 1
            # [gap: line 1364 missing - presumably "return"]
        self.config_only = None
        mds = self.db.lookup(self.mds_uuid)
        self.mds_name = mds.getName()
        for (obd_uuid, index, gen, active) in self.devlist:
            # [gap: lines 1369-1370 missing - presumably skip empty entries]
            obd = self.db.lookup(obd_uuid)
            osc = get_osc(obd, self.uuid, fs_name)
            # [gap: line 1373 missing - presumably "if osc:"]
                self.osclist.append((osc, index, gen, active))
            # [gap: line 1375 missing - presumably "else:"]
                panic('osc not found:', obd_uuid)

    # [gap: lines 1377-1381 missing - presumably "def prepare(self):"]
        if not config.record and is_prepared(self.name):
            # [gap: line 1383 missing - presumably "return"]
        self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
                  self.stripe_off, self.pattern, self.devlist,
        # [gap: line 1386 missing - remainder of the info() call]
        lctl.lov_setup(self.name, self.uuid, self.desc_uuid, self.stripe_cnt,
                       self.stripe_sz, self.stripe_off, self.pattern)
        for (osc, index, gen, active) in self.osclist:
            target_uuid = osc.target_uuid
            # [gap: line 1391 missing - presumably "try:"]
                # Only ignore connect failures with --force, which
                # isn't implemented here yet.
                # [gap: line 1394 missing]
                osc.prepare(ignore_connect_failure=0)
            except CommandError, e:
                print "Error preparing OSC %s\n" % osc.uuid
                # [gap: line 1398 missing - presumably re-raise]
            lctl.lov_add_obd(self.name, self.uuid, target_uuid, index, gen)

    # [gap: lines 1400-1401 missing - presumably "def cleanup(self):"]
        for (osc, index, gen, active) in self.osclist:
            target_uuid = osc.target_uuid
            # [gap: line 1404 missing - presumably osc.cleanup()]
        if is_prepared(self.name):
            Module.cleanup(self)
        if self.config_only:
            panic("Can't clean up config_only LOV ", self.name)

    def load_module(self):
        if self.config_only:
            panic("Can't load modules for config_only LOV ", self.name)
        for (osc, index, gen, active) in self.osclist:
            # [gap: lines 1414-1415 missing - presumably osc.load_module()]
        Module.load_module(self)

    def cleanup_module(self):
        if self.config_only:
            panic("Can't cleanup modules for config_only LOV ", self.name)
        Module.cleanup_module(self)
        for (osc, index, gen, active) in self.osclist:
            # [gap: line 1423 missing - presumably an "active" guard]
            osc.cleanup_module()
        # [gap: lines 1425-1426 missing]

    def correct_level(self, level, op=None):
        # [gap: lines 1428-1430 missing - presumably "return level"]
    # [enclosing class header missing from this capture - presumably
    #  "class LMV(Module):"]
    def __init__(self, db, uuid, fs_name, name_override = None):
        Module.__init__(self, 'LMV', db)
        if name_override != None:
            self.name = "lmv_%s" % name_override
        self.add_lustre_module('lmv', 'lmv')
        self.devlist = self.db.get_refs('mds')
        # [gap: line 1437 missing - presumably mdclist init]
        self.desc_uuid = self.uuid
        # [gap: line 1439 missing]
        self.fs_name = fs_name
        for mds_uuid in self.devlist:
            mds = self.db.lookup(mds_uuid)
            # [gap: line 1443 missing - presumably "if not mds:"]
                panic("MDS not found!")
            mdc = MDC(mds, self.uuid, fs_name)
            # [gap: line 1446 missing - presumably "if mdc:"]
                self.mdclist.append(mdc)
            # [gap: line 1448 missing - presumably "else:"]
                panic('mdc not found:', mds_uuid)

    # [gap: lines 1450-1451 missing - presumably "def prepare(self):"]
        if is_prepared(self.name):
            # [gap: line 1453 missing - presumably "return"]
        for mdc in self.mdclist:
            # [gap: line 1455 missing - presumably "try:"]
                # Only ignore connect failures with --force, which
                # isn't implemented here yet.
                mdc.prepare(ignore_connect_failure=0)
            except CommandError, e:
                print "Error preparing LMV %s\n" % mdc.uuid
                # [gap: line 1461 missing - presumably re-raise]
        lctl.lmv_setup(self.name, self.uuid, self.desc_uuid,
                       string.join(self.devlist))

    # [gap: lines 1464-1465 missing - presumably "def cleanup(self):"]
        for mdc in self.mdclist:
            # [gap: line 1467 missing - presumably mdc.cleanup()]
        if is_prepared(self.name):
            Module.cleanup(self)

    def load_module(self):
        for mdc in self.mdclist:
            # [gap: lines 1473-1474 missing - presumably mdc.load_module()]
        Module.load_module(self)
def cleanup_module(self):
    """Unload this LMV's kernel modules.

    Runs the base Module cleanup first, then each MDC's module cleanup,
    mirroring load_module() which iterates self.mdclist the same way.
    """
    Module.cleanup_module(self)
    # BUG FIX: the loop variable was named 'mds' while the body called
    # 'mdc.cleanup_module()' - an unbound name, raising NameError as soon
    # as cleanup ran with a non-empty mdclist.
    for mdc in self.mdclist:
        mdc.cleanup_module()
    # [gap: lines 1481-1482 missing; the body of correct_level is also
    #  missing from this capture]
    def correct_level(self, level, op=None):
# NOTE(review): damaged extraction -- original line numbers are fused into
# the text and interior lines are missing; not runnable as-is.
# Purpose (from visible code): MDS device driver.  Reads device/fs config,
# picks a default inode size from the LOV stripe count, loads the kernel
# module stack, formats/mounts the backing device and creates the 'mds'
# obd via lctl; write_conf() records client startup/cleanup logs.
1486 class MDSDEV(Module):
1487 def __init__(self,db):
1488 Module.__init__(self, 'MDSDEV', db)
1489 self.devpath = self.db.get_val('devpath','')
1490 self.backdevpath = self.db.get_val('backdevpath','')
1491 self.size = self.db.get_val_int('devsize', 0)
1492 self.journal_size = self.db.get_val_int('journalsize', 0)
1493 self.fstype = self.db.get_val('fstype', '')
1494 self.backfstype = self.db.get_val('backfstype', '')
1495 self.nspath = self.db.get_val('nspath', '')
1496 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1497 self.mountfsoptions = self.db.get_val('mountfsoptions', '')
1498 self.cachetype = self.db.get_val('cachetype', '')
1499 # overwrite the original MDSDEV name and uuid with the MDS name and uuid
1500 target_uuid = self.db.get_first_ref('target')
1501 mds = self.db.lookup(target_uuid)
1502 self.name = mds.getName()
1503 self.filesystem_uuids = mds.get_refs('filesystem')
1506 self.master_mds = ""
# No filesystems on the MDS itself: fall back to the LMV's filesystems.
1507 if not self.filesystem_uuids:
1508 self.lmv_uuid = self.db.get_first_ref('lmv')
1509 if not self.lmv_uuid:
1510 panic("ALERT: can't find lvm uuid")
1512 self.lmv = self.db.lookup(self.lmv_uuid)
1514 self.filesystem_uuids = self.lmv.get_refs('filesystem')
1515 self.master_mds = self.lmv_uuid
1516 # FIXME: if fstype not set, then determine based on kernel version
1517 self.format = self.db.get_val('autoformat', "no")
1518 if mds.get_val('failover', 0):
1519 self.failover_mds = 'f'
1521 self.failover_mds = 'n'
1522 active_uuid = get_active_target(mds)
1524 panic("No target device found:", target_uuid)
1525 if active_uuid == self.uuid:
1529 if self.active and config.group and config.group != mds.get_val('group'):
# inodesize not set in config: derive a default from the LOV stripe count
# (wider default striping needs a larger MDS inode to hold the EA).
1532 self.inode_size = self.db.get_val_int('inodesize', 0)
1533 if self.inode_size == 0:
1534 # find the LOV for this MDS
1535 lovconfig_uuid = mds.get_first_ref('lovconfig')
1536 if not lovconfig_uuid:
1537 if not self.lmv_uuid:
1538 panic("No LOV found for lovconfig ", lovconfig.name)
1541 panic("No LMV initialized and not lovconfig_uuid found")
1543 lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
1544 lovconfig = self.lmv.lookup(lovconfig_uuid)
1545 lov_uuid = lovconfig.get_first_ref('lov')
1547 panic("No LOV found for lovconfig ", lovconfig.name)
1549 lovconfig = mds.lookup(lovconfig_uuid)
1550 lov_uuid = lovconfig.get_first_ref('lov')
1552 panic("No LOV found for lovconfig ", lovconfig.name)
1555 lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
1556 lovconfig = self.lmv.lookup(lovconfig_uuid)
1557 lov_uuid = lovconfig.get_first_ref('lov')
1559 lov = LOV(self.db.lookup(lov_uuid), lov_uuid, 'FS_name', config_only = 1)
1561 # default stripe count controls default inode_size
1562 stripe_count = lov.stripe_cnt
1563 if stripe_count > 77:
1564 self.inode_size = 4096
1565 elif stripe_count > 35:
1566 self.inode_size = 2048
1567 elif stripe_count > 13:
1568 self.inode_size = 1024
1569 elif stripe_count > 3:
1570 self.inode_size = 512
1572 self.inode_size = 256
1574 self.target_dev_uuid = self.uuid
1575 self.uuid = target_uuid
# NOTE(review): client_uuid is assigned twice in a row in the visible
# text -- the generated uuid on 1578 is immediately clobbered by the
# fixed "<name>_lmv_UUID" string; a conditional line may be missing here.
1578 client_uuid = generate_client_uuid(self.name)
1579 client_uuid = self.name + "_lmv_" + "UUID"
1580 self.master = LMV(self.db.lookup(self.lmv_uuid), client_uuid, self.name, self.name)
1581 self.master_mds = self.master.name
# Kernel module stack needed by an MDS node.
1584 self.add_lustre_module('mdc', 'mdc')
1585 self.add_lustre_module('osc', 'osc')
1586 self.add_lustre_module('lov', 'lov')
1587 self.add_lustre_module('lmv', 'lmv')
1588 self.add_lustre_module('ost', 'ost')
1589 self.add_lustre_module('mds', 'mds')
1591 if self.fstype == 'smfs':
1592 self.add_lustre_module('smfs', 'smfs')
1594 if self.fstype == 'ldiskfs':
1595 self.add_lustre_module('ldiskfs', 'ldiskfs')
1598 self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
1600 # if fstype is smfs, then we should also take care about backing
1602 if self.fstype == 'smfs':
1603 self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.backfstype))
# 'snap' mount option requires smfs plus the snapshot fsfilt modules.
1605 for options in string.split(self.mountfsoptions, ','):
1606 if options == 'snap':
1607 if not self.fstype == 'smfs':
1608 panic("mountoptions with snap, but fstype is not smfs\n")
1609 self.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.fstype))
1610 self.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.backfstype))
1611 def load_module(self):
1613 Module.load_module(self)
# prepare(): set up the block device and the mds obd; assumes the config
# log was already written (panics with a --write_conf hint otherwise).
1616 if not config.record and is_prepared(self.name):
1619 debug(self.uuid, "not active")
1622 # run write_conf automatically, if --reformat used
1624 self.info(self.devpath, self.fstype, self.size, self.format)
1628 self.master.prepare()
1629 # never reformat here
1630 blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
1631 self.format, self.journal_size, self.inode_size,
1632 self.mkfsoptions, self.backfstype, self.backdevpath)
1634 if not is_prepared('MDT'):
1635 lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
# Merge defaults, --mountfsoptions and per-device options into one string.
1637 mountfsoptions = def_mount_options(self.fstype, 'mds')
1639 if config.mountfsoptions:
1641 mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
1643 mountfsoptions = config.mountfsoptions
1644 if self.mountfsoptions:
1645 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1647 if self.mountfsoptions:
1649 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1651 mountfsoptions = self.mountfsoptions
1653 if self.fstype == 'smfs':
1654 realdev = self.fstype
1657 mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
1661 mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
1666 print 'MDS mount options: ' + mountfsoptions
# 'dumb' is a placeholder the MDS code recognizes and skips.
1668 if not self.master_mds:
1669 self.master_mds = 'dumb'
1670 if not self.cachetype:
1671 self.cachetype = 'dumb'
1672 lctl.newdev("mds", self.name, self.uuid,
1673 setup ="%s %s %s %s %s %s" %(realdev, self.fstype,
1674 self.name, mountfsoptions,
1675 self.master_mds, self.cachetype))
1676 except CommandError, e:
1678 panic("MDS is missing the config log. Need to run " +
1679 "lconf --write_conf.")
# write_conf(): mount the MDS and record startup/cleanup config logs for
# every filesystem and every client node profile.
1683 def write_conf(self):
1685 if not is_prepared(self.name):
1686 self.info(self.devpath, self.fstype, self.format)
1688 blkdev = block_dev(self.devpath, self.size, self.fstype,
1689 config.reformat, self.format, self.journal_size,
1690 self.inode_size, self.mkfsoptions,
1691 self.backfstype, self.backdevpath)
1693 # Even for writing logs we mount mds with supplied mount options
1694 # because it will not mount smfs (if used) otherwise.
1696 mountfsoptions = def_mount_options(self.fstype, 'mds')
1698 if config.mountfsoptions:
1700 mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
1702 mountfsoptions = config.mountfsoptions
1703 if self.mountfsoptions:
1704 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1706 if self.mountfsoptions:
1708 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1710 mountfsoptions = self.mountfsoptions
1712 if self.fstype == 'smfs':
1713 realdev = self.fstype
1716 mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
1720 mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
1725 print 'MDS mount options: ' + mountfsoptions
1727 # As mount options are passed by 4th param to config tool, we need
1728 # to pass something in 3rd param. But we do not want this 3rd param
1729 # be counted as a profile name for reading log on MDS setup, thus,
1730 # we pass there some predefined sign like 'dumb', which will be
1731 # checked in MDS code and skipped. Probably there is more nice way
1732 # like pass empty string and check it in config tool and pass null
1734 lctl.newdev("mds", self.name, self.uuid,
1735 setup ="%s %s %s %s" %(realdev, self.fstype,
1736 'dumb', mountfsoptions))
1739 # record logs for the MDS lov
1740 for uuid in self.filesystem_uuids:
1741 log("recording clients for filesystem:", uuid)
1742 fs = self.db.lookup(uuid)
1744 # this is ugly, should be organized nice later.
1745 target_uuid = self.db.get_first_ref('target')
1746 mds = self.db.lookup(target_uuid)
1748 lovconfig_uuid = mds.get_first_ref('lovconfig')
1750 lovconfig = mds.lookup(lovconfig_uuid)
1751 obd_uuid = lovconfig.get_first_ref('lov')
1753 obd_uuid = fs.get_first_ref('obd')
1755 client_uuid = generate_client_uuid(self.name)
1756 client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
# Record both the setup log and the matching '-clean' teardown log.
1759 lctl.clear_log(self.name, self.name)
1760 lctl.record(self.name, self.name)
1762 lctl.mount_option(self.name, client.get_name(), "")
1764 process_updates(self.db, self.name, self.name, client)
1767 lctl.clear_log(self.name, self.name + '-clean')
1768 lctl.record(self.name, self.name + '-clean')
1770 lctl.del_mount_option(self.name)
1772 process_updates(self.db, self.name, self.name + '-clean', client)
1776 # record logs for each client
1782 config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
1784 config_options = CONFIG_FILE
1786 for node_db in self.db.lookup_class('node'):
1787 client_name = node_db.getName()
1788 for prof_uuid in node_db.get_refs('profile'):
1789 prof_db = node_db.lookup(prof_uuid)
1790 # refactor this into a function to test "clientness"
1791 # (a profile referencing a mountpoint/echoclient is a client)
1792 for ref_class, ref_uuid in prof_db.get_all_refs():
1793 if ref_class in ('mountpoint','echoclient'):
1794 debug("recording", client_name)
1795 old_noexec = config.noexec
# Re-invoke lconf itself (--record --nomod) to record that client's logs.
1797 ret, out = run (sys.argv[0], noexec_opt,
1798 " -v --record --nomod",
1799 "--record_log", client_name,
1800 "--record_device", self.name,
1801 "--node", client_name,
1804 for s in out: log("record> ", string.strip(s))
1805 ret, out = run (sys.argv[0], noexec_opt,
1806 "--cleanup -v --record --nomod",
1807 "--record_log", client_name + "-clean",
1808 "--record_device", self.name,
1809 "--node", client_name,
1812 for s in out: log("record> ", string.strip(s))
1813 config.noexec = old_noexec
1816 lctl.cleanup(self.name, self.uuid, 0, 0)
1817 except CommandError, e:
1818 log(self.module_name, "cleanup failed: ", self.name)
1821 Module.cleanup(self)
1823 if self.fstype == 'smfs':
1824 clean_loop(self.backdevpath)
1826 clean_loop(self.devpath)
# msd_remaining(): true while any 'mds' device is still configured.
1828 def msd_remaining(self):
1829 out = lctl.device_list()
1831 if string.split(s)[2] in ('mds',):
1834 def safe_to_clean(self):
1837 def safe_to_clean_modules(self):
1838 return not self.msd_remaining()
# cleanup(): tear down this mds device, then the shared MDT device once
# no mds devices remain, and release any loop devices.
1842 debug(self.uuid, "not active")
1845 if is_prepared(self.name):
1847 lctl.cleanup(self.name, self.uuid, config.force,
1849 except CommandError, e:
1850 log(self.module_name, "cleanup failed: ", self.name)
1853 Module.cleanup(self)
1856 self.master.cleanup()
1857 if not self.msd_remaining() and is_prepared('MDT'):
1859 lctl.cleanup("MDT", "MDT_UUID", config.force,
1861 except CommandError, e:
1862 print "cleanup failed: ", self.name
1866 if self.fstype == 'smfs':
1867 clean_loop(self.backdevpath)
1869 clean_loop(self.devpath)
1871 def correct_level(self, level, op=None):
1872 #if self.master_mds:
# NOTE(review): damaged extraction -- fused line numbers, missing interior
# lines; not runnable as-is.  Class header (presumably 'class OSD(Module):')
# is among the missing lines.
# Purpose (from visible code): OST/OSD device driver -- mirrors MDSDEV:
# reads device config, loads modules, formats/mounts the device and
# creates the osd-type obd plus the shared OSS device via lctl.
1877 def __init__(self, db):
1878 Module.__init__(self, 'OSD', db)
1879 self.osdtype = self.db.get_val('osdtype')
1880 self.devpath = self.db.get_val('devpath', '')
1881 self.backdevpath = self.db.get_val('backdevpath', '')
1882 self.size = self.db.get_val_int('devsize', 0)
1883 self.journal_size = self.db.get_val_int('journalsize', 0)
1884 self.inode_size = self.db.get_val_int('inodesize', 0)
1885 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1886 self.mountfsoptions = self.db.get_val('mountfsoptions', '')
1887 self.fstype = self.db.get_val('fstype', '')
1888 self.backfstype = self.db.get_val('backfstype', '')
1889 self.nspath = self.db.get_val('nspath', '')
1890 target_uuid = self.db.get_first_ref('target')
1891 ost = self.db.lookup(target_uuid)
1892 self.name = ost.getName()
1893 self.format = self.db.get_val('autoformat', 'yes')
1894 if ost.get_val('failover', 0):
1895 self.failover_ost = 'f'
1897 self.failover_ost = 'n'
1899 active_uuid = get_active_target(ost)
1901 panic("No target device found:", target_uuid)
1902 if active_uuid == self.uuid:
1906 if self.active and config.group and config.group != ost.get_val('group'):
1909 self.target_dev_uuid = self.uuid
1910 self.uuid = target_uuid
1912 self.add_lustre_module('ost', 'ost')
1913 if self.fstype == 'smfs':
1914 self.add_lustre_module('smfs', 'smfs')
1915 # FIXME: should we default to ext3 here?
1916 if self.fstype == 'ldiskfs':
1917 self.add_lustre_module('ldiskfs', 'ldiskfs')
1919 self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
1920 if self.fstype == 'smfs':
1921 self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.backfstype))
# NOTE(review): this iterates the option string character-by-character
# (no string.split as in the MDSDEV equivalent), so options == 'snap'
# can never match a single character -- likely a bug or a lost
# string.split(...) call; confirm against pristine source.
1923 for options in self.mountfsoptions:
1924 if options == 'snap':
1925 if not self.fstype == 'smfs':
1926 panic("mountoptions with snap, but fstype is not smfs\n")
1927 self.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.fstype))
1928 self.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.backfstype))
1930 self.add_lustre_module(self.osdtype, self.osdtype)
1932 def load_module(self):
1934 Module.load_module(self)
1936 # need to check /proc/mounts and /etc/mtab before
1937 # formatting anything.
1938 # FIXME: check if device is already formatted.
1940 if is_prepared(self.name):
1943 debug(self.uuid, "not active")
1945 self.info(self.osdtype, self.devpath, self.size, self.fstype,
1946 self.format, self.journal_size, self.inode_size)
# obdecho needs no backing block device; everything else gets one.
1948 if self.osdtype == 'obdecho':
1951 blkdev = block_dev(self.devpath, self.size, self.fstype,
1952 config.reformat, self.format, self.journal_size,
1953 self.inode_size, self.mkfsoptions, self.backfstype,
# Merge defaults, --mountfsoptions and per-device options, as for the MDS.
1956 mountfsoptions = def_mount_options(self.fstype, 'ost')
1958 if config.mountfsoptions:
1960 mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
1962 mountfsoptions = config.mountfsoptions
1963 if self.mountfsoptions:
1964 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1966 if self.mountfsoptions:
1968 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1970 mountfsoptions = self.mountfsoptions
1972 if self.fstype == 'smfs':
1973 realdev = self.fstype
1976 mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
1980 mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
1985 print 'OSD mount options: ' + mountfsoptions
1987 lctl.newdev(self.osdtype, self.name, self.uuid,
1988 setup ="%s %s %s %s" %(realdev, self.fstype,
1991 if not is_prepared('OSS'):
1992 lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
# osd_remaining(): true while any obdfilter/obdecho device is configured.
1994 def osd_remaining(self):
1995 out = lctl.device_list()
1997 if string.split(s)[2] in ('obdfilter', 'obdecho'):
2000 def safe_to_clean(self):
2003 def safe_to_clean_modules(self):
2004 return not self.osd_remaining()
# cleanup(): tear down this osd, then the shared OSS once none remain.
2008 debug(self.uuid, "not active")
2010 if is_prepared(self.name):
2013 lctl.cleanup(self.name, self.uuid, config.force,
2015 except CommandError, e:
2016 log(self.module_name, "cleanup failed: ", self.name)
2019 if not self.osd_remaining() and is_prepared('OSS'):
2021 lctl.cleanup("OSS", "OSS_UUID", config.force,
2023 except CommandError, e:
2024 print "cleanup failed: ", self.name
2027 if not self.osdtype == 'obdecho':
2028 if self.fstype == 'smfs':
2029 clean_loop(self.backdevpath)
2031 clean_loop(self.devpath)
2033 def correct_level(self, level, op=None):
# NOTE(review): damaged extraction -- guard lines (fused numbers 2037-2038,
# 2042-2043) are missing, so the early-return/error handling is lost.
# Purpose (from visible code): resolve a mountpoint name to its
# filesystem's 'mgmt' reference via the global toplustreDB.
2036 def mgmt_uuid_for_fs(mtpt_name):
2039 mtpt_db = toplustreDB.lookup_name(mtpt_name)
2040 fs_uuid = mtpt_db.get_first_ref('filesystem')
2041 fs = toplustreDB.lookup(fs_uuid)
2044 return fs.get_first_ref('mgmt')
2046 # Generic client module, used by OSC and MDC
# NOTE(review): damaged extraction -- fused line numbers, missing interior
# lines (e.g. the 'try:' bodies around prepare/cleanup); not runnable as-is.
# Purpose (from visible code): generic client device base for OSC and MDC:
# resolves the active target and its server networks, sets up routes, and
# creates/destroys the client obd via lctl.
2047 class Client(Module):
2048 def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
2050 self.target_name = tgtdb.getName()
2051 self.target_uuid = tgtdb.getUUID()
2055 self.tgt_dev_uuid = get_active_target(tgtdb)
2056 if not self.tgt_dev_uuid:
2057 panic("No target device found for target(1):", self.target_name)
2059 self.kmod = kmod(config.lustre, config.portals)
2063 self.module = module
2064 self.module_name = string.upper(module)
# Default device name encodes module, hostname, target and fs.
2066 self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
2067 self.target_name, fs_name)
2069 self.name = self_name
2071 self.lookup_server(self.tgt_dev_uuid)
2072 mgmt_uuid = mgmt_uuid_for_fs(fs_name)
2074 self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
2077 self.fs_name = fs_name
2080 self.add_lustre_module(module_dir, module)
2082 def lookup_server(self, srv_uuid):
2083 """ Lookup a server's network information """
2084 self._server_nets = get_ost_net(self.db, srv_uuid)
2085 if len(self._server_nets) == 0:
2086 panic ("Unable to find a server for:", srv_uuid)
2089 def get_servers(self):
2090 return self._server_nets
2092 def prepare(self, ignore_connect_failure = 0):
2093 self.info(self.target_uuid)
2094 if not config.record and is_prepared(self.name):
# Prefer a server on a local cluster; otherwise add a route to one.
2097 srv = choose_local_server(self.get_servers())
2101 routes = find_route(self.get_servers())
2102 if len(routes) == 0:
2103 panic ("no route to", self.target_uuid)
2104 for (srv, r) in routes:
2105 lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
2106 except CommandError, e:
2107 if not ignore_connect_failure:
# Targets listed in --inactive (or marked inactive) are set up inactive.
2110 if self.permits_inactive() and (self.target_uuid in config.inactive or self.active == 0):
2111 debug("%s inactive" % self.target_uuid)
2112 inactive_p = "inactive"
2114 debug("%s active" % self.target_uuid)
2116 lctl.newdev(self.module, self.name, self.uuid,
2117 setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
2118 inactive_p, self.mgmt_name))
2121 if is_prepared(self.name):
2122 Module.cleanup(self)
2124 srv = choose_local_server(self.get_servers())
2126 lctl.disconnect(srv)
2128 for (srv, r) in find_route(self.get_servers()):
2129 lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
2130 except CommandError, e:
2131 log(self.module_name, "cleanup failed: ", self.name)
2135 def correct_level(self, level, op=None):
2138 def deactivate(self):
2140 lctl.deactivate(self.name)
2141 except CommandError, e:
2142 log(self.module_name, "deactivate failed: ", self.name)
# NOTE(review): damaged extraction -- the 'class MDC(Client):' header and
# the body of permits_inactive() (fused numbers 2146, 2151) are missing.
# Visible code: MDC is a Client specialization for module 'mdc'.
2147 def __init__(self, db, uuid, fs_name):
2148 Client.__init__(self, db, uuid, 'mdc', fs_name)
2150 def permits_inactive(self):
# NOTE(review): damaged extraction -- the 'class OSC(Client):' header and
# the body of permits_inactive() (fused numbers 2153, 2158) are missing.
# Visible code: OSC is a Client specialization for module 'osc'.
2154 def __init__(self, db, uuid, fs_name):
2155 Client.__init__(self, db, uuid, 'osc', fs_name)
2157 def permits_inactive(self):
def mgmtcli_name_for_uuid(uuid):
    """Return the canonical device name for the management client of *uuid*."""
    name = 'MGMTCLI_%s' % (uuid,)
    return name
class ManagementClient(Client):
    """Client device for the management service (kernel module 'mgmt_cli').

    The device name is derived from the management target's uuid via
    mgmtcli_name_for_uuid(), so the name is stable per management service
    rather than per mount.
    """
    def __init__(self, db, uuid):
        self_name = mgmtcli_name_for_uuid(db.getUUID())
        Client.__init__(self, db, uuid, 'mgmt_cli', '',
                        self_name = self_name,
                        module_dir = 'mgmt')
# NOTE(review): damaged extraction -- the 'class VLOV(Module):' header
# (fused number 2168) and several interior lines are missing; not
# runnable as-is.
# Purpose (from visible code): single-OSC LOV wrapper -- sets up a lov
# device backed by exactly one OSC (used by CMOBD for an OST master).
2169 def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
2170 Module.__init__(self, 'VLOV', db)
2171 if name_override != None:
2172 self.name = "lov_%s" % name_override
2173 self.add_lustre_module('lov', 'lov')
2174 self.stripe_sz = 65536
2178 self.desc_uuid = self.uuid
2179 self.uuid = generate_client_uuid(self.name)
2180 self.fs_name = fs_name
2181 self.osc = get_osc(db, self.uuid, fs_name)
2183 panic('osc not found:', self.uuid)
2185 self.config_only = 1
2187 self.config_only = None
2193 if not config.record and is_prepared(self.name):
2195 lctl.lov_setup(self.name, self.uuid, self.desc_uuid, self.stripe_cnt,
2196 self.stripe_sz, self.stripe_off, self.pattern)
2197 target_uuid = self.osc.target_uuid
2200 self.osc.prepare(ignore_connect_failure=0)
2201 except CommandError, e:
2202 print "Error preparing OSC %s\n" % osc.uuid
2204 lctl.lov_add_obd(self.name, self.uuid, target_uuid, 0, 1)
2207 target_uuid = self.osc.target_uuid
2209 if is_prepared(self.name):
2210 Module.cleanup(self)
2211 if self.config_only:
2212 panic("Can't clean up config_only LOV ", self.name)
2214 def load_module(self):
2215 if self.config_only:
2216 panic("Can't load modules for config_only LOV ", self.name)
2217 self.osc.load_module()
2218 Module.load_module(self)
2220 def cleanup_module(self):
2221 if self.config_only:
2222 panic("Can't cleanup modules for config_only LOV ", self.name)
2223 Module.cleanup_module(self)
2224 self.osc.cleanup_module()
2226 def correct_level(self, level, op=None):
# NOTE(review): damaged extraction -- fused line numbers, missing interior
# lines; not runnable as-is.
# Purpose (from visible code): cache-miss obd -- pairs a master obd (VLOV
# over an OST, or an MDC) with a cache obd and creates the 'cmobd' device.
2229 class CMOBD(Module):
2230 def __init__(self,db):
2231 Module.__init__(self, 'CMOBD', db)
2232 self.name = self.db.getName();
2233 self.uuid = generate_client_uuid(self.name)
2234 self.master_uuid = self.db.get_first_ref('masterobd')
2235 self.cache_uuid = self.db.get_first_ref('cacheobd')
2236 self.add_lustre_module('cmobd', 'cmobd')
2237 master_obd = self.db.lookup(self.master_uuid)
2239 panic('master obd not found:', self.master_uuid)
2240 cache_obd = self.db.lookup(self.cache_uuid)
2242 panic('cache obd not found:', self.cache_uuid)
# OST master is wrapped in a single-OSC VLOV; otherwise an MDC is used.
2244 if master_obd.get_class() == 'ost':
2245 self.client_uuid = generate_client_uuid(self.name)
2246 self.master= VLOV(master_obd, self.client_uuid, self.name,
2247 "%s_master" % (self.name))
2248 self.master_uuid = self.master.get_uuid()
2250 self.master = get_mdc(db, self.name, self.master_uuid)
2251 # need to check /proc/mounts and /etc/mtab before
2252 # formatting anything.
2253 # FIXME: check if device is already formatted.
2255 self.master.prepare()
2256 if not config.record and is_prepared(self.name):
2258 self.info(self.master_uuid, self.cache_uuid)
2259 lctl.newdev("cmobd", self.name, self.uuid,
2260 setup ="%s %s" %(self.master_uuid,
2264 if is_prepared(self.name):
2265 Module.cleanup(self)
2266 self.master.cleanup()
2268 def load_module(self):
2269 self.master.load_module()
2270 Module.load_module(self)
2272 def cleanup_module(self):
2273 Module.cleanup_module(self)
2274 self.master.cleanup_module()
2276 def correct_level(self, level, op=None):
# NOTE(review): damaged extraction -- the 'class COBD(Module):' header
# (fused number ~2279) and interior lines are missing; not runnable as-is.
# Purpose (from visible code): caching obd -- pairs a 'real' and a 'cache'
# device (LOVs for obd type, MDCs for mds type) into a 'cobd' device.
2280 def __init__(self, db, uuid, name, type, name_override = None):
2281 Module.__init__(self, 'COBD', db)
2282 self.name = self.db.getName();
2283 self.uuid = generate_client_uuid(self.name)
2284 self.real_uuid = self.db.get_first_ref('realobd')
2285 self.cache_uuid = self.db.get_first_ref('cacheobd')
2286 self.add_lustre_module('cobd', 'cobd')
2287 real_obd = self.db.lookup(self.real_uuid)
2289 panic('real obd not found:', self.real_uuid)
2290 cache_obd = self.db.lookup(self.cache_uuid)
2292 panic('cache obd not found:', self.cache_uuid)
# Data (obd) caches pair LOVs; metadata (mds) caches pair MDCs.
2294 self.real = LOV(real_obd, self.real_uuid, name,
2295 "%s_real" % (self.name));
2296 self.cache = LOV(cache_obd, self.cache_uuid, name,
2297 "%s_cache" % (self.name));
2299 self.real = get_mdc(db, name, self.real_uuid)
2300 self.cache = get_mdc(db, name, self.cache_uuid)
2301 # need to check /proc/mounts and /etc/mtab before
2302 # formatting anything.
2303 # FIXME: check if device is already formatted.
2308 def get_real_name(self):
2309 return self.real.name
2310 def get_cache_name(self):
2311 return self.cache.name
2314 self.cache.prepare()
2315 if not config.record and is_prepared(self.name):
2317 self.info(self.real_uuid, self.cache_uuid)
2318 lctl.newdev("cobd", self.name, self.uuid,
2319 setup ="%s %s" %(self.real.name,
2323 if is_prepared(self.name):
2324 Module.cleanup(self)
2326 self.cache.cleanup()
2328 def load_module(self):
2329 self.real.load_module()
2330 Module.load_module(self)
2332 def cleanup_module(self):
2333 Module.cleanup_module(self)
2334 self.real.cleanup_module()
2336 # virtual interface for OSC and LOV
# NOTE(review): damaged extraction -- the 'class VOSC(Module):' header
# (fused number 2337) and several delegating methods' bodies are missing.
# Purpose (from visible code): virtual OSC -- dispatches on the db class
# to wrap a LOV, COBD or plain OSC behind one uniform interface.
2338 def __init__(self, db, client_uuid, name, name_override = None):
2339 Module.__init__(self, 'VOSC', db)
2340 if db.get_class() == 'lov':
2341 self.osc = LOV(db, client_uuid, name, name_override)
2343 elif db.get_class() == 'cobd':
2344 self.osc = COBD(db, client_uuid, name, 'obd')
2347 self.osc = OSC(db, client_uuid, name)
2350 return self.osc.get_uuid()
2352 return self.osc.get_name()
2357 def load_module(self):
2358 self.osc.load_module()
2359 def cleanup_module(self):
2360 self.osc.cleanup_module()
2361 def correct_level(self, level, op=None):
2362 return self.osc.correct_level(level, op)
2364 # virtual interface for MDC and LMV
# NOTE(review): damaged extraction -- the 'class VMDC(Module):' header
# (fused number ~2365) and some delegating method bodies are missing.
# Purpose (from visible code): virtual MDC -- dispatches on the db class
# to wrap an LMV, COBD or plain MDC behind one uniform interface.
2366 def __init__(self, db, client_uuid, name, name_override = None):
2367 Module.__init__(self, 'VMDC', db)
2368 if db.get_class() == 'lmv':
2369 self.mdc = LMV(db, client_uuid, name)
2370 elif db.get_class() == 'cobd':
2371 self.mdc = COBD(db, client_uuid, name, 'mds')
2373 self.mdc = MDC(db, client_uuid, name)
2375 return self.mdc.uuid
2377 return self.mdc.name
2382 def load_module(self):
2383 self.mdc.load_module()
2384 def cleanup_module(self):
2385 self.mdc.cleanup_module()
2386 def correct_level(self, level, op=None):
2387 return self.mdc.correct_level(level, op)
# NOTE(review): damaged extraction -- fused line numbers, missing interior
# lines; not runnable as-is.
# Purpose (from visible code): echo client device for testing -- wraps the
# referenced obd in a VOSC and creates an 'echo_client' device over it.
2389 class ECHO_CLIENT(Module):
2390 def __init__(self,db):
2391 Module.__init__(self, 'ECHO_CLIENT', db)
2392 self.add_lustre_module('obdecho', 'obdecho')
2393 self.obd_uuid = self.db.get_first_ref('obd')
2394 obd = self.db.lookup(self.obd_uuid)
2395 self.uuid = generate_client_uuid(self.name)
2396 self.osc = VOSC(obd, self.uuid, self.name)
2399 if not config.record and is_prepared(self.name):
2402 self.osc.prepare() # XXX This is so cheating. -p
2403 self.info(self.obd_uuid)
2405 lctl.newdev("echo_client", self.name, self.uuid,
2406 setup = self.osc.get_name())
2409 if is_prepared(self.name):
2410 Module.cleanup(self)
2413 def load_module(self):
2414 self.osc.load_module()
2415 Module.load_module(self)
2417 def cleanup_module(self):
2418 Module.cleanup_module(self)
2419 self.osc.cleanup_module()
2421 def correct_level(self, level, op=None):
def generate_client_uuid(name):
    """Return a pseudo-random client uuid derived from *name*.

    Layout: 5 hex digits, '_', up to 19 characters of *name* (the '%.19s'
    precision truncates longer names), then 10 more hex digits; the result
    is clipped to the 36-character uuid field width.  random.random() is
    fine here -- the value only needs to be unique-ish per run, not
    unguessable.

    NOTE(review): the extracted source had lost the line supplying *name*
    to the format string (the '%.19s' had no matching argument); restored
    here.
    """
    client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
                                           name,
                                           int(random.random() * 1048576),
                                           int(random.random() * 1048576))
    return client_uuid[:36]
# NOTE(review): damaged extraction -- fused line numbers, missing interior
# lines; not runnable as-is.
# Purpose (from visible code): client mountpoint -- builds the VOSC/VMDC
# client stack for a filesystem and mounts/unmounts lustre_lite at path.
2431 class Mountpoint(Module):
2432 def __init__(self,db):
2433 Module.__init__(self, 'MTPT', db)
2434 self.path = self.db.get_val('path')
2435 self.fs_uuid = self.db.get_first_ref('filesystem')
2436 fs = self.db.lookup(self.fs_uuid)
# Prefer an LMV reference; fall back to a plain MDS.
2437 self.mds_uuid = fs.get_first_ref('lmv')
2438 if not self.mds_uuid:
2439 self.mds_uuid = fs.get_first_ref('mds')
2440 self.obd_uuid = fs.get_first_ref('obd')
2441 self.mgmt_uuid = fs.get_first_ref('mgmt')
2442 client_uuid = generate_client_uuid(self.name)
2444 ost = self.db.lookup(self.obd_uuid)
2446 panic("no ost: ", self.obd_uuid)
2448 mds = self.db.lookup(self.mds_uuid)
2450 panic("no mds: ", self.mds_uuid)
2452 self.add_lustre_module('mdc', 'mdc')
2453 self.add_lustre_module('lmv', 'lmv')
2454 self.add_lustre_module('llite', 'llite')
2456 self.vosc = VOSC(ost, client_uuid, self.name)
2457 self.vmdc = VMDC(mds, client_uuid, self.name)
2460 self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
2466 if not config.record and fs_is_mounted(self.path):
2467 log(self.path, "already mounted.")
2471 self.mgmtcli.prepare()
2474 vmdc_name = self.vmdc.get_name()
2476 self.info(self.path, self.mds_uuid, self.obd_uuid)
# When recording or dumping, emit the mount option instead of mounting.
2477 if config.record or config.lctl_dump:
2478 lctl.mount_option(local_node_name, self.vosc.get_name(), vmdc_name)
2480 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s %s %s" % \
2481 (self.vosc.get_name(), vmdc_name, config.config, self.path)
2482 run("mkdir", self.path)
2487 panic("mount failed:", self.path, ":", string.join(val))
2490 self.info(self.path, self.mds_uuid,self.obd_uuid)
2492 if config.record or config.lctl_dump:
2493 lctl.del_mount_option(local_node_name)
# Unmount, with -f first (presumably under --force -- the guard line is
# among the missing lines; confirm against pristine source).
2495 if fs_is_mounted(self.path):
2497 (rc, out) = run("umount", "-f", self.path)
2499 (rc, out) = run("umount", self.path)
2501 raise CommandError('umount', out, rc)
2503 if fs_is_mounted(self.path):
2504 panic("fs is still mounted:", self.path)
2509 self.mgmtcli.cleanup()
2511 def load_module(self):
2513 self.mgmtcli.load_module()
2514 self.vosc.load_module()
2515 Module.load_module(self)
2517 def cleanup_module(self):
2518 Module.cleanup_module(self)
2519 self.vosc.cleanup_module()
2521 self.mgmtcli.cleanup_module()
2523 def correct_level(self, level, op=None):
2526 # ============================================================
2527 # misc query functions
# NOTE(review): damaged extraction -- the srv_list initialization, the
# 'if not node:' guard and the final return (fused numbers 2530-2532,
# 2536, 2542-2544) are missing.
# Purpose (from visible code): collect Network objects for every network
# of the node hosting the given osd.  'self' is the db object here --
# this function is used as a db query helper, not a bound method.
2529 def get_ost_net(self, osd_uuid):
2533 osd = self.lookup(osd_uuid)
2534 node_uuid = osd.get_first_ref('node')
2535 node = self.lookup(node_uuid)
# NOTE(review): 'node_uuid_' (trailing underscore) looks like a typo for
# 'node_uuid' -- it would NameError if this panic path ran; confirm.
2537 panic("unable to find node for osd_uuid:", osd_uuid,
2538 " node_ref:", node_uuid_)
2539 for net_uuid in node.get_networks():
2540 db = node.lookup(net_uuid)
2541 srv_list.append(Network(db))
2545 # the order of initialization is based on level.
# NOTE(review): damaged extraction -- every 'ret = <level>' assignment and
# the final clamp/return lines are among the missing lines, leaving only
# the type dispatch skeleton.  Purpose (from visible code): map a service
# class to its numeric startup level, panic on unknown types, and filter
# by config.minlevel/maxlevel.
2546 def getServiceLevel(self):
2547 type = self.get_class()
2549 if type in ('network',):
2551 elif type in ('routetbl',):
2553 elif type in ('ldlm',):
2555 elif type in ('mgmt',):
2557 elif type in ('osd', 'cobd'):
2559 elif type in ('mdsdev',):
2561 elif type in ('lmv',):
2563 elif type in ('cmobd',):
2565 elif type in ('mountpoint', 'echoclient'):
2568 panic("Unknown type: ", type)
2570 if ret < config.minlevel or ret > config.maxlevel:
2575 # return list of services in a profile. list is a list of tuples
2576 # [(level, db_object),]
# NOTE(review): damaged extraction -- the list initialization, the
# 'if servdb:' guard and the return (fused numbers 2578, 2581, 2583,
# 2585, 2587-2588) are missing.
# Purpose (from visible code): resolve every reference in a profile to
# (level, db_object) pairs, panicking on dangling references.
2577 def getServices(self):
2579 for ref_class, ref_uuid in self.get_all_refs():
2580 servdb = self.lookup(ref_uuid)
2582 level = getServiceLevel(servdb)
2584 list.append((level, servdb))
2586 panic('service not found: ' + ref_uuid)
2592 ############################################################
2594 # FIXME: clean this mess up!
2596 # OSC is no longer in the xml, so we have to fake it.
2597 # this is getting ugly and begging for another refactoring
def get_osc(ost_db, uuid, fs_name):
    """Construct and return an OSC client for *ost_db*.

    OSC entries are no longer present in the XML config, so the client
    object is fabricated here from the OST's database entry.

    NOTE(review): the extracted source had lost the 'return osc' line;
    restored here.
    """
    osc = OSC(ost_db, uuid, fs_name)
    return osc
def get_mdc(db, fs_name, mds_uuid):
    """Look up *mds_uuid* in *db* and return an MDC client for it.

    Reports via error() when the MDS entry cannot be found.

    NOTE(review): the extracted source had lost the 'if not mds_db:'
    guard and the 'return mdc' line; restored here.
    """
    mds_db = db.lookup(mds_uuid)
    if not mds_db:
        error("no mds:", mds_uuid)
    mdc = MDC(mds_db, mds_uuid, fs_name)
    return mdc
2609 ############################################################
2610 # routing ("rooting")
2611 # list of (nettype, cluster_id, nid)
# NOTE(review): damaged extraction -- the line binding 'srv' (presumably
# 'srv = Network(net)'), the port guard and the tail of the
# AcceptorHandler call (fused numbers 2618, 2621, 2626-2627) are missing.
# Purpose (from visible code): record this node's (nettype, cluster_id,
# nid) triples in the global local_clusters list and register an acceptor
# per listening port, panicking on duplicate ports.
2614 def find_local_clusters(node_db):
2615 global local_clusters
2616 for netuuid in node_db.get_networks():
2617 net = node_db.lookup(netuuid)
2619 debug("add_local", netuuid)
2620 local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
2622 if acceptors.has_key(srv.port):
2623 panic("duplicate port:", srv.port)
2624 acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
2625 srv.send_mem, srv.recv_mem,
2628 # This node is a gateway.
def node_is_router():
    """Return the router flag for this node.

    NOTE(review): the extraction lost this function's body; the one-line
    return of the module-global 'is_router' flag is reconstructed --
    confirm against pristine source.
    """
    return is_router

# If there are any routers found in the config, then this will be true
# and all nodes will load kptlrouter.
def node_needs_router():
    """True when the portals router must be loaded on this node: the
    config declared routers elsewhere (needs_router) or this node is
    itself a router (is_router)."""
    if needs_router:
        return needs_router
    return is_router
2639 # list of (nettype, gw, tgt_cluster_id, lo, hi)
2640 # Currently, these local routes are only added to kptlrouter route
2641 # table if they are needed to connect to a specific server. This
2642 # should be changed so all available routes are loaded, and the
2643 # ptlrouter can make all the decisions.
# NOTE(review): damaged extraction -- the docstring tail, the
# local_routes initialization, the loop header over the node list and the
# needs_router flagging (fused numbers 2648, 2650, 2652, 2654, 2656,
# 2662-2663) are missing.
# Purpose (from visible code): scan all config nodes marked 'router',
# find a gateway nid on each local (nettype, cluster_id) pair, and
# collect that router's routes into the global local_routes list.
2646 def find_local_routes(lustre):
2647 """ Scan the lustre config looking for routers .  Build list of
2649 global local_routes, needs_router
2651 list = lustre.lookup_class('node')
2653 if router.get_val_int('router', 0):
2655 for (local_type, local_cluster_id, local_nid) in local_clusters:
2657 for netuuid in router.get_networks():
2658 db = router.lookup(netuuid)
2659 if (local_type == db.get_val('nettype') and
2660 local_cluster_id == db.get_val('clusterid')):
2661 gw = db.get_val('nid')
2664 debug("find_local_routes: gw is", gw)
2665 for route in router.get_local_routes(local_type, gw):
2666 local_routes.append(route)
2667 debug("find_local_routes:", local_routes)
def choose_local_server(srv_list):
    """Return the first server in *srv_list* on one of this node's local
    clusters, or None when no server is directly reachable.

    NOTE(review): the extraction lost the 'return srv' line inside the
    loop; restored here.
    """
    for srv in srv_list:
        if local_cluster(srv.net_type, srv.cluster_id):
            return srv
    return None
def local_cluster(net_type, cluster_id):
    """Return 1 when (net_type, cluster_id) matches one of this node's
    local clusters (global local_clusters, filled by
    find_local_clusters), else 0.

    NOTE(review): the extraction lost the 'return 1' / 'return 0' lines;
    restored here.
    """
    for cluster in local_clusters:
        if net_type == cluster[0] and cluster_id == cluster[1]:
            return 1
    return 0
def local_interface(net_type, cluster_id, nid):
    """Return 1 when (net_type, cluster_id, nid) names one of this
    node's own interfaces in the global local_clusters list, else 0.

    NOTE(review): the extraction lost the 'return 1' / 'return 0' lines;
    restored here.
    """
    for cluster in local_clusters:
        if (net_type == cluster[0] and cluster_id == cluster[1]
            and nid == cluster[2]):
            return 1
    return 0
def find_route(srv_list):
    """Return [(srv, route), ...] for every server in *srv_list*
    reachable via one of this node's local routes.

    A route tuple is (nettype, gw, tgt_cluster_id, lo, hi); a server
    matches when its nid falls within [lo, hi] and its cluster id equals
    the route's target cluster id.

    NOTE(review): the extraction lost the result-list initialization, the
    'to = srv.nid' binding and the final return; reconstructed --
    confirm against pristine source.
    """
    result = []
    # frm_type mirrors the original code; it is not referenced below.
    frm_type = local_clusters[0][0]
    for srv in srv_list:
        debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
        to_type = srv.net_type
        to = srv.nid
        cluster_id = srv.cluster_id
        debug('looking for route to', to_type, to)
        for r in local_routes:
            debug("find_route: ", r)
            if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
                result.append((srv, r))
    return result
def get_active_target(db):
    """Return the uuid of the currently-active target device for *db*.

    When a node was selected for this target (via get_select, i.e. the
    --select option), that node's target device is used; otherwise the
    config's 'active' reference is the answer.

    NOTE(review): the extraction lost the if/else scaffolding and the
    final return; reconstructed -- confirm against pristine source.
    """
    target_uuid = db.getUUID()
    target_name = db.getName()
    node_name = get_select(target_name)
    if node_name:
        tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
    else:
        tgt_dev_uuid = db.get_first_ref('active')
    return tgt_dev_uuid
def get_server_by_nid_uuid(db, nid_uuid):
    """Return the Network whose nid_uuid matches *nid_uuid*, or None.

    NOTE(review): the extraction lost the 'net = Network(n)' binding,
    the 'return net' and the fall-through return; reconstructed --
    confirm against pristine source.
    """
    for n in db.lookup_class("network"):
        net = Network(n)
        if net.nid_uuid == nid_uuid:
            return net
    return None
2720 ############################################################
# NOTE(review): damaged extraction -- the enclosing 'def' line itself
# (presumably 'def newService(db):', fused number 2723) and most of the
# 'n = ...' constructor lines are missing; only the type-dispatch
# skeleton survives.  Purpose (from visible code): factory mapping a
# config db entry's class to the corresponding Module subclass instance.
2724 type = db.get_class()
2725 debug('Service:', type, db.getName(), db.getUUID())
2730 n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
2731 elif type == 'network':
2733 elif type == 'routetbl':
2737 elif type == 'cobd':
2738 n = COBD(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
2739 elif type == 'cmobd':
2741 elif type == 'mdsdev':
2743 elif type == 'mountpoint':
2745 elif type == 'echoclient':
2747 elif type == 'mgmt':
2752 panic ("unknown service type:", type)
2756 # Prepare the system to run lustre using a particular profile
2757 # in the configuration.
2758 # * load the modules
2759 # * set up networking for the current node
2760 # * make sure partitions are in place and prepared
2761 # * initialize devices with lctl
2762 # Levels are important, and need to be enforced.
# Look up each profile UUID in prof_list, collect its services, and apply
# 'operation' (doModules/doSetup/doCleanup/...) to them.
# NOTE(review): panic() references 'profile', which is not defined here —
# it presumably should be 'prof_uuid'; the guard ('if not prof_db:') and the
# call to operation(services) are elided from this listing.
2763 def for_each_profile(db, prof_list, operation):
2764 for prof_uuid in prof_list:
2765 prof_db = db.lookup(prof_uuid)
2767 panic("profile:", profile, "not found.")
2768 services = getServices(prof_db)
# Locate the OSC for an update record: determine the lov uuid/name either
# from the passed-in lov object or by chasing obd_ref -> filesystem ->
# filesystem_ref -> mountpoint through the raw XML, then look up the OST
# referenced by the record and build its OSC.
# NOTE(review): several guard/else lines are elided in this listing, and the
# final panic references 'obd_uuid', which does not appear to be defined
# here — likely meant 'ost_uuid'.
2771 def magic_get_osc(db, rec, lov):
2773 lov_uuid = lov.get_uuid()
2774 lov_name = lov.osc.fs_name
2776 lov_uuid = rec.getAttribute('lov_uuidref')
2777 # FIXME: better way to find the mountpoint?
2778 filesystems = db.root_node.getElementsByTagName('filesystem')
2780 for fs in filesystems:
2781 ref = fs.getElementsByTagName('obd_ref')
2782 if ref[0].getAttribute('uuidref') == lov_uuid:
2783 fsuuid = fs.getAttribute('uuid')
2787 panic("malformed xml: lov uuid '" + lov_uuid + "' referenced in 'add' record is not used by any filesystems.")
2789 mtpts = db.root_node.getElementsByTagName('mountpoint')
2792 ref = fs.getElementsByTagName('filesystem_ref')
2793 if ref[0].getAttribute('uuidref') == fsuuid:
2794 lov_name = fs.getAttribute('name')
2798 panic("malformed xml: 'add' record references lov uuid '" + lov_uuid + "', which references filesystem uuid '" + fsuuid + "', which does not reference a mountpoint.")
2800 print "lov_uuid: " + lov_uuid + "; lov_name: " + lov_name
2802 ost_uuid = rec.getAttribute('ost_uuidref')
2803 obd = db.lookup(ost_uuid)
2806 panic("malformed xml: 'add' record references ost uuid '" + ost_uuid + "' which cannot be found.")
2808 osc = get_osc(obd, lov_uuid, lov_name)
2810 panic('osc not found:', obd_uuid)
2813 # write logs for update records. sadly, logs of all types -- and updates in
2814 # particular -- are something of an afterthought. lconf needs to be rewritten
2815 # with these as core concepts. so this is a pretty big hack.
# Replay one <update> element from the config XML: for each child record
# ('add', 'deactivate', 'delete'), resolve the LOV name and drive lctl to
# add/remove the OST from the LOV, preparing or cleaning up the OSC as
# needed.  Prints (rather than panics) on CommandError so one failed OSC
# does not abort the whole update replay.
# NOTE(review): this listing elides many lines (try:/else:/continue and the
# lctl deactivate/cleanup calls); the structure below is partial.
2816 def process_update_record(db, update, lov):
2817 for rec in update.childNodes:
2818 if rec.nodeType != rec.ELEMENT_NODE:
2821 log("found "+rec.nodeName+" record in update version " +
2822 str(update.getAttribute('version')))
2824 lov_uuid = rec.getAttribute('lov_uuidref')
2825 ost_uuid = rec.getAttribute('ost_uuidref')
2826 index = rec.getAttribute('index')
2827 gen = rec.getAttribute('generation')
2829 if not lov_uuid or not ost_uuid or not index or not gen:
2830 panic("malformed xml: 'update' record requires lov_uuid, ost_uuid, index, and generation.")
2833 tmplov = db.lookup(lov_uuid)
2835 panic("malformed xml: 'delete' record contains lov UUID '" + lov_uuid + "', which cannot be located.")
2836 lov_name = tmplov.getName()
2838 lov_name = lov.osc.name
2840 # ------------------------------------------------------------- add
2841 if rec.nodeName == 'add':
2843 lctl.lov_del_obd(lov_name, lov_uuid, ost_uuid, index, gen)
2846 osc = magic_get_osc(db, rec, lov)
2849 # Only ignore connect failures with --force, which
2850 # isn't implemented here yet.
2851 osc.prepare(ignore_connect_failure=0)
2852 except CommandError, e:
2853 print "Error preparing OSC %s\n" % osc.uuid
2856 lctl.lov_add_obd(lov_name, lov_uuid, ost_uuid, index, gen)
2858 # ------------------------------------------------------ deactivate
2859 elif rec.nodeName == 'deactivate':
2863 osc = magic_get_osc(db, rec, lov)
2867 except CommandError, e:
2868 print "Error deactivating OSC %s\n" % osc.uuid
2871 # ---------------------------------------------------------- delete
2872 elif rec.nodeName == 'delete':
2876 osc = magic_get_osc(db, rec, lov)
2882 except CommandError, e:
2883 print "Error cleaning up OSC %s\n" % osc.uuid
2886 lctl.lov_del_obd(lov_name, lov_uuid, ost_uuid, index, gen)
# Replay every non-empty <update> element in the config XML, recording each
# one into its own versioned config log ("<log_name>-<version>") on
# log_device via lctl clear_log/record, then delegating the record contents
# to process_update_record().
# NOTE(review): the loop header over 'updates' (binding 'u') and the
# lctl.end_record()/continue lines are elided from this listing.
2888 def process_updates(db, log_device, log_name, lov = None):
2889 updates = db.root_node.getElementsByTagName('update')
2891 if not u.childNodes:
2892 log("ignoring empty update record (version " +
2893 str(u.getAttribute('version')) + ")")
2896 version = u.getAttribute('version')
2897 real_name = "%s-%s" % (log_name, version)
2898 lctl.clear_log(log_device, real_name)
2899 lctl.record(log_device, real_name)
2901 process_update_record(db, u, lov)
# --write_conf pass: instantiate a service object for each 'mdsdev' entry in
# the profile's services so its config can be (re)written on the MDS.
# NOTE(review): the loop header over 'services' (binding 's') and the call
# using 'n' (presumably n.write_conf()) are elided from this listing.
2905 def doWriteconf(services):
2909 if s[1].get_class() == 'mdsdev':
2910 n = newService(s[1])
# Setup pass: build (level, service) pairs for every service, recompute each
# service's effective level via correct_level(), and (on elided lines) sort
# by level and call prepare() in ascending order so dependencies come up
# before their dependents.
# NOTE(review): loop headers, the sort, and the prepare calls are elided.
2913 def doSetup(services):
2918 n = newService(s[1])
2920 slist.append((n.level, n))
2923 nl = n[1].correct_level(n[0])
2924 nlist.append((nl, n[1]))
# Module-load pass: instantiate each service and (on elided lines) have it
# load its kernel modules.
# NOTE(review): the nomod guard, loop header, and load_module call are
# elided from this listing.
2929 def doModules(services):
2933 n = newService(s[1])
# Cleanup pass: mirror of doSetup — build (level, service) pairs, correct
# levels, then (on elided lines) iterate in descending level order and call
# cleanup() on every service that reports safe_to_clean().
# NOTE(review): loop headers, the reverse sort, and the cleanup call are
# elided from this listing.
2936 def doCleanup(services):
2941 n = newService(s[1])
2943 slist.append((n.level, n))
2946 nl = n[1].correct_level(n[0])
2947 nlist.append((nl, n[1]))
2951 if n[1].safe_to_clean():
# Module-unload pass: instantiate each service and, when it reports
# safe_to_clean_modules(), unload its kernel modules (call elided).
# NOTE(review): the nomod guard and loop header are elided from this listing.
2954 def doUnloadModules(services):
2959 n = newService(s[1])
2960 if n.safe_to_clean_modules():
# Configure (or clean up) this host: find its node entry in the config db,
# read per-node settings (router flag, upcalls, timeout, debug masks),
# discover local clusters/routes, then walk the node's profiles in the mode
# selected on the command line:
#   --write_conf : load modules, write MDS config logs, unload modules
#   --recover    : reconnect a failed target via doRecovery()
#   --cleanup    : tear services down (set sysctls first unless only dumping)
#   default      : load modules, set sysctls/upcalls, set up services
# NOTE(review): this listing elides many lines (the hosts loop binding 'h',
# if/else and try scaffolding, config.gdb sleep, etc.); treat the branch
# structure shown here as partial.
2965 def doHost(lustreDB, hosts):
2966 global is_router, local_node_name
2969 node_db = lustreDB.lookup_name(h, 'node')
2973 panic('No host entry found.')
2975 local_node_name = node_db.get_val('name', 0)
2976 is_router = node_db.get_val_int('router', 0)
2977 lustre_upcall = node_db.get_val('lustreUpcall', '')
2978 portals_upcall = node_db.get_val('portalsUpcall', '')
2979 timeout = node_db.get_val_int('timeout', 0)
2980 ptldebug = node_db.get_val('ptldebug', '')
2981 subsystem = node_db.get_val('subsystem', '')
2983 find_local_clusters(node_db)
2985 find_local_routes(lustreDB)
2987 # Two step process: (1) load modules, (2) setup lustre
2988 # if not cleaning, load modules first.
2989 prof_list = node_db.get_refs('profile')
2991 if config.write_conf:
2993 for_each_profile(node_db, prof_list, doModules)
2995 for_each_profile(node_db, prof_list, doWriteconf)
2996 for_each_profile(node_db, prof_list, doUnloadModules)
2998 elif config.recover:
2999 if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
3000 raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
3001 "--client_uuid <UUID> --conn_uuid <UUID>")
3002 doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
3004 elif config.cleanup:
3006 # the command line can override this value
3008 # ugly hack, only need to run lctl commands for --dump
3009 if config.lctl_dump or config.record:
3010 for_each_profile(node_db, prof_list, doCleanup)
3013 sys_set_timeout(timeout)
3014 sys_set_ptldebug(ptldebug)
3015 sys_set_subsystem(subsystem)
3016 sys_set_lustre_upcall(lustre_upcall)
3017 sys_set_portals_upcall(portals_upcall)
3019 for_each_profile(node_db, prof_list, doCleanup)
3020 for_each_profile(node_db, prof_list, doUnloadModules)
3024 # ugly hack, only need to run lctl commands for --dump
3025 if config.lctl_dump or config.record:
3026 sys_set_timeout(timeout)
3027 sys_set_lustre_upcall(lustre_upcall)
3028 for_each_profile(node_db, prof_list, doSetup)
3032 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
3033 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
3035 for_each_profile(node_db, prof_list, doModules)
3037 sys_set_debug_path()
3038 sys_set_ptldebug(ptldebug)
3039 sys_set_subsystem(subsystem)
3040 script = config.gdb_script
3041 run(lctl.lctl, ' modules >', script)
3043 log ("The GDB module script is in", script)
3044 # pause, so user has time to break and
3047 sys_set_timeout(timeout)
3048 sys_set_lustre_upcall(lustre_upcall)
3049 sys_set_portals_upcall(portals_upcall)
3051 for_each_profile(node_db, prof_list, doSetup)
# Client recovery: look up the failed target, resolve its currently-active
# replacement (get_active_target), pick a locally-reachable server for it
# (choose_local_server), disconnect the old connection identified by
# nid_uuid (best-effort — a failed disconnect is only logged), then
# reconnect and tell lctl to recover the client onto the new nid.
# Raises Lustre.LconfError when the target, an active target, or a usable
# connection cannot be found.
# NOTE(review): guard 'if not ...' lines, the try:/connect call around line
# 3077, and similar scaffolding are elided from this listing.
3054 def doRecovery(lustreDB, lctl, tgt_uuid, client_uuid, nid_uuid):
3055 tgt = lustreDB.lookup(tgt_uuid)
3057 raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
3058 new_uuid = get_active_target(tgt)
3060 raise Lustre.LconfError("doRecovery: no active target found for: " +
3062 net = choose_local_server(get_ost_net(lustreDB, new_uuid))
3064 raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
3066 log("Reconnecting", tgt_uuid, " to ", net.nid_uuid);
3068 oldnet = get_server_by_nid_uuid(lustreDB, nid_uuid)
3071 lctl.disconnect(oldnet)
3072 except CommandError, e:
3073 log("recover: disconnect", nid_uuid, "failed: ")
3078 except CommandError, e:
3079 log("recover: connect failed")
3082 lctl.recover(client_uuid, net.nid_uuid)
# In development mode, derive module search paths from the lconf binary's
# location: default config.lustre to the parent of the script's directory,
# then resolve config.portals relative to config.lustre (command-line
# --portals takes precedence over the PORTALS_DIR default).
3085 def setupModulePath(cmd, portals_dir = PORTALS_DIR):
3086 base = os.path.dirname(cmd)
3087 if development_mode():
3088 if not config.lustre:
3089 debug('using objdir module paths')
3090 config.lustre = (os.path.join(base, ".."))
3091 # normalize the portals dir, using command line arg if set
3093 portals_dir = config.portals
3094 dir = os.path.join(config.lustre, portals_dir)
3095 config.portals = dir
3096 debug('config.portals', config.portals)
3097 elif config.lustre and config.portals:
3099 # if --lustre and --portals, normalize portals
3100 # can ignore PORTALS_DIR here, since it is probably useless here
3101 config.portals = os.path.join(config.lustre, config.portals)
3102 debug('config.portals B', config.portals)
# Write val to /proc/sys/<path> (the write/close and any noexec guard are
# elided from this listing).
3104 def sysctl(path, val):
3105 debug("+ sysctl", path, val)
3109 fp = open(os.path.join('/proc/sys', path), 'w')
# Point the portals debug-dump path at config.debug_path.
3116 def sys_set_debug_path():
3117 sysctl('portals/debug_path', config.debug_path)
# Set the lustre recovery upcall via lctl.  Command-line --lustre_upcall
# (or the generic --upcall, on the elided elif branch) overrides the value
# from the node config.
3119 def sys_set_lustre_upcall(upcall):
3120 # the command overrides the value in the node config
3121 if config.lustre_upcall:
3122 upcall = config.lustre_upcall
3124 upcall = config.upcall
3126 lctl.set_lustre_upcall(upcall)
# Set the portals upcall via /proc/sys/portals/upcall.  Command-line
# --portals_upcall (or the generic --upcall, on the elided elif branch)
# overrides the value from the node config.
3128 def sys_set_portals_upcall(upcall):
3129 # the command overrides the value in the node config
3130 if config.portals_upcall:
3131 upcall = config.portals_upcall
3133 upcall = config.upcall
3135 sysctl('portals/upcall', upcall)
# Set the recovery timeout via lctl; a positive command-line --timeout
# overrides the node-config value, and non-positive/None values are ignored.
3137 def sys_set_timeout(timeout):
3138 # the command overrides the value in the node config
3139 if config.timeout and config.timeout > 0:
3140 timeout = config.timeout
3141 if timeout != None and timeout > 0:
3142 lctl.set_timeout(timeout)
# With --single_socket, disable socknal's typed-socket bundling so only one
# socket is used per peer.
3144 def sys_tweak_socknal ():
3145 if config.single_socket:
3146 sysctl("socknal/typed", 0)
# Tune Quadrics Elan interconnect latency by writing 1 to whichever of the
# known eventint_punt_loops proc files exists on this kernel.
# NOTE(review): the loop header binding 'p' over procfiles is elided.
3148 def sys_optimize_elan ():
3149 procfiles = ["/proc/elan/config/eventint_punt_loops",
3150 "/proc/qsnet/elan3/config/eventint_punt_loops",
3151 "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]
3153 if os.access(p, os.R_OK):
3154 run ("echo 1 > " + p)
# Set the portals debug mask: evaluate the symbolic expression (e.g.
# "trace|inode") against ptldebug_names and write the hex result to
# /proc/sys/portals/debug.  NameError (unknown symbol) is caught on the
# elided handler body.  Command-line --ptldebug overrides the argument.
3156 def sys_set_ptldebug(ptldebug):
3158 ptldebug = config.ptldebug
3161 val = eval(ptldebug, ptldebug_names)
3162 val = "0x%x" % (val)
3163 sysctl('portals/debug', val)
3164 except NameError, e:
# Set the portals subsystem debug mask, mirroring sys_set_ptldebug: evaluate
# the symbolic expression against subsystem_names and write the hex value to
# /proc/sys/portals/subsystem_debug.  Command-line --subsystem overrides.
3167 def sys_set_subsystem(subsystem):
3168 if config.subsystem:
3169 subsystem = config.subsystem
3172 val = eval(subsystem, subsystem_names)
3173 val = "0x%x" % (val)
3174 sysctl('portals/subsystem_debug', val)
3175 except NameError, e:
# Raise a kernel network-buffer limit (e.g. rmem_max/wmem_max) to at least
# 'max'.  NOTE(review): the read of the current value and the comparison
# guard are elided from this listing; only the write path is visible.
3178 def sys_set_netmem_max(path, max):
3179 debug("setting", path, "to at least", max)
3187 fp = open(path, 'w')
3188 fp.write('%d\n' %(max))
# Create the /dev/portals and /dev/obd character device nodes (major 10,
# minors 240/241) if they do not already exist.
3192 def sys_make_devices():
3193 if not os.access('/dev/portals', os.R_OK):
3194 run('mknod /dev/portals c 10 240')
3195 if not os.access('/dev/obd', os.R_OK):
3196 run('mknod /dev/obd c 10 241')
3199 # Add dir to the global PATH, if not already there.
3200 def add_to_path(new_dir):
3201 syspath = string.split(os.environ['PATH'], ':')
3202 if new_dir in syspath:
# Appends new_dir to os.environ['PATH'] when absent; the early 'return' for
# the already-present case is elided from this listing.
3204 os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
# Default location for debug dumps: /tmp/lustre-log, relocated under /r
# when that ramdisk/root overlay directory exists (returns elided here).
3206 def default_debug_path():
3207 path = '/tmp/lustre-log'
3208 if os.path.isdir('/r'):
# Default location for the generated gdb module script: /tmp/ogdb, relocated
# under /r when that directory exists (the plain-return branch is elided).
3213 def default_gdb_script():
3214 script = '/tmp/ogdb'
3215 if os.path.isdir('/r'):
3216 return '/r' + script
# Standard system directories that lconf's child commands rely on.
3221 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
3222 # ensure basic elements are in the system path
3223 def sanitise_path():
3224 for dir in DEFAULT_PATH:
3227 # global hack for the --select handling
# Parse --select arguments of the form service=node[,service2=node2] into
# the module-global tgt_select mapping consumed by get_select().
# NOTE(review): the outer loop over args and inner loop over 'list'
# (binding 'arg'/'entry') are elided from this listing.
3229 def init_select(args):
3230 # args = [service=nodeA,service2=nodeB service3=nodeC]
3233 list = string.split(arg, ',')
3235 srv, node = string.split(entry, '=')
3236 tgt_select[srv] = node
# Return the node selected for service 'srv' via --select, or (on an elided
# line) None when no override was given.
3238 def get_select(srv):
3239 if tgt_select.has_key(srv):
3240 return tgt_select[srv]
# Short aliases for the Lustre.Options argument kinds used in the table
# below; each entry is (name[,short], help[, kind[, default]]).
# NOTE(review): the 'lconf_options = [' opener (and the closing bracket,
# plus several continuation lines of multi-line entries) are elided from
# this listing — original line numbers jump 3247 -> 3249 and beyond.
3244 FLAG = Lustre.Options.FLAG
3245 PARAM = Lustre.Options.PARAM
3246 INTPARAM = Lustre.Options.INTPARAM
3247 PARAMLIST = Lustre.Options.PARAMLIST
3249 ('verbose,v', "Print system commands as they are run"),
3250 ('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM),
3251 ('config', "Cluster config name used for LDAP query", PARAM),
3252 ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
3253 ('node', "Load config for <nodename>", PARAM),
3254 ('cleanup,d', "Cleans up config. (Shutdown)"),
3255 ('force,f', "Forced unmounting and/or obd detach during cleanup",
3257 ('single_socket', "socknal option: only use one socket instead of bundle",
3259 ('failover',"""Used to shut down without saving state.
3260 This will allow this node to "give up" a service to a
3261 another node for failover purposes. This will not
3262 be a clean shutdown.""",
3264 ('gdb', """Prints message after creating gdb module script
3265 and sleeps for 5 seconds."""),
3266 ('noexec,n', """Prints the commands and steps that will be run for a
3267 config without executing them. This can used to check if a
3268 config file is doing what it should be doing""",
3269 ('nomod', "Skip load/unload module step."),
3270 ('nosetup', "Skip device setup/cleanup step."),
3271 ('reformat', "Reformat all devices (without question)"),
3272 ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
3273 ('mountfsoptions', "Additional options for mount fs command line", PARAM),
3274 ('dump', "Dump the kernel debug log to file before portals is unloaded",
3276 ('write_conf', "Save all the client config information on mds."),
3277 ('record', "Write config information on mds."),
3278 ('record_log', "Name of config record log.", PARAM),
3279 ('record_device', "MDS device name that will record the config commands",
3281 ('minlevel', "Minimum level of services to configure/cleanup",
3283 ('maxlevel', """Maximum level of services to configure/cleanup
3284 Levels are aproximatly like:
3289 70 - mountpoint, echo_client, osc, mdc, lov""",
3291 ('lustre', """Base directory of lustre sources. This parameter will
3292 cause lconf to load modules from a source tree.""", PARAM),
3293 ('portals', """Portals source directory. If this is a relative path,
3294 then it is assumed to be relative to lustre. """, PARAM),
3295 ('timeout', "Set recovery timeout", INTPARAM),
3296 ('upcall', "Set both portals and lustre upcall script", PARAM),
3297 ('lustre_upcall', "Set lustre upcall script", PARAM),
3298 ('portals_upcall', "Set portals upcall script", PARAM),
3299 ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
3300 ('ptldebug', "Set the portals debug level", PARAM),
3301 ('subsystem', "Set the portals debug subsystem", PARAM),
3302 ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
3303 ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
3304 # Client recovery options
3305 ('recover', "Recover a device"),
3306 ('group', "The group of devices to configure or cleanup", PARAM),
3307 ('tgt_uuid', "The failed target (required for recovery)", PARAM),
3308 ('client_uuid', "The failed client (required for recovery)", PARAM),
3309 ('conn_uuid', "The failed connection (required for recovery)", PARAM),
3311 ('inactive', """The name of an inactive service, to be ignored during
3312 mounting (currently OST-only). Can be repeated.""",
# Body of lconf's main entry point (the enclosing 'def main():' line is
# elided from this listing).  Flow: restore SIGCHLD, parse options, set up
# module paths, seed the PRNG from /dev/urandom (avoids identical seeds on
# time-synchronized cluster nodes), load the config from an XML file / HTTP
# URL / LDAP, validate its version, build the candidate node-name list, wire
# up the lctl interface (optionally in dump or record mode), then hand off
# to doHost(); in record mode, replay updates afterwards.
# NOTE(review): many try:/else:/sys.exit scaffolding lines are elided, and
# line 3431 passes 'db' where the surrounding code names the config
# 'lustreDB' — 'db' is presumably bound on an elided line; verify.
3317 global lctl, config, toplustreDB, CONFIG_FILE
3319 # in the upcall this is set to SIG_IGN
3320 signal.signal(signal.SIGCHLD, signal.SIG_DFL)
3322 cl = Lustre.Options("lconf", "config.xml", lconf_options)
3324 config, args = cl.parse(sys.argv[1:])
3325 except Lustre.OptionError, e:
3329 setupModulePath(sys.argv[0])
3331 host = socket.gethostname()
3333 # the PRNG is normally seeded with time(), which is not so good for starting
3334 # time-synchronized clusters
3335 input = open('/dev/urandom', 'r')
3337 print 'Unable to open /dev/urandom!'
3339 seed = input.read(32)
3345 init_select(config.select)
3348 # allow config to be fetched via HTTP, but only with python2
3349 if sys.version[0] != '1' and args[0].startswith('http://'):
3352 config_file = urllib2.urlopen(args[0])
3353 except (urllib2.URLError, socket.error), err:
3354 if hasattr(err, 'args'):
3356 print "Could not access '%s': %s" %(args[0], err)
3358 elif not os.access(args[0], os.R_OK):
3359 print 'File not found or readable:', args[0]
3363 config_file = open(args[0], 'r')
3365 dom = xml.dom.minidom.parse(config_file)
3367 panic("%s does not appear to be a config file." % (args[0]))
3368 sys.exit(1) # make sure to die here, even in debug mode.
3370 CONFIG_FILE = args[0]
3371 lustreDB = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
3372 if not config.config:
3373 config.config = os.path.basename(args[0])# use full path?
3374 if config.config[-4:] == '.xml':
3375 config.config = config.config[:-4]
3376 elif config.ldapurl:
3377 if not config.config:
3378 panic("--ldapurl requires --config name")
3379 dn = "config=%s,fs=lustre" % (config.config)
3380 lustreDB = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
3381 elif config.ptldebug or config.subsystem:
3382 sys_set_ptldebug(None)
3383 sys_set_subsystem(None)
3386 print 'Missing config file or ldap URL.'
3387 print 'see lconf --help for command summary'
3390 toplustreDB = lustreDB
3392 ver = lustreDB.get_version()
3394 panic("No version found in config data, please recreate.")
3395 if ver != Lustre.CONFIG_VERSION:
3396 panic("Config version", ver, "does not match lconf version",
3397 Lustre.CONFIG_VERSION)
3401 node_list.append(config.node)
3404 node_list.append(host)
3405 node_list.append('localhost')
3407 debug("configuring for host: ", node_list)
3410 config.debug_path = config.debug_path + '-' + host
3411 config.gdb_script = config.gdb_script + '-' + host
3413 lctl = LCTLInterface('lctl')
3415 if config.lctl_dump:
3416 lctl.use_save_file(config.lctl_dump)
3419 if not (config.record_device and config.record_log):
3420 panic("When recording, both --record_log and --record_device must be specified.")
3421 lctl.clear_log(config.record_device, config.record_log)
3422 lctl.record(config.record_device, config.record_log)
3424 doHost(lustreDB, node_list)
3426 if not config.record:
3431 process_updates(db, config.record_device, config.record_log)
# Script entry: run main() (call elided), turning known configuration and
# command errors into a message rather than a traceback, and propagate any
# recorded cleanup failure as the process exit status.
3433 if __name__ == "__main__":
3436 except Lustre.LconfError, e:
3438 # traceback.print_exc(file=sys.stdout)
3440 except CommandError, e:
3444 if first_cleanup_error:
3445 sys.exit(first_cleanup_error)