# Copyright (C) 2002-2003 Cluster File Systems, Inc.
# Authors: Robert Read <rread@clusterfs.com>
#          Mike Shaver <shaver@clusterfs.com>
# This file is part of Lustre, http://www.lustre.org.
#
# Lustre is free software; you can redistribute it and/or
# modify it under the terms of version 2 of the GNU General Public
# License as published by the Free Software Foundation.
#
# Lustre is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Lustre; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# lconf - lustre configuration tool
#
# lconf is the main driver script for starting and stopping
# lustre filesystem services.
#
# Based in part on the XML obdctl modifications done by Brian Behlendorf
import sys, getopt, types
import string, os, stat, popen2, socket, time, random, fcntl, select
import re, exceptions, signal, traceback
import xml.dom.minidom

if sys.version[0] == '1':
    from FCNTL import F_GETFL, F_SETFL
else:
    from fcntl import F_GETFL, F_SETFL
PYMOD_DIR = "/usr/lib/lustre/python"

def development_mode():
    base = os.path.dirname(sys.argv[0])
    if os.access(base+"/Makefile", os.R_OK):
        return 1
    return 0

if development_mode():
    sys.path.append('../utils')
else:
    sys.path.append(PYMOD_DIR)
DEFAULT_TCPBUF = 8388608

# Maximum number of devices to search for.
# (the /dev/loop* nodes need to be created beforehand)
MAX_LOOP_DEVICES = 256
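
# A sketch of how those /dev/loop* nodes could be pre-created from Python.
# make_loop_nodes is hypothetical (not part of lconf and never called here);
# it assumes os.mknod/os.makedev (Python >= 2.3), the standard loop device
# major number 7, and root privileges:
#
#   def make_loop_nodes(count=MAX_LOOP_DEVICES):
#       import stat
#       for i in range(count):
#           path = '/dev/loop%d' % i
#           if not os.path.exists(path):
#               os.mknod(path, 0600 | stat.S_IFBLK, os.makedev(7, i))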
PORTALS_DIR = 'portals'

# Needed to call lconf --record
CONFIG_FILE = ""
# Please keep these in sync with the values in portals/kp30.h
ptldebug_names = {
    "warning" :    (1 << 10),
    "portals" :    (1 << 14),
    "dlmtrace" :   (1 << 16),
    "rpctrace" :   (1 << 20),
    "vfstrace" :   (1 << 21),
    }

subsystem_names = {
    "undefined" :  (1 << 0),
    "portals" :    (1 << 10),
    "socknal" :    (1 << 11),
    "qswnal" :     (1 << 12),
    "pinger" :     (1 << 13),
    "filter" :     (1 << 14),
    "ptlrouter" :  (1 << 20),
    }
first_cleanup_error = 0
def cleanup_error(rc):
    global first_cleanup_error
    if not first_cleanup_error:
        first_cleanup_error = rc
# ============================================================
# debugging and error funcs

def fixme(msg = "this feature"):
    raise Lustre.LconfError, msg + ' not implemented yet.'
def panic(*args):
    msg = string.join(map(str,args))
    if not config.noexec:
        raise Lustre.LconfError(msg)
    else:
        print "! " + msg

def log(*args):
    msg = string.join(map(str,args))
    print msg

def logall(msgs):
    for s in msgs:
        print string.strip(s)

def debug(*args):
    if config.verbose:
        msg = string.join(map(str,args))
        print msg
# ack, python's builtin int() does not support '0x123' syntax.
# eval can do it, although what a hack!
def my_int(s):
    try:
        if s[0:2] == '0x':
            return eval(s, {}, {})
        else:
            return int(s)
    except SyntaxError, e:
        raise ValueError("not a number")
    except TypeError, e:
        raise ValueError("not a number")
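
# A simpler alternative sketch: int() with an explicit base of 0 also
# accepts '0x' hex prefixes, avoiding eval(). my_int_alt is a hypothetical
# helper, unused by this script, shown only for comparison:
#
#   def my_int_alt(s):
#       try:
#           return int(s, 0)
#       except (ValueError, TypeError):
#           raise ValueError("not a number")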
# ============================================================
# locally defined exceptions
class CommandError (exceptions.Exception):
    def __init__(self, cmd_name, cmd_err, rc=None):
        self.cmd_name = cmd_name
        self.cmd_err = cmd_err
        self.rc = rc

    def dump(self):
        if type(self.cmd_err) == types.StringType:
            if self.rc:
                print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
            else:
                print "! %s: %s" % (self.cmd_name, self.cmd_err)
        elif type(self.cmd_err) == types.ListType:
            if self.rc:
                print "! %s (error %d):" % (self.cmd_name, self.rc)
            else:
                print "! %s:" % (self.cmd_name)
            for s in self.cmd_err:
                print "> %s" %(string.strip(s))
        else:
            print self.cmd_err
# ============================================================
# handle daemons, like the acceptor
class DaemonHandler:
    """ Manage starting and stopping a daemon. Assumes daemon manages
    its own pid file. """
    def __init__(self, cmd):
        self.command = cmd
        self.path = ""

    def start(self):
        if self.running():
            log(self.command, "already running.")
        if not self.path:
            self.path = find_prog(self.command)
        if not self.path:
            panic(self.command, "not found.")
        ret, out = runcmd(self.path +' '+ self.command_line())
        if ret:
            raise CommandError(self.path, out, ret)

    def stop(self):
        if self.running():
            pid = self.read_pidfile()
            try:
                log ("killing process", pid)
                os.kill(pid, 15)
                #time.sleep(1) # let daemon die
            except OSError, e:
                log("unable to kill", self.command, e)
            if self.running():
                log("unable to kill", self.command)

    def running(self):
        pid = self.read_pidfile()
        if pid:
            try:
                os.kill(pid, 0)
            except OSError:
                self.clean_pidfile()
            else:
                return 1
        return 0
    def read_pidfile(self):
        try:
            fp = open(self.pidfile(), 'r')
            pid = int(fp.read())
            fp.close()
            return pid
        except IOError:
            return 0

    def clean_pidfile(self):
        """ Remove a stale pidfile """
        log("removing stale pidfile:", self.pidfile())
        try:
            os.unlink(self.pidfile())
        except OSError, e:
            log(self.pidfile(), e)
class AcceptorHandler(DaemonHandler):
    def __init__(self, port, net_type, send_mem, recv_mem, irq_aff):
        DaemonHandler.__init__(self, "acceptor")
        self.port = port
        self.flags = ''
        self.send_mem = send_mem
        self.recv_mem = recv_mem

        if irq_aff:
            self.flags = self.flags + ' -i'

    def pidfile(self):
        return "/var/run/%s-%d.pid" % (self.command, self.port)

    def command_line(self):
        return string.join(map(str,('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port)))
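
# Usage sketch (illustrative values only): an acceptor on TCP port 988
# would be managed like this, with its pid tracked in
# /var/run/acceptor-988.pid:
#
#   handler = AcceptorHandler(988, 'tcp', DEFAULT_TCPBUF, DEFAULT_TCPBUF, 0)
#   if not handler.running():
#       handler.start()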
acceptors = {}

# start the acceptors
def run_acceptors():
    if config.lctl_dump or config.record:
        return
    for port in acceptors.keys():
        daemon = acceptors[port]
        if not daemon.running():
            daemon.start()
def run_one_acceptor(port):
    if config.lctl_dump or config.record:
        return
    if acceptors.has_key(port):
        daemon = acceptors[port]
        if not daemon.running():
            daemon.start()
    else:
        panic("run_one_acceptor: No acceptor defined for port:", port)
def stop_acceptor(port):
    if acceptors.has_key(port):
        daemon = acceptors[port]
        if daemon.running():
            daemon.stop()
# ============================================================
# handle lctl interface
class LCTLInterface:
    """
    Manage communication with lctl
    """

    def __init__(self, cmd):
        """
        Initialize the class by finding the lctl binary.
        """
        self.lctl = find_prog(cmd)
        self.save_file = ''
        self.record_device = ''
        if not self.lctl:
            if config.noexec:
                debug('! lctl not found')
                self.lctl = 'lctl'
            else:
                raise CommandError('lctl', "unable to find lctl binary.")
    def use_save_file(self, file):
        self.save_file = file

    def record(self, dev_name, logname):
        log("Recording log", logname, "on", dev_name)
        self.record_device = dev_name
        self.record_log = logname

    def end_record(self):
        log("End recording log", self.record_log, "on", self.record_device)
        self.record_device = None
        self.record_log = None
    def set_nonblock(self, fd):
        fl = fcntl.fcntl(fd, F_GETFL)
        fcntl.fcntl(fd, F_SETFL, fl | os.O_NDELAY)
    def run(self, cmds):
        """
        run lctl
        the cmds are written to stdin of lctl
        lctl doesn't return errors when run in script mode, so
        we should modify the command line to accept multiple commands, or
        create complex command line options
        """
        cmd_line = self.lctl
        if self.save_file:
            cmds = '\n  dump ' + self.save_file + '\n' + cmds
        elif self.record_device:
            cmds = """
    device $%s
    record %s
    %s""" % (self.record_device, self.record_log, cmds)

        debug("+", cmd_line, cmds)
        if config.noexec: return (0, [])

        child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
        child.tochild.write(cmds + "\n")
        child.tochild.close()
        # print "LCTL:", cmds
        # From "Python Cookbook" from O'Reilly
        outfile = child.fromchild
        outfd = outfile.fileno()
        self.set_nonblock(outfd)
        errfile = child.childerr
        errfd = errfile.fileno()
        self.set_nonblock(errfd)

        outdata = errdata = ''
        outeof = erreof = 0
        while 1:
            ready = select.select([outfd,errfd],[],[]) # Wait for input
            if outfd in ready[0]:
                outchunk = outfile.read()
                if outchunk == '': outeof = 1
                outdata = outdata + outchunk
            if errfd in ready[0]:
                errchunk = errfile.read()
                if errchunk == '': erreof = 1
                errdata = errdata + errchunk
            if outeof and erreof: break
        # end of "borrowed" code

        ret = child.wait()
        if os.WIFEXITED(ret):
            rc = os.WEXITSTATUS(ret)
        else:
            rc = 0
        if rc or len(errdata):
            raise CommandError(self.lctl, errdata, rc)
        return rc, outdata
    def runcmd(self, *args):
        """
        run lctl using the command line
        """
        cmd = string.join(map(str,args))
        debug("+", self.lctl, cmd)
        rc, out = run(self.lctl, cmd)
        if rc:
            raise CommandError(self.lctl, out, rc)
        return rc, out
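
    # Usage sketch: in script mode, commands are fed to lctl on stdin,
    # e.g. (illustrative only):
    #
    #   lctl = LCTLInterface('lctl')
    #   lctl.run("""
    #   network tcp
    #   mynid localhost
    #   quit""")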
    def clear_log(self, dev, log):
        """ clear an existing log """
        cmds =  """
  device $%s
  probe
  clear_log %s
  quit """ % (dev, log)
        self.run(cmds)

    def network(self, net, nid):
        """ set mynid """
        cmds =  """
  network %s
  mynid %s
  quit """ % (net, nid)
        self.run(cmds)
    # create a new connection
    def add_uuid(self, net_type, uuid, nid):
        cmds = "\n  add_uuid %s %s %s" %(uuid, nid, net_type)
        self.run(cmds)

    def add_autoconn(self, net_type, send_mem, recv_mem, nid, hostaddr,
                     port, flags):
        if net_type in ('tcp',) and not config.lctl_dump:
            cmds =  """
  network %s
  send_mem %d
  recv_mem %d
  add_autoconn %s %s %d %s
  quit""" % (net_type,
             send_mem,
             recv_mem,
             nid, hostaddr, port, flags )
            self.run(cmds)
    def connect(self, srv):
        self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
        if srv.net_type in ('tcp',) and not config.lctl_dump:
            flags = 's'
            if srv.irq_affinity:
                flags = flags + 'i'
            self.add_autoconn(srv.net_type, srv.send_mem, srv.recv_mem,
                              srv.nid, srv.hostaddr, srv.port, flags)

    # Recover a device
    def recover(self, dev_name, new_conn):
        cmds = """
    device $%s
    recover %s""" %(dev_name, new_conn)
        self.run(cmds)
    # add a route to a range
    def add_route(self, net, gw, lo, hi):
        cmds =  """
  network %s
  add_route %s %s %s
  quit  """ % (net, gw, lo, hi)
        try:
            self.run(cmds)
        except CommandError, e:
            log ("ignore: ")
            e.dump()

    def del_route(self, net, gw, lo, hi):
        cmds =  """
  ignore_errors
  network %s
  del_route %s %s %s
  quit """ % (net, gw, lo, hi)
        self.run(cmds)
    # add a route to a host
    def add_route_host(self, net, uuid, gw, tgt):
        self.add_uuid(net, uuid, tgt)
        cmds =  """
  network %s
  add_route %s %s
  quit """ % (net, gw, tgt)
        try:
            self.run(cmds)
        except CommandError, e:
            log ("ignore: ")
            e.dump()

    # delete a route to a host
    def del_route_host(self, net, uuid, gw, tgt):
        self.del_uuid(uuid)
        cmds =  """
  ignore_errors
  network %s
  del_route %s %s
  quit """ % (net, gw, tgt)
        self.run(cmds)
    def del_autoconn(self, net_type, nid, hostaddr):
        if net_type in ('tcp',) and not config.lctl_dump:
            cmds =  """
  ignore_errors
  network %s
  del_autoconn %s %s
  quit""" % (net_type, nid, hostaddr)
            self.run(cmds)
    # disconnect one connection
    def disconnect(self, srv):
        self.del_uuid(srv.nid_uuid)
        if srv.net_type in ('tcp',) and not config.lctl_dump:
            self.del_autoconn(srv.net_type, srv.nid, srv.hostaddr)

    def del_uuid(self, uuid):
        cmds =  """
  ignore_errors
  del_uuid %s
  quit""" % (uuid,)
        self.run(cmds)

    def disconnectAll(self, net):
        cmds =  """
  ignore_errors
  network %s
  disconnect
  quit""" % (net)
        self.run(cmds)
    def attach(self, type, name, uuid):
        cmds = """
  attach %s %s %s
  quit""" % (type, name, uuid)
        self.run(cmds)

    def setup(self, name, setup = ""):
        cmds = """
  cfg_device %s
  setup %s
  quit""" % (name, setup)
        self.run(cmds)
    # create a new device with lctl
    def newdev(self, type, name, uuid, setup = ""):
        self.attach(type, name, uuid)
        try:
            self.setup(name, setup)
        except CommandError, e:
            self.cleanup(name, uuid, 0)
            raise e

    # cleanup a device
    def cleanup(self, name, uuid, force, failover = 0):
        if failover: force = 1
        cmds = """
  ignore_errors
  cfg_device $%s
  cleanup %s %s
  detach
  quit""" % (name, ('', 'force')[force],
             ('', 'failover')[failover])
        self.run(cmds)
    # create an lov
    def lov_setup(self, name, uuid, desc_uuid, mdsuuid, stripe_cnt,
                  stripe_sz, stripe_off, pattern):
        cmds = """
  attach lov %s %s
  lov_setup %s %d %d %d %s
  quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off, pattern)
        self.run(cmds)

    # add an OBD to a LOV
    def lov_add_obd(self, name, uuid, obd_uuid, index, gen):
        cmds = """
  lov_modify_tgts add %s %s %s %s
  quit""" % (name, obd_uuid, index, gen)
        self.run(cmds)

    # create an lmv
    def lmv_setup(self, name, uuid, desc_uuid, devlist):
        cmds = """
  attach lmv %s %s
  lmv_setup %s %s
  quit""" % (name, uuid, desc_uuid, devlist)
        self.run(cmds)

    # delete an OBD from a LOV
    def lov_del_obd(self, name, uuid, obd_uuid, index, gen):
        cmds = """
  lov_modify_tgts del %s %s %s %s
  quit""" % (name, obd_uuid, index, gen)
        self.run(cmds)
    # deactivate an OBD
    def deactivate(self, name):
        cmds = """
  device $%s
  deactivate
  quit""" % (name)
        self.run(cmds)

    # dump the kernel debug log to a file
    def dump(self, dump_file):
        cmds = """
  debug_kernel %s 1
  quit""" % (dump_file)
        self.run(cmds)
    # get list of devices
    def device_list(self):
        devices = '/proc/fs/lustre/devices'
        ret = []
        if os.access(devices, os.R_OK):
            try:
                fp = open(devices, 'r')
                ret = fp.readlines()
                fp.close()
            except IOError, e:
                log(e)
        return ret

    # get lustre version
    def lustre_version(self):
        rc, out = self.runcmd('version')
        return out
    def mount_option(self, profile, osc, mdc):
        cmds = """
  mount_option %s %s %s
  quit""" % (profile, osc, mdc)
        self.run(cmds)

    # delete mount options
    def del_mount_option(self, profile):
        cmds = """
  del_mount_option %s
  quit""" % (profile,)
        self.run(cmds)

    def set_timeout(self, timeout):
        cmds = """
  set_timeout %s
  quit""" % (timeout,)
        self.run(cmds)

    def set_lustre_upcall(self, upcall):
        cmds = """
  set_lustre_upcall %s
  quit""" % (upcall,)
        self.run(cmds)
# ============================================================
# Various system-level functions
# (ideally moved to their own module)

# Run a command and return the output and status.
# stderr is redirected to stdout via 2>&1; popen3 could be
# used to capture it separately if necessary.
def runcmd(cmd):
    debug ("+", cmd)
    if config.noexec: return (0, [])
    f = os.popen(cmd + ' 2>&1')
    out = f.readlines()
    ret = f.close()
    if ret:
        ret = ret >> 8
    else:
        ret = 0
    return (ret, out)

def run(*args):
    cmd = string.join(map(str,args))
    return runcmd(cmd)
# Run a command in the background.
def run_daemon(*args):
    cmd = string.join(map(str,args))
    debug("+", cmd)
    if config.noexec: return 0
    f = os.popen(cmd + ' 2>&1')
    ret = f.close()
    if ret:
        ret = ret >> 8
    else:
        ret = 0
    return ret
# Determine full path to use for an external command
# searches dirname(argv[0]) first, then PATH
def find_prog(cmd):
    syspath = string.split(os.environ['PATH'], ':')
    cmdpath = os.path.dirname(sys.argv[0])
    syspath.insert(0, cmdpath)
    if config.portals:
        syspath.insert(0, os.path.join(config.portals, 'utils/'))
    for d in syspath:
        prog = os.path.join(d,cmd)
        if os.access(prog, os.X_OK):
            return prog
    return ''
# Recursively look for file starting at base dir
def do_find_file(base, mod):
    fullname = os.path.join(base, mod)
    if os.access(fullname, os.R_OK):
        return fullname
    for d in os.listdir(base):
        dir = os.path.join(base,d)
        if os.path.isdir(dir):
            module = do_find_file(dir, mod)
            if module:
                return module
def find_module(src_dir, dev_dir, modname):
    modbase = src_dir +'/'+ dev_dir +'/'+ modname
    for modext in '.ko', '.o':
        module = modbase + modext
        try:
            if os.access(module, os.R_OK):
                return module
        except OSError:
            pass
    return None
# is the path a block device?
def is_block(path):
    s = ()
    try:
        s = os.stat(path)
    except OSError:
        return 0
    return stat.S_ISBLK(s[stat.ST_MODE])
# build fs according to type
def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
    block_cnt = ''
    jopt = ''
    iopt = ''
    if devsize:
        if devsize < 8000:
            panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
                  (dev, devsize))
        # devsize is in 1k, and fs block count is in 4k
        block_cnt = devsize/4

    if fstype in ('ext3', 'extN', 'ldiskfs'):
        # ext3 journal size is in megabytes
        if jsize == 0:
            if devsize == 0:
                if not is_block(dev):
                    ret, out = runcmd("ls -l %s" %dev)
                    devsize = int(string.split(out[0])[4]) / 1024
                else:
                    ret, out = runcmd("sfdisk -s %s" %dev)
                    devsize = int(out[0])
            if devsize > 1024 * 1024:
                jsize = ((devsize / 102400) * 4)
            if jsize > 400:
                jsize = 400
        if jsize: jopt = "-J size=%d" %(jsize,)
        if isize: iopt = "-I %d" %(isize,)
        mkfs = 'mkfs.ext2 -j -b 4096 '
        if not isblock or config.force:
            mkfs = mkfs + ' -F '
    elif fstype == 'reiserfs':
        # reiserfs journal size is in blocks
        if jsize: jopt = "--journal_size %d" %(jsize,)
        mkfs = 'mkreiserfs -ff'
    else:
        panic('unsupported fs type: ', fstype)

    if config.mkfsoptions != None:
        mkfs = mkfs + ' ' + config.mkfsoptions
    if mkfsoptions != None:
        mkfs = mkfs + ' ' + mkfsoptions
    (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt)
    if ret:
        panic("Unable to build fs:", dev, string.join(out))
    # enable hash tree indexing on the fs
    if fstype in ('ext3', 'extN', 'ldiskfs'):
        htree = 'echo "feature FEATURE_C5" | debugfs -w'
        (ret, out) = run (htree, dev)
        if ret:
            panic("Unable to enable htree:", dev)
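
# Illustrative example (not executed): for an ext3 target with
# devsize=2000000 (in KB), jsize=400 and isize=512 passed in, the command
# assembled above is roughly
#   mkfs.ext2 -j -b 4096 -J size=400 -I 512 /dev/sda1 500000
# where /dev/sda1 is a placeholder device and 500000 is the 4k block
# count derived from the 1k devsize.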
# some systems use /dev/loopN, some /dev/loop/N
def loop_base():
    loop = '/dev/loop'
    if not os.access(loop + str(0), os.R_OK):
        loop = loop + '/'
        if not os.access(loop + str(0), os.R_OK):
            panic ("can't access loop devices")
    return loop
# find loop device assigned to the file
def find_assigned_loop(file):
    loop = loop_base()
    for n in xrange(0, MAX_LOOP_DEVICES):
        dev = loop + str(n)
        if os.access(dev, os.R_OK):
            (stat, out) = run('losetup', dev)
            if out and stat == 0:
                m = re.search(r'\((.*)\)', out[0])
                if m and file == m.group(1):
                    return dev
    return ''
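
# Note: this relies on losetup's output format, e.g. (illustrative)
#   /dev/loop0: [0805]:123456 (/tmp/ost1-data)
# where the backing file appears in parentheses; the regex above pulls
# that path out for comparison against the requested file.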
# create file if necessary and assign the first free loop device
def init_loop(file, size, fstype, journal_size, inode_size,
              mkfsoptions, reformat, autoformat, backfstype, backfile):
    if fstype == 'smfs':
        realfile = backfile
        realfstype = backfstype
        if is_block(backfile):
            if reformat or (need_format(realfstype, backfile) and autoformat == 'yes'):
                mkfs(realfile, size, realfstype, journal_size, inode_size, mkfsoptions, isblock=0)
            return backfile
    else:
        realfile = file
        realfstype = fstype

    dev = find_assigned_loop(realfile)
    if dev:
        print 'WARNING file:', realfile, 'already mapped to', dev
        return dev

    if reformat or not os.access(realfile, os.R_OK | os.W_OK):
        if size < 8000:
            panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (realfile, size))
        (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, realfile))
        if ret:
            panic("Unable to create backing store:", realfile)
        mkfs(realfile, size, realfstype, journal_size, inode_size,
             mkfsoptions, isblock=0)

    loop = loop_base()
    # find next free loop
    for n in xrange(0, MAX_LOOP_DEVICES):
        dev = loop + str(n)
        if os.access(dev, os.R_OK):
            (stat, out) = run('losetup', dev)
            if stat:
                run('losetup', dev, realfile)
                return dev
        else:
            print "out of loop devices"
            return ''
    print "out of loop devices"
    return ''
# undo loop assignment
def clean_loop(file):
    dev = find_assigned_loop(file)
    if dev:
        ret, out = run('losetup -d', dev)
        if ret:
            log('unable to clean loop device:', dev, 'for file:', file)
            logall(out)

# determine if dev is formatted as a <fstype> filesystem
def need_format(fstype, dev):
    # FIXME don't know how to implement this
    return 0
# initialize a block device if needed
def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
              inode_size, mkfsoptions, backfstype, backdev):
    if config.noexec:
        return dev
    if fstype == 'smfs' or not is_block(dev):
        dev = init_loop(dev, size, fstype, journal_size, inode_size,
                        mkfsoptions, reformat, autoformat, backfstype, backdev)
    elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
        mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
             isblock=is_block(dev))
#    else:
#        panic("device:", dev,
#              "not prepared, and autoformat is not set.\n",
#              "Rerun with --reformat option to format ALL filesystems")
    return dev
def if2addr(iface):
    """lookup IP address for an interface"""
    rc, out = run("/sbin/ifconfig", iface)
    if rc or not out:
        return None
    addr = string.split(out[1])[1]
    ip = string.split(addr, ':')[1]
    return ip
def def_mount_options(fstype, target):
    """returns default mount options for passed fstype and target (mds, ost)"""
    if fstype == 'ext3' or fstype == 'ldiskfs':
        mountfsoptions = "errors=remount-ro"
        if target == 'ost' and sys_get_branch() == '2.4':
            mountfsoptions = "%s,asyncdel" % (mountfsoptions)
        return mountfsoptions
    return ""
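
# For example, given the branch checks above: on a 2.4 kernel
# def_mount_options('ext3', 'ost') returns "errors=remount-ro,asyncdel",
# while on 2.6 it returns just "errors=remount-ro".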
def sys_get_elan_position_file():
    procfiles = ["/proc/elan/device0/position",
                 "/proc/qsnet/elan4/device0/position",
                 "/proc/qsnet/elan3/device0/position"]
    for p in procfiles:
        if os.access(p, os.R_OK):
            return p
    return ""
def sys_get_local_nid(net_type, wildcard, cluster_id):
    """Return the local nid."""
    local = ""
    if sys_get_elan_position_file():
        local = sys_get_local_address('elan', '*', cluster_id)
    else:
        local = sys_get_local_address(net_type, wildcard, cluster_id)
    return local
def sys_get_local_address(net_type, wildcard, cluster_id):
    """Return the local address for the network type."""
    local = ""
    if net_type in ('tcp',):
        if ':' in wildcard:
            iface, star = string.split(wildcard, ':')
            local = if2addr(iface)
            if not local:
                panic ("unable to determine ip for:", wildcard)
        else:
            host = socket.gethostname()
            local = socket.gethostbyname(host)
    elif net_type == 'elan':
        # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()'
        f = sys_get_elan_position_file()
        if not f:
            panic ("unable to determine local Elan ID")
        try:
            fp = open(f, 'r')
            lines = fp.readlines()
            fp.close()
            for l in lines:
                a = string.split(l)
                if a[0] == 'NodeId':
                    elan_id = a[1]
                    break
            try:
                nid = my_int(cluster_id) + my_int(elan_id)
                local = "%d" % (nid)
            except ValueError, e:
                local = elan_id
        except IOError, e:
            log(e)
    elif net_type == 'gm':
        fixme("automatic local address for GM")
    return local
def sys_get_branch():
    """Returns kernel release"""
    try:
        fp = open('/proc/sys/kernel/osrelease')
        lines = fp.readlines()
        fp.close()
        for l in lines:
            version = string.split(l)
            a = string.split(version[0], '.')
            return a[0] + '.' + a[1]
    except IOError, e:
        log(e)
    return ""
def mod_loaded(modname):
    """Check if a module is already loaded. Look in /proc/modules for it."""
    try:
        fp = open('/proc/modules')
        lines = fp.readlines()
        fp.close()
        # please forgive my tired fingers for this one
        ret = filter(lambda word, mod=modname: word == mod,
                     map(lambda line: string.split(line)[0], lines))
        return ret
    except Exception, e:
        return 0
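
# The filter/map pair above is equivalent to this more explicit loop
# (mod_loaded_plain is a hypothetical rewrite, not used anywhere):
#
#   def mod_loaded_plain(modname):
#       for line in open('/proc/modules').readlines():
#           if string.split(line)[0] == modname:
#               return 1
#       return 0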
# XXX: instead of device_list, ask for $name and see what we get
def is_prepared(name):
    """Return true if a device exists for the name"""
    if config.lctl_dump:
        return 0
    if (config.noexec or config.record) and config.cleanup:
        return 1
    try:
        # expect this format:
        # 1 UP ldlm ldlm ldlm_UUID 2
        out = lctl.device_list()
        for s in out:
            if name == string.split(s)[3]:
                return 1
    except CommandError, e:
        e.dump()
    return 0
def is_network_prepared():
    """If any device exists, then assume that all networking
       has been configured"""
    out = lctl.device_list()
    return len(out) > 0
def fs_is_mounted(path):
    """Return true if path is a mounted lustre filesystem"""
    try:
        fp = open('/proc/mounts')
        lines = fp.readlines()
        fp.close()
        for l in lines:
            a = string.split(l)
            if a[1] == path and a[2] == 'lustre_lite':
                return 1
    except IOError, e:
        log(e)
    return 0
class kmod:
    """Manage kernel modules"""
    def __init__(self, lustre_dir, portals_dir):
        self.lustre_dir = lustre_dir
        self.portals_dir = portals_dir
        self.kmodule_list = []

    def add_portals_module(self, dev_dir, modname):
        """Append a module to list of modules to load."""
        self.kmodule_list.append((self.portals_dir, dev_dir, modname))

    def add_lustre_module(self, dev_dir, modname):
        """Append a module to list of modules to load."""
        self.kmodule_list.append((self.lustre_dir, dev_dir, modname))

    def load_module(self):
        """Load all the modules in the list in the order they appear."""
        for src_dir, dev_dir, mod in self.kmodule_list:
            if mod_loaded(mod) and not config.noexec:
                continue
            log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
            if src_dir:
                module = find_module(src_dir, dev_dir, mod)
                if not module:
                    panic('module not found:', mod)
                (rc, out) = run('/sbin/insmod', module)
                if rc:
                    raise CommandError('insmod', out, rc)
            else:
                (rc, out) = run('/sbin/modprobe', mod)
                if rc:
                    raise CommandError('modprobe', out, rc)
    def cleanup_module(self):
        """Unload the modules in the list in reverse order."""
        rev = self.kmodule_list[:] # copy so we don't reverse the list in place
        rev.reverse()
        for src_dir, dev_dir, mod in rev:
            if not mod_loaded(mod) and not config.noexec:
                continue
            # debug hack
            if mod == 'portals' and config.dump:
                lctl.dump(config.dump)
            log('unloading module:', mod)
            (rc, out) = run('/sbin/rmmod', mod)
            if rc:
                log('! unable to unload module:', mod)
                logall(out)
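
# Usage sketch: the kmodule_list preserves dependency order, so loads go
# forward and unloads go backward (paths below are illustrative):
#
#   k = kmod('/usr/src/lustre', '/usr/src/portals')
#   k.add_portals_module('libcfs', 'libcfs')
#   k.add_lustre_module('obdclass', 'obdclass')
#   k.load_module()     # insmod libcfs, then obdclass
#   k.cleanup_module()  # rmmod obdclass, then libcfs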
# ============================================================
# Classes to prepare and cleanup the various objects
#
class Module:
    """ Base class for the rest of the modules. The default cleanup method is
    defined here, as well as some utility funcs.
    """
    def __init__(self, module_name, db):
        self.db = db
        self.module_name = module_name
        self.name = self.db.getName()
        self.uuid = self.db.getUUID()
        self.kmod = kmod(config.lustre, config.portals)
    def info(self, *args):
        msg = string.join(map(str,args))
        print self.module_name + ":", self.name, self.uuid, msg

    def cleanup(self):
        """ default cleanup, used for most modules """
        self.info()
        try:
            lctl.cleanup(self.name, self.uuid, config.force)
        except CommandError, e:
            log(self.module_name, "cleanup failed: ", self.name)
            e.dump()
            cleanup_error(e.rc)
    def add_portals_module(self, dev_dir, modname):
        """Append a module to list of modules to load."""
        self.kmod.add_portals_module(dev_dir, modname)

    def add_lustre_module(self, dev_dir, modname):
        """Append a module to list of modules to load."""
        self.kmod.add_lustre_module(dev_dir, modname)

    def load_module(self):
        """Load all the modules in the list in the order they appear."""
        self.kmod.load_module()

    def cleanup_module(self):
        """Unload the modules in the list in reverse order."""
        if self.safe_to_clean():
            self.kmod.cleanup_module()

    def safe_to_clean(self):
        return 1

    def safe_to_clean_modules(self):
        return self.safe_to_clean()
class Network(Module):
    def __init__(self,db):
        Module.__init__(self, 'NETWORK', db)
        self.net_type = self.db.get_val('nettype')
        self.nid = self.db.get_val('nid', '*')
        self.cluster_id = self.db.get_val('clusterid', "0")
        self.port = self.db.get_val_int('port', 0)
        self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
        self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
        self.irq_affinity = self.db.get_val_int('irqaffinity', 0)

        if '*' in self.nid:
            self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
            if not self.nid:
                panic("unable to set nid for", self.net_type, self.nid, self.cluster_id)
            self.generic_nid = 1
            debug("nid:", self.nid)
        else:
            self.generic_nid = 0

        self.nid_uuid = self.nid_to_uuid(self.nid)

        self.hostaddr = self.db.get_val('hostaddr', self.nid)
        if '*' in self.hostaddr:
            self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id)
            if not self.hostaddr:
                panic("unable to set hostaddr for", self.net_type, self.hostaddr, self.cluster_id)
            debug("hostaddr:", self.hostaddr)

        self.add_portals_module("libcfs", 'libcfs')
        self.add_portals_module("portals", 'portals')
        if node_needs_router():
            self.add_portals_module("router", 'kptlrouter')
        if self.net_type == 'tcp':
            self.add_portals_module("knals/socknal", 'ksocknal')
        if self.net_type == 'elan':
            self.add_portals_module("knals/qswnal", 'kqswnal')
        if self.net_type == 'gm':
            self.add_portals_module("knals/gmnal", 'kgmnal')
    def nid_to_uuid(self, nid):
        return "NID_%s_UUID" %(nid,)

    def prepare(self):
        if not config.record and is_network_prepared():
            return
        self.info(self.net_type, self.nid, self.port)
        if not (config.record and self.generic_nid):
            lctl.network(self.net_type, self.nid)
        if self.net_type == 'tcp':
            sys_tweak_socknal()
        if self.net_type == 'elan':
            sys_optimize_elan()
        if self.port and node_is_router():
            run_one_acceptor(self.port)
            self.connect_peer_gateways()
    def connect_peer_gateways(self):
        for router in self.db.lookup_class('node'):
            if router.get_val_int('router', 0):
                for netuuid in router.get_networks():
                    net = self.db.lookup(netuuid)
                    gw = Network(net)
                    if (gw.cluster_id == self.cluster_id and
                        gw.net_type == self.net_type):
                        if gw.nid != self.nid:
                            lctl.connect(gw)
    def disconnect_peer_gateways(self):
        for router in self.db.lookup_class('node'):
            if router.get_val_int('router', 0):
                for netuuid in router.get_networks():
                    net = self.db.lookup(netuuid)
                    gw = Network(net)
                    if (gw.cluster_id == self.cluster_id and
                        gw.net_type == self.net_type):
                        if gw.nid != self.nid:
                            try:
                                lctl.disconnect(gw)
                            except CommandError, e:
                                print "disconnect failed: ", self.name
                                e.dump()
                                cleanup_error(e.rc)
    def safe_to_clean(self):
        return not is_network_prepared()

    def cleanup(self):
        self.info(self.net_type, self.nid, self.port)
        if self.port:
            stop_acceptor(self.port)
        if node_is_router():
            self.disconnect_peer_gateways()

    def correct_level(self, level, op=None):
        return level
class RouteTable(Module):
    def __init__(self,db):
        Module.__init__(self, 'ROUTES', db)

    def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
                         lo, hi):
        # only setup connections for tcp NALs
        srvdb = None
        if not net_type in ('tcp',):
            return None

        # connect to target if route is to single node and this node is the gw
        if lo == hi and local_interface(net_type, gw_cluster_id, gw):
            if not local_cluster(net_type, tgt_cluster_id):
                panic("target", lo, " not on the local cluster")
            srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
        # connect to gateway if this node is not the gw
        elif (local_cluster(net_type, gw_cluster_id)
              and not local_interface(net_type, gw_cluster_id, gw)):
            srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
        else:
            return None

        if not srvdb:
            panic("no server for nid", lo)
            return None

        return Network(srvdb)
    def prepare(self):
        if not config.record and is_network_prepared():
            return
        self.info()
        for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
            lctl.add_route(net_type, gw, lo, hi)
            srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
            if srv:
                lctl.connect(srv)
    def safe_to_clean(self):
        return not is_network_prepared()

    def cleanup(self):
        if is_network_prepared():
            # the network is still being used, don't clean it up
            return
        for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
            srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
            if srv:
                try:
                    lctl.disconnect(srv)
                except CommandError, e:
                    print "disconnect failed: ", self.name
                    e.dump()
                    cleanup_error(e.rc)

            try:
                lctl.del_route(net_type, gw, lo, hi)
            except CommandError, e:
                print "del_route failed: ", self.name
                e.dump()
                cleanup_error(e.rc)
class Management(Module):
    def __init__(self, db):
        Module.__init__(self, 'MGMT', db)
        self.add_lustre_module('lvfs', 'lvfs')
        self.add_lustre_module('obdclass', 'obdclass')
        self.add_lustre_module('ptlrpc', 'ptlrpc')
        self.add_lustre_module('mgmt', 'mgmt_svc')

    def prepare(self):
        if not config.record and is_prepared(self.name):
            return
        self.info()
        lctl.newdev("mgmt", self.name, self.uuid)

    def safe_to_clean(self):
        return 1

    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)

    def correct_level(self, level, op=None):
        return level
# This is only needed to load the modules; the LDLM device
# is now created automatically.
class LDLM(Module):
    def __init__(self,db):
        Module.__init__(self, 'LDLM', db)
        self.add_lustre_module('lvfs', 'lvfs')
        self.add_lustre_module('obdclass', 'obdclass')
        self.add_lustre_module('ptlrpc', 'ptlrpc')

    def prepare(self):
        return

    def cleanup(self):
        return

    def correct_level(self, level, op=None):
        return level
class LOV(Module):
    def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
        Module.__init__(self, 'LOV', db)
        if name_override != None:
            self.name = "lov_%s" % name_override
        self.add_lustre_module('lov', 'lov')
        self.mds_uuid = self.db.get_first_ref('mds')
        self.stripe_sz = self.db.get_val_int('stripesize', 1048576)
        self.stripe_off = self.db.get_val_int('stripeoffset', 0)
        self.pattern = self.db.get_val_int('stripepattern', 0)
        self.devlist = self.db.get_lov_tgts('lov_tgt')
        self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
        self.osclist = []
        self.desc_uuid = self.uuid
        self.uuid = generate_client_uuid(self.name)
        self.fs_name = fs_name
        if config_only:
            self.config_only = 1
            return
        self.config_only = None
        mds = self.db.lookup(self.mds_uuid)
        self.mds_name = mds.getName()
        for (obd_uuid, index, gen, active) in self.devlist:
            if obd_uuid == '':
                continue
            obd = self.db.lookup(obd_uuid)
            osc = get_osc(obd, self.uuid, fs_name)
            if osc:
                self.osclist.append((osc, index, gen, active))
            else:
                panic('osc not found:', obd_uuid)
    def prepare(self):
        if not config.record and is_prepared(self.name):
            return
        self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
                  self.stripe_off, self.pattern, self.devlist,
                  self.mds_name)
        lctl.lov_setup(self.name, self.uuid,
                       self.desc_uuid, self.mds_name, self.stripe_cnt,
                       self.stripe_sz, self.stripe_off, self.pattern)
        for (osc, index, gen, active) in self.osclist:
            target_uuid = osc.target_uuid
            try:
                # Only ignore connect failures with --force, which
                # isn't implemented here yet.
                osc.active = active
                osc.prepare(ignore_connect_failure=0)
            except CommandError, e:
                print "Error preparing OSC %s\n" % osc.uuid
                raise e
            lctl.lov_add_obd(self.name, self.uuid, target_uuid, index, gen)
    def cleanup(self):
        for (osc, index, gen, active) in self.osclist:
            target_uuid = osc.target_uuid
            osc.cleanup()
        if is_prepared(self.name):
            Module.cleanup(self)
        if self.config_only:
            panic("Can't clean up config_only LOV ", self.name)
    def load_module(self):
        if self.config_only:
            panic("Can't load modules for config_only LOV ", self.name)
        for (osc, index, gen, active) in self.osclist:
            osc.load_module()
            break
        Module.load_module(self)

    def cleanup_module(self):
        if self.config_only:
            panic("Can't cleanup modules for config_only LOV ", self.name)
        Module.cleanup_module(self)
        for (osc, index, gen, active) in self.osclist:
            if active:
                osc.cleanup_module()
            break

    def correct_level(self, level, op=None):
        return level
class LMV(Module):
    def __init__(self, db, uuid, fs_name, name_override = None):
        Module.__init__(self, 'LMV', db)
        if name_override != None:
            self.name = "lmv_%s" % name_override
        self.add_lustre_module('lmv', 'lmv')
        self.devlist = self.db.get_refs('mds')
        self.mdclist = []
        self.desc_uuid = self.uuid
        self.uuid = uuid
        self.fs_name = fs_name
        for mds_uuid in self.devlist:
            mds = self.db.lookup(mds_uuid)
            if not mds:
                panic("MDS not found!")
            mdc = MDC(mds, self.uuid, fs_name)
            if mdc:
                self.mdclist.append(mdc)
            else:
                panic('mdc not found:', mds_uuid)
    def prepare(self):
        if is_prepared(self.name):
            return
        for mdc in self.mdclist:
            try:
                # Only ignore connect failures with --force, which
                # isn't implemented here yet.
                mdc.prepare(ignore_connect_failure=0)
            except CommandError, e:
                print "Error preparing LMV %s\n" % mdc.uuid
                raise e
        lctl.lmv_setup(self.name, self.uuid, self.desc_uuid,
                       string.join(self.devlist))

    def cleanup(self):
        for mdc in self.mdclist:
            mdc.cleanup()
        if is_prepared(self.name):
            Module.cleanup(self)
    def load_module(self):
        for mdc in self.mdclist:
            mdc.load_module()
            break
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        for mdc in self.mdclist:
            mdc.cleanup_module()
            break

    def correct_level(self, level, op=None):
        return level
class MDSDEV(Module):
    def __init__(self,db):
        Module.__init__(self, 'MDSDEV', db)
        self.devpath = self.db.get_val('devpath','')
        self.backdevpath = self.db.get_val('backdevpath','')
        self.size = self.db.get_val_int('devsize', 0)
        self.journal_size = self.db.get_val_int('journalsize', 0)
        self.fstype = self.db.get_val('fstype', '')
        self.backfstype = self.db.get_val('backfstype', '')
        self.nspath = self.db.get_val('nspath', '')
        self.mkfsoptions = self.db.get_val('mkfsoptions', '')
        self.mountfsoptions = self.db.get_val('mountfsoptions', '')
        self.cachetype = self.db.get_val('cachetype', '')
        # overwrite the original MDSDEV name and uuid with the MDS name and uuid
        target_uuid = self.db.get_first_ref('target')
        mds = self.db.lookup(target_uuid)
        self.name = mds.getName()
        self.filesystem_uuids = mds.get_refs('filesystem')
        self.lmv_uuid = ''
        self.lmv = None
        self.master_mds = ""
        if not self.filesystem_uuids:
            self.lmv_uuid = self.db.get_first_ref('lmv')
            if not self.lmv_uuid:
                panic("ALERT: can't find lmv uuid")
            self.lmv = self.db.lookup(self.lmv_uuid)
            if self.lmv:
                self.filesystem_uuids = self.lmv.get_refs('filesystem')
                self.master_mds = self.lmv_uuid
        # FIXME: if fstype not set, then determine based on kernel version
        self.format = self.db.get_val('autoformat', "no")
        if mds.get_val('failover', 0):
            self.failover_mds = 'f'
        else:
            self.failover_mds = 'n'
        active_uuid = get_active_target(mds)
        if not active_uuid:
            panic("No target device found:", target_uuid)
        if active_uuid == self.uuid:
            self.active = 1
        else:
            self.active = 0
        if self.active and config.group and config.group != mds.get_val('group'):
            self.active = 0
        self.inode_size = self.db.get_val_int('inodesize', 0)
        if self.inode_size == 0:
            # find the LOV for this MDS
            lovconfig_uuid = mds.get_first_ref('lovconfig')
            if not lovconfig_uuid:
                if not self.lmv_uuid:
                    panic("No LOV found for lovconfig ", lovconfig.name)
                if not self.lmv:
                    panic("No LMV initialized and not lovconfig_uuid found")
                lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
                lovconfig = self.lmv.lookup(lovconfig_uuid)
                lov_uuid = lovconfig.get_first_ref('lov')
                if not lov_uuid:
                    panic("No LOV found for lovconfig ", lovconfig.name)
            else:
                lovconfig = mds.lookup(lovconfig_uuid)
                lov_uuid = lovconfig.get_first_ref('lov')
                if not lov_uuid:
                    panic("No LOV found for lovconfig ", lovconfig.name)
                if self.lmv:
                    lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
                    lovconfig = self.lmv.lookup(lovconfig_uuid)
                    lov_uuid = lovconfig.get_first_ref('lov')

            lov = LOV(self.db.lookup(lov_uuid), lov_uuid, 'FS_name', config_only = 1)

            # default stripe count controls default inode_size
            stripe_count = lov.stripe_cnt
            if stripe_count > 77:
                self.inode_size = 4096
            elif stripe_count > 35:
                self.inode_size = 2048
            elif stripe_count > 13:
                self.inode_size = 1024
            elif stripe_count > 3:
                self.inode_size = 512
            else:
                self.inode_size = 256
        self.target_dev_uuid = self.uuid
        self.uuid = target_uuid

        # setup LMV
        if self.master_mds:
            client_uuid = generate_client_uuid(self.name)
            client_uuid = self.name + "_lmv_" + "UUID"
            self.master = LMV(self.db.lookup(self.lmv_uuid), client_uuid, self.name, self.name)
            self.master_mds = self.master.name

        # modules
        self.add_lustre_module('mdc', 'mdc')
        self.add_lustre_module('osc', 'osc')
        self.add_lustre_module('lov', 'lov')
        self.add_lustre_module('lmv', 'lmv')
        self.add_lustre_module('ost', 'ost')
        self.add_lustre_module('mds', 'mds')

        if self.fstype == 'smfs':
            self.add_lustre_module('smfs', 'smfs')

        if self.fstype == 'ldiskfs':
            self.add_lustre_module('ldiskfs', 'ldiskfs')

        if self.fstype:
            self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))

        # if fstype is smfs, then we should also take care about backing
        # store fs.
        if self.fstype == 'smfs':
            self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.backfstype))
    def load_module(self):
        if self.active:
            Module.load_module(self)
    def prepare(self):
        if not config.record and is_prepared(self.name):
            return
        if not self.active:
            debug(self.uuid, "not active")
            return
        if config.reformat:
            # run write_conf automatically, if --reformat used
            self.write_conf()
        self.info(self.devpath, self.fstype, self.size, self.format)
        run_acceptors()
        # prepare LMV
        if self.master_mds:
            self.master.prepare()
        # never reformat here
        blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
                           self.format, self.journal_size, self.inode_size,
                           self.mkfsoptions, self.backfstype, self.backdevpath)

        if not is_prepared('MDT'):
            lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")

        mountfsoptions = def_mount_options(self.fstype, 'mds')

        if config.mountfsoptions:
            if mountfsoptions:
                mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
            else:
                mountfsoptions = config.mountfsoptions
            if self.mountfsoptions:
                mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
        else:
            if self.mountfsoptions:
                if mountfsoptions:
                    mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
                else:
                    mountfsoptions = self.mountfsoptions

        if self.fstype == 'smfs':
            realdev = self.fstype
            if mountfsoptions:
                mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
                                                        self.backfstype,
                                                        blkdev)
            else:
                mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
                                                     blkdev)
        else:
            realdev = blkdev

        print 'MDS mount options: ' + mountfsoptions

        if not self.master_mds:
            self.master_mds = 'dumb'
        if not self.cachetype:
            self.cachetype = 'dumb'
        try:
            lctl.newdev("mds", self.name, self.uuid,
                        setup ="%s %s %s %s %s %s" %(realdev, self.fstype,
                                                     self.name, mountfsoptions,
                                                     self.master_mds, self.cachetype))
        except CommandError, e:
            if e.rc == 2:
                panic("MDS is missing the config log. Need to run " +
                      "lconf --write_conf.")
            else:
                raise e
    def write_conf(self):
        do_cleanup = 0
        if not is_prepared(self.name):
            self.info(self.devpath, self.fstype, self.format)

            blkdev = block_dev(self.devpath, self.size, self.fstype,
                               config.reformat, self.format, self.journal_size,
                               self.inode_size, self.mkfsoptions,
                               self.backfstype, self.backdevpath)

            # Even for writing logs we mount mds with supplied mount options
            # because it will not mount smfs (if used) otherwise.

            mountfsoptions = def_mount_options(self.fstype, 'mds')

            if config.mountfsoptions:
                if mountfsoptions:
                    mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
                else:
                    mountfsoptions = config.mountfsoptions
                if self.mountfsoptions:
                    mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
            else:
                if self.mountfsoptions:
                    if mountfsoptions:
                        mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
                    else:
                        mountfsoptions = self.mountfsoptions

            if self.fstype == 'smfs':
                realdev = self.fstype
                if mountfsoptions:
                    mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
                                                            self.backfstype,
                                                            blkdev)
                else:
                    mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
                                                         blkdev)
            else:
                realdev = blkdev

            print 'MDS mount options: ' + mountfsoptions

            # As mount options are passed by the 4th param to the config tool,
            # we need to pass something in the 3rd param. But we do not want
            # that 3rd param to be counted as a profile name for reading the
            # log on MDS setup, so we pass a predefined token like 'dumb',
            # which is checked in the MDS code and skipped. There is probably
            # a nicer way, like passing an empty string and checking for it
            # in the config tool, then passing null as the 4th param.
            lctl.newdev("mds", self.name, self.uuid,
                        setup ="%s %s %s %s" %(realdev, self.fstype,
                                               'dumb', mountfsoptions))
            do_cleanup = 1
        # record logs for the MDS lov
        for uuid in self.filesystem_uuids:
            log("recording clients for filesystem:", uuid)
            fs = self.db.lookup(uuid)

            # this is ugly, should be organized nicely later.
            target_uuid = self.db.get_first_ref('target')
            mds = self.db.lookup(target_uuid)

            lovconfig_uuid = mds.get_first_ref('lovconfig')
            if lovconfig_uuid:
                lovconfig = mds.lookup(lovconfig_uuid)
                obd_uuid = lovconfig.get_first_ref('lov')
            else:
                obd_uuid = fs.get_first_ref('obd')

            client_uuid = generate_client_uuid(self.name)
            client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
                          self.name)

            lctl.clear_log(self.name, self.name)
            lctl.record(self.name, self.name)
            client.prepare()
            lctl.mount_option(self.name, client.get_name(), "")
            lctl.end_record()
            process_updates(self.db, self.name, self.name, client)

            lctl.clear_log(self.name, self.name + '-clean')
            lctl.record(self.name, self.name + '-clean')
            client.cleanup()
            lctl.del_mount_option(self.name)
            lctl.end_record()
            process_updates(self.db, self.name, self.name + '-clean', client)
        # record logs for each client
        if config.noexec:
            noexec_opt = '-n'
        else:
            noexec_opt = ''
        if config.ldapurl:
            config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
        else:
            config_options = CONFIG_FILE

        for node_db in self.db.lookup_class('node'):
            client_name = node_db.getName()
            for prof_uuid in node_db.get_refs('profile'):
                prof_db = node_db.lookup(prof_uuid)
                # refactor this into a function to test "clientness"
                # of a node.
                for ref_class, ref_uuid in prof_db.get_all_refs():
                    if ref_class in ('mountpoint','echoclient'):
                        debug("recording", client_name)
                        old_noexec = config.noexec
                        config.noexec = 0
                        ret, out = run (sys.argv[0], noexec_opt,
                                        " -v --record --nomod",
                                        "--record_log", client_name,
                                        "--record_device", self.name,
                                        "--node", client_name,
                                        config_options)
                        if config.verbose:
                            for s in out: log("record> ", string.strip(s))
                        ret, out = run (sys.argv[0], noexec_opt,
                                        "--cleanup -v --record --nomod",
                                        "--record_log", client_name + "-clean",
                                        "--record_device", self.name,
                                        "--node", client_name,
                                        config_options)
                        if config.verbose:
                            for s in out: log("record> ", string.strip(s))
                        config.noexec = old_noexec
        if do_cleanup:
            try:
                lctl.cleanup(self.name, self.uuid, 0, 0)
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
                e.dump()
                cleanup_error(e.rc)
                Module.cleanup(self)

            if self.fstype == 'smfs':
                clean_loop(self.backdevpath)
            else:
                clean_loop(self.devpath)
    def msd_remaining(self):
        out = lctl.device_list()
        for s in out:
            if string.split(s)[2] in ('mds',):
                return 1
        return 0

    def safe_to_clean(self):
        return self.active

    def safe_to_clean_modules(self):
        return not self.msd_remaining()
    def cleanup(self):
        if not self.active:
            debug(self.uuid, "not active")
            return
        self.info()
        if is_prepared(self.name):
            try:
                lctl.cleanup(self.name, self.uuid, config.force,
                             config.failover)
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
                e.dump()
                cleanup_error(e.rc)
                Module.cleanup(self)
        # cleanup LMV
        if self.master_mds:
            self.master.cleanup()
        if not self.msd_remaining() and is_prepared('MDT'):
            try:
                lctl.cleanup("MDT", "MDT_UUID", config.force,
                             config.failover)
            except CommandError, e:
                print "cleanup failed: ", self.name
                e.dump()
                cleanup_error(e.rc)

        if self.fstype == 'smfs':
            clean_loop(self.backdevpath)
        else:
            clean_loop(self.devpath)
    def correct_level(self, level, op=None):
        #if self.master_mds:
        #    level = level + 2
        return level
class OSD(Module):
    def __init__(self, db):
        Module.__init__(self, 'OSD', db)
        self.osdtype = self.db.get_val('osdtype')
        self.devpath = self.db.get_val('devpath', '')
        self.backdevpath = self.db.get_val('backdevpath', '')
        self.size = self.db.get_val_int('devsize', 0)
        self.journal_size = self.db.get_val_int('journalsize', 0)
        self.inode_size = self.db.get_val_int('inodesize', 0)
        self.mkfsoptions = self.db.get_val('mkfsoptions', '')
        self.mountfsoptions = self.db.get_val('mountfsoptions', '')
        self.fstype = self.db.get_val('fstype', '')
        self.backfstype = self.db.get_val('backfstype', '')
        self.nspath = self.db.get_val('nspath', '')
        target_uuid = self.db.get_first_ref('target')
        ost = self.db.lookup(target_uuid)
        self.name = ost.getName()
        self.format = self.db.get_val('autoformat', 'yes')
        if ost.get_val('failover', 0):
            self.failover_ost = 'f'
        else:
            self.failover_ost = 'n'

        active_uuid = get_active_target(ost)
        if not active_uuid:
            panic("No target device found:", target_uuid)
        if active_uuid == self.uuid:
            self.active = 1
        else:
            self.active = 0
        if self.active and config.group and config.group != ost.get_val('group'):
            self.active = 0

        self.target_dev_uuid = self.uuid
        self.uuid = target_uuid
        # modules
        self.add_lustre_module('ost', 'ost')
        if self.fstype == 'smfs':
            self.add_lustre_module('smfs', 'smfs')
        # FIXME: should we default to ext3 here?
        if self.fstype == 'ldiskfs':
            self.add_lustre_module('ldiskfs', 'ldiskfs')
        if self.fstype:
            self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
        if self.fstype == 'smfs':
            self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.backfstype))

        self.add_lustre_module(self.osdtype, self.osdtype)
    def load_module(self):
        if self.active:
            Module.load_module(self)
    # need to check /proc/mounts and /etc/mtab before
    # formatting anything.
    # FIXME: check if device is already formatted.
    def prepare(self):
        if is_prepared(self.name):
            return
        if not self.active:
            debug(self.uuid, "not active")
            return
        self.info(self.osdtype, self.devpath, self.size, self.fstype,
                  self.format, self.journal_size, self.inode_size)
        run_acceptors()
        if self.osdtype == 'obdecho':
            blkdev = ''
        else:
            blkdev = block_dev(self.devpath, self.size, self.fstype,
                               config.reformat, self.format, self.journal_size,
                               self.inode_size, self.mkfsoptions, self.backfstype,
                               self.backdevpath)

        mountfsoptions = def_mount_options(self.fstype, 'ost')

        if config.mountfsoptions:
            if mountfsoptions:
                mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
            else:
                mountfsoptions = config.mountfsoptions
            if self.mountfsoptions:
                mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
        else:
            if self.mountfsoptions:
                if mountfsoptions:
                    mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
                else:
                    mountfsoptions = self.mountfsoptions

        if self.fstype == 'smfs':
            realdev = self.fstype
            if mountfsoptions:
                mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
                                                        self.backfstype,
                                                        blkdev)
            else:
                mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
                                                     blkdev)
        else:
            realdev = blkdev

        print 'OSD mount options: ' + mountfsoptions

        lctl.newdev(self.osdtype, self.name, self.uuid,
                    setup ="%s %s %s %s" %(realdev, self.fstype,
                                           self.failover_ost,
                                           mountfsoptions))
        if not is_prepared('OSS'):
            lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
    def osd_remaining(self):
        out = lctl.device_list()
        for s in out:
            if string.split(s)[2] in ('obdfilter', 'obdecho'):
                return 1
        return 0

    def safe_to_clean(self):
        return self.active

    def safe_to_clean_modules(self):
        return not self.osd_remaining()
    def cleanup(self):
        if not self.active:
            debug(self.uuid, "not active")
            return
        if is_prepared(self.name):
            self.info()
            try:
                lctl.cleanup(self.name, self.uuid, config.force,
                             config.failover)
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
                e.dump()
                cleanup_error(e.rc)
        if not self.osd_remaining() and is_prepared('OSS'):
            try:
                lctl.cleanup("OSS", "OSS_UUID", config.force,
                             config.failover)
            except CommandError, e:
                print "cleanup failed: ", self.name
                e.dump()
                cleanup_error(e.rc)
        if not self.osdtype == 'obdecho':
            if self.fstype == 'smfs':
                clean_loop(self.backdevpath)
            else:
                clean_loop(self.devpath)

    def correct_level(self, level, op=None):
        return level
def mgmt_uuid_for_fs(mtpt_name):
    if not mtpt_name:
        return ''
    mtpt_db = toplustreDB.lookup_name(mtpt_name)
    fs_uuid = mtpt_db.get_first_ref('filesystem')
    fs = toplustreDB.lookup(fs_uuid)
    if not fs:
        return ''
    return fs.get_first_ref('mgmt')
# Generic client module, used by OSC and MDC
class Client(Module):
    def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
                 module_dir=None):
        self.target_name = tgtdb.getName()
        self.target_uuid = tgtdb.getUUID()
        self.db = tgtdb
        self.active = 1

        self.tgt_dev_uuid = get_active_target(tgtdb)
        if not self.tgt_dev_uuid:
            panic("No target device found for target(1):", self.target_name)

        self.kmod = kmod(config.lustre, config.portals)

        self.module = module
        self.module_name = string.upper(module)
        if not self_name:
            self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
                                         self.target_name, fs_name)
        else:
            self.name = self_name
        self.uuid = uuid

        self.lookup_server(self.tgt_dev_uuid)
        mgmt_uuid = mgmt_uuid_for_fs(fs_name)
        if mgmt_uuid:
            self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
        else:
            self.mgmt_name = ''
        self.fs_name = fs_name
        if not module_dir:
            module_dir = module
        self.add_lustre_module(module_dir, module)
    def lookup_server(self, srv_uuid):
        """ Lookup a server's network information """
        self._server_nets = get_ost_net(self.db, srv_uuid)
        if len(self._server_nets) == 0:
            panic ("Unable to find a server for:", srv_uuid)

    def get_servers(self):
        return self._server_nets
    def prepare(self, ignore_connect_failure = 0):
        self.info(self.target_uuid)
        if not config.record and is_prepared(self.name):
            return
        try:
            srv = choose_local_server(self.get_servers())
            if srv:
                lctl.connect(srv)
            else:
                routes = find_route(self.get_servers())
                if len(routes) == 0:
                    panic ("no route to", self.target_uuid)
                for (srv, r) in routes:
                    lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
        except CommandError, e:
            if not ignore_connect_failure:
                raise e
        if srv:
            if self.permits_inactive() and (self.target_uuid in config.inactive or self.active == 0):
                debug("%s inactive" % self.target_uuid)
                inactive_p = "inactive"
            else:
                debug("%s active" % self.target_uuid)
                inactive_p = ""
            lctl.newdev(self.module, self.name, self.uuid,
                        setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
                                                inactive_p, self.mgmt_name))
    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
            try:
                srv = choose_local_server(self.get_servers())
                if srv:
                    lctl.disconnect(srv)
                else:
                    for (srv, r) in find_route(self.get_servers()):
                        lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
                e.dump()
                cleanup_error(e.rc)
    def correct_level(self, level, op=None):
        return level

    def deactivate(self):
        try:
            lctl.deactivate(self.name)
        except CommandError, e:
            log(self.module_name, "deactivate failed: ", self.name)
            e.dump()
            cleanup_error(e.rc)
class MDC(Client):
    def __init__(self, db, uuid, fs_name):
        Client.__init__(self, db, uuid, 'mdc', fs_name)

    def permits_inactive(self):
        return 0

class OSC(Client):
    def __init__(self, db, uuid, fs_name):
        Client.__init__(self, db, uuid, 'osc', fs_name)

    def permits_inactive(self):
        return 1
def mgmtcli_name_for_uuid(uuid):
    return 'MGMTCLI_%s' % uuid

class ManagementClient(Client):
    def __init__(self, db, uuid):
        Client.__init__(self, db, uuid, 'mgmt_cli', '',
                        self_name = mgmtcli_name_for_uuid(db.getUUID()),
                        module_dir = 'mgmt')
class COBD(Module):
    def __init__(self, db, uuid, name, type, name_override = None):
        Module.__init__(self, 'COBD', db)
        self.name = self.db.getName()
        self.uuid = generate_client_uuid(self.name)
        self.real_uuid = self.db.get_first_ref('realobd')
        self.cache_uuid = self.db.get_first_ref('cacheobd')
        self.add_lustre_module('cobd', 'cobd')
        real_obd = self.db.lookup(self.real_uuid)
        if not real_obd:
            panic('real obd not found:', self.real_uuid)
        cache_obd = self.db.lookup(self.cache_uuid)
        if not cache_obd:
            panic('cache obd not found:', self.cache_uuid)
        if type == 'obd':
            self.real = LOV(real_obd, self.real_uuid, name,
                            "%s_real" % (self.name))
            self.cache = LOV(cache_obd, self.cache_uuid, name,
                             "%s_cache" % (self.name))
        else:
            self.real = get_mdc(db, uuid, name, self.real_uuid)
            self.cache = get_mdc(db, uuid, name, self.cache_uuid)
    # need to check /proc/mounts and /etc/mtab before
    # formatting anything.
    # FIXME: check if device is already formatted.
    def get_uuid(self):
        return self.uuid

    def get_name(self):
        return self.name

    def get_real_name(self):
        return self.real.name

    def get_cache_name(self):
        return self.cache.name
    def prepare(self):
        self.real.prepare()
        self.cache.prepare()
        if not config.record and is_prepared(self.name):
            return
        self.info(self.real_uuid, self.cache_uuid)
        lctl.newdev("cobd", self.name, self.uuid,
                    setup ="%s %s" %(self.real.name,
                                     self.cache.name))

    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
        self.real.cleanup()
        self.cache.cleanup()

    def load_module(self):
        self.real.load_module()
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        self.real.cleanup_module()
# virtual interface for OSC and LOV
class VOSC(Module):
    def __init__(self, db, client_uuid, name, name_override = None):
        Module.__init__(self, 'VOSC', db)
        if db.get_class() == 'lov':
            self.osc = LOV(db, client_uuid, name, name_override)
        elif db.get_class() == 'cobd':
            self.osc = COBD(db, client_uuid, name, 'obd')
        else:
            self.osc = OSC(db, client_uuid, name)

    def get_uuid(self):
        return self.osc.get_uuid()

    def get_name(self):
        return self.osc.get_name()

    def prepare(self):
        self.osc.prepare()

    def cleanup(self):
        self.osc.cleanup()

    def load_module(self):
        self.osc.load_module()

    def cleanup_module(self):
        self.osc.cleanup_module()

    def correct_level(self, level, op=None):
        return self.osc.correct_level(level, op)
# virtual interface for MDC and LMV
class VMDC(Module):
    def __init__(self, db, client_uuid, name, name_override = None):
        Module.__init__(self, 'VMDC', db)
        if db.get_class() == 'lmv':
            self.mdc = LMV(db, client_uuid, name)
        elif db.get_class() == 'cobd':
            self.mdc = COBD(db, client_uuid, name, 'mds')
        else:
            self.mdc = MDC(db, client_uuid, name)

    def get_uuid(self):
        return self.mdc.uuid

    def get_name(self):
        return self.mdc.name

    def prepare(self):
        self.mdc.prepare()

    def cleanup(self):
        self.mdc.cleanup()

    def load_module(self):
        self.mdc.load_module()

    def cleanup_module(self):
        self.mdc.cleanup_module()

    def correct_level(self, level, op=None):
        return self.mdc.correct_level(level, op)
class ECHO_CLIENT(Module):
    def __init__(self,db):
        Module.__init__(self, 'ECHO_CLIENT', db)
        self.add_lustre_module('obdecho', 'obdecho')
        self.obd_uuid = self.db.get_first_ref('obd')
        obd = self.db.lookup(self.obd_uuid)
        self.uuid = generate_client_uuid(self.name)
        self.osc = VOSC(obd, self.uuid, self.name)

    def prepare(self):
        if not config.record and is_prepared(self.name):
            return
        run_acceptors()
        self.osc.prepare() # XXX This is so cheating. -p
        self.info(self.obd_uuid)

        lctl.newdev("echo_client", self.name, self.uuid,
                    setup = self.osc.get_name())

    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
        self.osc.cleanup()

    def load_module(self):
        self.osc.load_module()
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        self.osc.cleanup_module()

    def correct_level(self, level, op=None):
        return level
def generate_client_uuid(name):
    client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
                                           name,
                                           int(random.random() * 1048576),
                                           int(random.random() * 1048576))
    return client_uuid[:36]
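
# For example, generate_client_uuid('mds1') yields something like
# '0a1b2_mds1_00c3d04e5f' (the hex values vary per run); the name part is
# capped at 19 characters by %.19s and the whole string at 36 characters,
# the maximum UUID field length.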
class Mountpoint(Module):
    def __init__(self,db):
        Module.__init__(self, 'MTPT', db)
        self.path = self.db.get_val('path')
        self.fs_uuid = self.db.get_first_ref('filesystem')
        fs = self.db.lookup(self.fs_uuid)
        self.mds_uuid = fs.get_first_ref('lmv')
        if not self.mds_uuid:
            self.mds_uuid = fs.get_first_ref('mds')
        self.obd_uuid = fs.get_first_ref('obd')
        self.mgmt_uuid = fs.get_first_ref('mgmt')
        client_uuid = generate_client_uuid(self.name)

        ost = self.db.lookup(self.obd_uuid)
        if not ost:
            panic("no ost: ", self.obd_uuid)

        mds = self.db.lookup(self.mds_uuid)
        if not mds:
            panic("no mds: ", self.mds_uuid)

        self.add_lustre_module('mdc', 'mdc')
        self.add_lustre_module('lmv', 'lmv')
        self.add_lustre_module('llite', 'llite')

        self.vosc = VOSC(ost, client_uuid, self.name)
        self.vmdc = VMDC(mds, client_uuid, self.name)

        if self.mgmt_uuid:
            self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
                                            client_uuid)
        else:
            self.mgmtcli = None
    def prepare(self):
        if not config.record and fs_is_mounted(self.path):
            log(self.path, "already mounted.")
            return
        run_acceptors()
        if self.mgmtcli:
            self.mgmtcli.prepare()
        self.vosc.prepare()
        self.vmdc.prepare()
        vmdc_name = self.vmdc.get_name()

        self.info(self.path, self.mds_uuid, self.obd_uuid)
        if config.record or config.lctl_dump:
            lctl.mount_option(local_node_name, self.vosc.get_name(), vmdc_name)
            return
        cmd = "mount -t lustre_lite -o osc=%s,mdc=%s %s %s" % \
              (self.vosc.get_name(), vmdc_name, config.config, self.path)
        run("mkdir", self.path)
        ret, val = run(cmd)
        if ret:
            self.cleanup()
            panic("mount failed:", self.path, ":", string.join(val))
    def cleanup(self):
        self.info(self.path, self.mds_uuid,self.obd_uuid)

        if config.record or config.lctl_dump:
            lctl.del_mount_option(local_node_name)
        else:
            if fs_is_mounted(self.path):
                if config.force:
                    (rc, out) = run("umount", "-f", self.path)
                else:
                    (rc, out) = run("umount", self.path)
                if rc:
                    raise CommandError('umount', out, rc)

            if fs_is_mounted(self.path):
                panic("fs is still mounted:", self.path)

        self.vmdc.cleanup()
        self.vosc.cleanup()
        if self.mgmtcli:
            self.mgmtcli.cleanup()
    def load_module(self):
        if self.mgmtcli:
            self.mgmtcli.load_module()
        self.vosc.load_module()
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        self.vosc.cleanup_module()
        if self.mgmtcli:
            self.mgmtcli.cleanup_module()

    def correct_level(self, level, op=None):
        return level
# ============================================================
# misc query functions

def get_ost_net(self, osd_uuid):
    srv_list = []
    if not osd_uuid:
        return srv_list
    osd = self.lookup(osd_uuid)
    node_uuid = osd.get_first_ref('node')
    node = self.lookup(node_uuid)
    if not node:
        panic("unable to find node for osd_uuid:", osd_uuid,
              " node_ref:", node_uuid)
    for net_uuid in node.get_networks():
        db = node.lookup(net_uuid)
        srv_list.append(Network(db))
    return srv_list
# the order of initialization is based on level.
def getServiceLevel(self):
    type = self.get_class()
    ret = 0
    if type in ('network',):
        ret = 5
    elif type in ('routetbl',):
        ret = 6
    elif type in ('ldlm',):
        ret = 20
    elif type in ('mgmt',):
        ret = 25
    elif type in ('osd', 'cobd'):
        ret = 30
    elif type in ('mdsdev',):
        ret = 40
    elif type in ('lmv',):
        ret = 45
    elif type in ('mountpoint', 'echoclient'):
        ret = 70
    else:
        panic("Unknown type: ", type)

    if ret < config.minlevel or ret > config.maxlevel:
        ret = 0
    return ret
#
# return list of services in a profile. list is a list of tuples
# [(level, db_object),]
def getServices(self):
    list = []
    for ref_class, ref_uuid in self.get_all_refs():
        servdb = self.lookup(ref_uuid)
        if servdb:
            level = getServiceLevel(servdb)
            if level > 0:
                list.append((level, servdb))
        else:
            panic('service not found: ' + ref_uuid)

    list.sort()
    return list
############################################################
# FIXME: clean this mess up!
#
# OSC is no longer in the xml, so we have to fake it.
# this is getting ugly and begging for another refactoring
def get_osc(ost_db, uuid, fs_name):
    osc = OSC(ost_db, uuid, fs_name)
    return osc

def get_mdc(db, uuid, fs_name, mds_uuid):
    mds_db = db.lookup(mds_uuid)
    if not mds_db:
        error("no mds:", mds_uuid)
    mdc = MDC(mds_db, mds_uuid, fs_name)
    return mdc
2484 ############################################################
2485 # routing ("rooting")
2486 # list of (nettype, cluster_id, nid)
2489 def find_local_clusters(node_db):
2490 global local_clusters
2491 for netuuid in node_db.get_networks():
2492 net = node_db.lookup(netuuid)
2494 debug("add_local", netuuid)
2495 local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
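# e.g. a node with a single tcp interface in cluster 0 contributes a
# tuple like ('tcp', 0, <its nid>) here (values illustrative)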
2497 if acceptors.has_key(srv.port):
2498 panic("duplicate port:", srv.port)
2499 acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
2500 srv.send_mem, srv.recv_mem,
2503 # This node is a gateway.
2505 def node_is_router():
2508 # If there are any routers found in the config, then this will be true
2509 # and all nodes will load kptlrouter.
2511 def node_needs_router():
2512 return needs_router or is_router
2514 # list of (nettype, gw, tgt_cluster_id, lo, hi)
2515 # Currently, these local routes are only added to kptlrouter route
2516 # table if they are needed to connect to a specific server. This
2517 # should be changed so all available routes are loaded, and the
2518 # ptlrouter can make all the decisions.
2521 def find_local_routes(lustre):
2522 """ Scan the lustre config looking for routers . Build list of
2524 global local_routes, needs_router
2526 list = lustre.lookup_class('node')
2528 if router.get_val_int('router', 0):
2530 for (local_type, local_cluster_id, local_nid) in local_clusters:
2532 for netuuid in router.get_networks():
2533 db = router.lookup(netuuid)
2534 if (local_type == db.get_val('nettype') and
2535 local_cluster_id == db.get_val('clusterid')):
2536 gw = db.get_val('nid')
2539 debug("find_local_routes: gw is", gw)
2540 for route in router.get_local_routes(local_type, gw):
2541 local_routes.append(route)
2542 debug("find_local_routes:", local_routes)
2545 def choose_local_server(srv_list):
2546 for srv in srv_list:
2547 if local_cluster(srv.net_type, srv.cluster_id):
2550 def local_cluster(net_type, cluster_id):
2551 for cluster in local_clusters:
2552 if net_type == cluster[0] and cluster_id == cluster[1]:
2556 def local_interface(net_type, cluster_id, nid):
2557 for cluster in local_clusters:
2558 if (net_type == cluster[0] and cluster_id == cluster[1]
2559 and nid == cluster[2]):
2563 def find_route(srv_list):
2565 frm_type = local_clusters[0][0]
2566 for srv in srv_list:
2567 debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
2568 to_type = srv.net_type
2570 cluster_id = srv.cluster_id
2571 debug ('looking for route to', to_type, to)
2572 for r in local_routes:
2573 debug("find_route: ", r)
2574 if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
2575 result.append((srv, r))
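# A self-checking sketch of the matching rule above; the route tuple and
# target nid are invented for illustration.
def _example_route_match():
    r = ('tcp', 'gw_nid', 0, 4, 8)   # (nettype, gw, tgt_cluster_id, lo, hi)
    to, cluster_id = 6, 0            # hypothetical target nid and cluster
    return (r[3] <= to and to <= r[4]) and cluster_id == r[2]   # true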
2578 def get_active_target(db):
2579 target_uuid = db.getUUID()
2580 target_name = db.getName()
2581 node_name = get_select(target_name)
2583 tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
2585 tgt_dev_uuid = db.get_first_ref('active')
2588 def get_server_by_nid_uuid(db, nid_uuid):
2589 for n in db.lookup_class("network"):
2591 if net.nid_uuid == nid_uuid:
2595 ############################################################
2599 type = db.get_class()
2600 debug('Service:', type, db.getName(), db.getUUID())
2605 n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
2606 elif type == 'network':
2608 elif type == 'routetbl':
2612 elif type == 'cobd':
2613 n = COBD(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
2614 elif type == 'mdsdev':
2616 elif type == 'mountpoint':
2618 elif type == 'echoclient':
2620 elif type == 'mgmt':
2625 panic ("unknown service type:", type)
2629 # Prepare the system to run lustre using a particular profile
2630 # in the configuration.
2631 # * load the modules
2632 # * setup networking for the current node
2633 # * make sure partitions are in place and prepared
2634 # * initialize devices with lctl
2635 # Levels are important, and need to be enforced.
2636 def for_each_profile(db, prof_list, operation):
2637 for prof_uuid in prof_list:
2638 prof_db = db.lookup(prof_uuid)
2640 panic("profile:", profile, "not found.")
2641 services = getServices(prof_db)
2644 def magic_get_osc(db, rec, lov):
2646 lov_uuid = lov.get_uuid()
2647 lov_name = lov.osc.fs_name
2649 lov_uuid = rec.getAttribute('lov_uuidref')
2650 # FIXME: better way to find the mountpoint?
2651 filesystems = db.root_node.getElementsByTagName('filesystem')
2653 for fs in filesystems:
2654 ref = fs.getElementsByTagName('obd_ref')
2655 if ref[0].getAttribute('uuidref') == lov_uuid:
2656 fsuuid = fs.getAttribute('uuid')
2660 panic("malformed xml: lov uuid '" + lov_uuid + "' referenced in 'add' record is not used by any filesystems.")
2662 mtpts = db.root_node.getElementsByTagName('mountpoint')
2665 ref = fs.getElementsByTagName('filesystem_ref')
2666 if ref[0].getAttribute('uuidref') == fsuuid:
2667 lov_name = fs.getAttribute('name')
2671 panic("malformed xml: 'add' record references lov uuid '" + lov_uuid + "', which references filesystem uuid '" + fsuuid + "', which does not reference a mountpoint.")
2673 print "lov_uuid: " + lov_uuid + "; lov_name: " + lov_name
2675 ost_uuid = rec.getAttribute('ost_uuidref')
2676 obd = db.lookup(ost_uuid)
2679 panic("malformed xml: 'add' record references ost uuid '" + ost_uuid + "' which cannot be found.")
2681 osc = get_osc(obd, lov_uuid, lov_name)
2683 panic('osc not found:', ost_uuid)
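# For reference, the chain walked above: the record names a lov uuid; some
# <filesystem> must reference that lov via an <obd_ref>, and some
# <mountpoint> must reference that filesystem via a <filesystem_ref>.
# Anything else is malformed xml, and we panic.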
2686 # write logs for update records. sadly, logs of all types -- and updates in
2687 # particular -- are something of an afterthought. lconf needs to be rewritten
2688 # with these as core concepts. so this is a pretty big hack.
2689 def process_update_record(db, update, lov):
2690 for rec in update.childNodes:
2691 if rec.nodeType != rec.ELEMENT_NODE:
2694 log("found "+rec.nodeName+" record in update version " +
2695 str(update.getAttribute('version')))
2697 lov_uuid = rec.getAttribute('lov_uuidref')
2698 ost_uuid = rec.getAttribute('ost_uuidref')
2699 index = rec.getAttribute('index')
2700 gen = rec.getAttribute('generation')
2702 if not lov_uuid or not ost_uuid or not index or not gen:
2703 panic("malformed xml: 'update' record requires lov_uuid, ost_uuid, index, and generation.")
2706 tmplov = db.lookup(lov_uuid)
2708 panic("malformed xml: 'delete' record contains lov UUID '" + lov_uuid + "', which cannot be located.")
2709 lov_name = tmplov.getName()
2711 lov_name = lov.osc.name
2713 # ------------------------------------------------------------- add
2714 if rec.nodeName == 'add':
2716 lctl.lov_del_obd(lov_name, lov_uuid, ost_uuid, index, gen)
2719 osc = magic_get_osc(db, rec, lov)
2722 # Only ignore connect failures with --force, which
2723 # isn't implemented here yet.
2724 osc.prepare(ignore_connect_failure=0)
2725 except CommandError, e:
2726 print "Error preparing OSC %s\n" % osc.uuid
2729 lctl.lov_add_obd(lov_name, lov_uuid, ost_uuid, index, gen)
2731 # ------------------------------------------------------ deactivate
2732 elif rec.nodeName == 'deactivate':
2736 osc = magic_get_osc(db, rec, lov)
2740 except CommandError, e:
2741 print "Error deactivating OSC %s\n" % osc.uuid
2744 # ---------------------------------------------------------- delete
2745 elif rec.nodeName == 'delete':
2749 osc = magic_get_osc(db, rec, lov)
2755 except CommandError, e:
2756 print "Error cleaning up OSC %s\n" % osc.uuid
2759 lctl.lov_del_obd(lov_name, lov_uuid, ost_uuid, index, gen)
2761 def process_updates(db, log_device, log_name, lov = None):
2762 updates = db.root_node.getElementsByTagName('update')
2764 if not u.childNodes:
2765 log("ignoring empty update record (version " +
2766 str(u.getAttribute('version')) + ")")
2769 version = u.getAttribute('version')
2770 real_name = "%s-%s" % (log_name, version)
2771 lctl.clear_log(log_device, real_name)
2772 lctl.record(log_device, real_name)
2774 process_update_record(db, u, lov)
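# A tiny self-contained sketch of the record shape process_update_record()
# expects; the uuids and values are invented for illustration.
def _example_update_record():
    doc = xml.dom.minidom.parseString(
        '<update version="2">'
        '<add lov_uuidref="lov1_UUID" ost_uuidref="ost3_UUID"'
        ' index="2" generation="3"/>'
        '</update>')
    rec = doc.documentElement.firstChild
    return (rec.nodeName, rec.getAttribute('lov_uuidref'),
            rec.getAttribute('index'), rec.getAttribute('generation'))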
2778 def doWriteconf(services):
2782 if s[1].get_class() == 'mdsdev':
2783 n = newService(s[1])
2786 def doSetup(services):
2791 n = newService(s[1])
2793 slist.append((n.level, n))
2796 nl = n[1].correct_level(n[0])
2797 nlist.append((nl, n[1]))
2802 def doModules(services):
2806 n = newService(s[1])
2809 def doCleanup(services):
2814 n = newService(s[1])
2816 slist.append((n.level, n))
2819 nl = n[1].correct_level(n[0])
2820 nlist.append((nl, n[1]))
2824 if n[1].safe_to_clean():
2827 def doUnloadModules(services):
2832 n = newService(s[1])
2833 if n.safe_to_clean_modules():
2838 def doHost(lustreDB, hosts):
2839 global is_router, local_node_name
2842 node_db = lustreDB.lookup_name(h, 'node')
2846 panic('No host entry found.')
2848 local_node_name = node_db.get_val('name', 0)
2849 is_router = node_db.get_val_int('router', 0)
2850 lustre_upcall = node_db.get_val('lustreUpcall', '')
2851 portals_upcall = node_db.get_val('portalsUpcall', '')
2852 timeout = node_db.get_val_int('timeout', 0)
2853 ptldebug = node_db.get_val('ptldebug', '')
2854 subsystem = node_db.get_val('subsystem', '')
2856 find_local_clusters(node_db)
2858 find_local_routes(lustreDB)
2860 # Two step process: (1) load modules, (2) setup lustre
2861 # if not cleaning, load modules first.
2862 prof_list = node_db.get_refs('profile')
2864 if config.write_conf:
2866 for_each_profile(node_db, prof_list, doModules)
2868 for_each_profile(node_db, prof_list, doWriteconf)
2869 for_each_profile(node_db, prof_list, doUnloadModules)
2871 elif config.recover:
2872 if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
2873 raise Lustre.LconfError( "--recover requires --tgt_uuid <UUID> " +
2874 "--client_uuid <UUID> --conn_uuid <UUID>")
2875 doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
2877 elif config.cleanup:
2879 # the command line can override this value
2881 # ugly hack, only need to run lctl commands for --dump
2882 if config.lctl_dump or config.record:
2883 for_each_profile(node_db, prof_list, doCleanup)
2886 sys_set_timeout(timeout)
2887 sys_set_ptldebug(ptldebug)
2888 sys_set_subsystem(subsystem)
2889 sys_set_lustre_upcall(lustre_upcall)
2890 sys_set_portals_upcall(portals_upcall)
2892 for_each_profile(node_db, prof_list, doCleanup)
2893 for_each_profile(node_db, prof_list, doUnloadModules)
2897 # ugly hack, only need to run lctl commands for --dump
2898 if config.lctl_dump or config.record:
2899 sys_set_timeout(timeout)
2900 sys_set_lustre_upcall(lustre_upcall)
2901 for_each_profile(node_db, prof_list, doSetup)
2905 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
2906 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
2908 for_each_profile(node_db, prof_list, doModules)
2910 sys_set_debug_path()
2911 sys_set_ptldebug(ptldebug)
2912 sys_set_subsystem(subsystem)
2913 script = config.gdb_script
2914 run(lctl.lctl, ' modules >', script)
2916 log ("The GDB module script is in", script)
2917 # pause, so user has time to break and
2920 sys_set_timeout(timeout)
2921 sys_set_lustre_upcall(lustre_upcall)
2922 sys_set_portals_upcall(portals_upcall)
2924 for_each_profile(node_db, prof_list, doSetup)
2927 def doRecovery(lustreDB, lctl, tgt_uuid, client_uuid, nid_uuid):
2928 tgt = lustreDB.lookup(tgt_uuid)
2930 raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
2931 new_uuid = get_active_target(tgt)
2933 raise Lustre.LconfError("doRecovery: no active target found for: " +
2935 net = choose_local_server(get_ost_net(lustreDB, new_uuid))
2937 raise Lustre.LconfError("Unable to find a connection to: " + new_uuid)
2939 log("Reconnecting", tgt_uuid, " to ", net.nid_uuid)
2941 oldnet = get_server_by_nid_uuid(lustreDB, nid_uuid)
2944 lctl.disconnect(oldnet)
2945 except CommandError, e:
2946 log("recover: disconnect", nid_uuid, "failed: ")
2951 except CommandError, e:
2952 log("recover: connect failed")
2955 lctl.recover(client_uuid, net.nid_uuid)
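# In short: recovery looks up the failed target, re-resolves which server is
# now active for it, picks a reachable address, swaps the old connection for
# the new one, and finally asks lctl to recover the client onto the new nid.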
2958 def setupModulePath(cmd, portals_dir = PORTALS_DIR):
2959 base = os.path.dirname(cmd)
2960 if development_mode():
2961 if not config.lustre:
2962 debug('using objdir module paths')
2963 config.lustre = (os.path.join(base, ".."))
2964 # normalize the portals dir, using command line arg if set
2966 portals_dir = config.portals
2967 dir = os.path.join(config.lustre, portals_dir)
2968 config.portals = dir
2969 debug('config.portals', config.portals)
2970 elif config.lustre and config.portals:
2972 # if --lustre and --portals, normalize portals
2973 # can ignore PORTALS_DIR here, since it is probably useless here
2974 config.portals = os.path.join(config.lustre, config.portals)
2975 debug('config.portals B', config.portals)
2977 def sysctl(path, val):
2978 debug("+ sysctl", path, val)
2982 fp = open(os.path.join('/proc/sys', path), 'w')
2989 def sys_set_debug_path():
2990 sysctl('portals/debug_path', config.debug_path)
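# sysctl() above is the moral equivalent of "echo <val> > /proc/sys/<path>",
# so this call writes config.debug_path into /proc/sys/portals/debug_path.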
2992 def sys_set_lustre_upcall(upcall):
2993 # the command line overrides the value in the node config
2994 if config.lustre_upcall:
2995 upcall = config.lustre_upcall
2997 upcall = config.upcall
2999 lctl.set_lustre_upcall(upcall)
3001 def sys_set_portals_upcall(upcall):
3002 # the command line overrides the value in the node config
3003 if config.portals_upcall:
3004 upcall = config.portals_upcall
3006 upcall = config.upcall
3008 sysctl('portals/upcall', upcall)
3010 def sys_set_timeout(timeout):
3011 # the command line overrides the value in the node config
3012 if config.timeout and config.timeout > 0:
3013 timeout = config.timeout
3014 if timeout is not None and timeout > 0:
3015 lctl.set_timeout(timeout)
3017 def sys_tweak_socknal ():
3018 if config.single_socket:
3019 sysctl("socknal/typed", 0)
3021 def sys_optimize_elan ():
3022 procfiles = ["/proc/elan/config/eventint_punt_loops",
3023 "/proc/qsnet/elan3/config/eventint_punt_loops",
3024 "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]
3026 if os.access(p, os.R_OK):
3027 run ("echo 1 > " + p)
3029 def sys_set_ptldebug(ptldebug):
3031 ptldebug = config.ptldebug
3034 val = eval(ptldebug, ptldebug_names)
3035 val = "0x%x" % (val)
3036 sysctl('portals/debug', val)
3037 except NameError, e:
3040 def sys_set_subsystem(subsystem):
3041 if config.subsystem:
3042 subsystem = config.subsystem
3045 val = eval(subsystem, subsystem_names)
3046 val = "0x%x" % (val)
3047 sysctl('portals/subsystem_debug', val)
3048 except NameError, e:
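# How the symbolic masks resolve: eval() runs the expression with
# ptldebug_names (or subsystem_names) as its namespace, so "a|b" becomes
# the OR of the corresponding bits. A sketch with a made-up name table:
def _example_debug_mask():
    names = { "trace": (1 << 0), "inode": (1 << 1), "super": (1 << 2) }
    val = eval("inode|super", names)   # unknown names raise NameError
    return "0x%x" % val                # -> '0x6'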
3051 def sys_set_netmem_max(path, max):
3052 debug("setting", path, "to at least", max)
3060 fp = open(path, 'w')
3061 fp.write('%d\n' %(max))
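# A hedged sketch of the "at least" semantics the debug message above
# advertises (the original read-back logic is elided here); the idea is to
# only ever raise the limit, never lower it.
def _example_raise_netmem(path, floor):
    try:
        cur = int(open(path).readline())
    except (IOError, ValueError):
        return
    if cur < floor:
        fp = open(path, 'w')
        fp.write('%d\n' % floor)
        fp.close()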
3065 def sys_make_devices():
3066 if not os.access('/dev/portals', os.R_OK):
3067 run('mknod /dev/portals c 10 240')
3068 if not os.access('/dev/obd', os.R_OK):
3069 run('mknod /dev/obd c 10 241')
3072 # Add dir to the global PATH, if not already there.
3073 def add_to_path(new_dir):
3074 syspath = string.split(os.environ['PATH'], ':')
3075 if new_dir in syspath:
3077 os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
3079 def default_debug_path():
3080 path = '/tmp/lustre-log'
3081 if os.path.isdir('/r'):
3086 def default_gdb_script():
3087 script = '/tmp/ogdb'
3088 if os.path.isdir('/r'):
3089 return '/r' + script
3094 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
3095 # ensure basic elements are in the system path
3096 def sanitise_path():
3097 for dir in DEFAULT_PATH:
3100 # global hack for the --select handling
3102 def init_select(args):
3103 # args = [service=nodeA,service2=nodeB service3=nodeC]
3106 list = string.split(arg, ',')
3108 srv, node = string.split(entry, '=')
3109 tgt_select[srv] = node
3111 def get_select(srv):
3112 if tgt_select.has_key(srv):
3113 return tgt_select[srv]
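# Example of the select plumbing: init_select(["mds1=nodeA,ost1=nodeB"])
# leaves tgt_select == {'mds1': 'nodeA', 'ost1': 'nodeB'}, so
# get_select('mds1') answers 'nodeA' when choosing the node that should
# run a failover service.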
3117 FLAG = Lustre.Options.FLAG
3118 PARAM = Lustre.Options.PARAM
3119 INTPARAM = Lustre.Options.INTPARAM
3120 PARAMLIST = Lustre.Options.PARAMLIST
3122 ('verbose,v', "Print system commands as they are run"),
3123 ('ldapurl',"LDAP server URL, e.g. ldap://localhost", PARAM),
3124 ('config', "Cluster config name used for LDAP query", PARAM),
3125 ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
3126 ('node', "Load config for <nodename>", PARAM),
3127 ('cleanup,d', "Cleans up config. (Shutdown)"),
3128 ('force,f', "Forced unmounting and/or obd detach during cleanup",
3130 ('single_socket', "socknal option: only use one socket instead of bundle",
3132 ('failover',"""Used to shut down without saving state.
3133 This will allow this node to "give up" a service to
3134 another node for failover purposes. This will not
3135 be a clean shutdown.""",
3137 ('gdb', """Prints message after creating gdb module script
3138 and sleeps for 5 seconds."""),
3139 ('noexec,n', """Prints the commands and steps that will be run for a
3140 config without executing them. This can be used to check if a
3141 config file is doing what it should be doing"""),
3142 ('nomod', "Skip load/unload module step."),
3143 ('nosetup', "Skip device setup/cleanup step."),
3144 ('reformat', "Reformat all devices (without question)"),
3145 ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
3146 ('mountfsoptions', "Additional options for mount fs command line", PARAM),
3147 ('dump', "Dump the kernel debug log to file before portals is unloaded",
3149 ('write_conf', "Save all the client config information on mds."),
3150 ('record', "Write config information on mds."),
3151 ('record_log', "Name of config record log.", PARAM),
3152 ('record_device', "MDS device name that will record the config commands",
3154 ('minlevel', "Minimum level of services to configure/cleanup",
3156 ('maxlevel', """Maximum level of services to configure/cleanup
3157 Levels are approximately like:
3162 70 - mountpoint, echo_client, osc, mdc, lov""",
3164 ('lustre', """Base directory of lustre sources. This parameter will
3165 cause lconf to load modules from a source tree.""", PARAM),
3166 ('portals', """Portals source directory. If this is a relative path,
3167 then it is assumed to be relative to lustre. """, PARAM),
3168 ('timeout', "Set recovery timeout", INTPARAM),
3169 ('upcall', "Set both portals and lustre upcall script", PARAM),
3170 ('lustre_upcall', "Set lustre upcall script", PARAM),
3171 ('portals_upcall', "Set portals upcall script", PARAM),
3172 ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
3173 ('ptldebug', "Set the portals debug level", PARAM),
3174 ('subsystem', "Set the portals debug subsystem", PARAM),
3175 ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
3176 ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
3177 # Client recovery options
3178 ('recover', "Recover a device"),
3179 ('group', "The group of devices to configure or cleanup", PARAM),
3180 ('tgt_uuid', "The failed target (required for recovery)", PARAM),
3181 ('client_uuid', "The failed client (required for recovery)", PARAM),
3182 ('conn_uuid', "The failed connection (required for recovery)", PARAM),
3184 ('inactive', """The name of an inactive service, to be ignored during
3185 mounting (currently OST-only). Can be repeated.""",
3190 global lctl, config, toplustreDB, CONFIG_FILE
3192 # in the upcall this is set to SIG_IGN
3193 signal.signal(signal.SIGCHLD, signal.SIG_DFL)
3195 cl = Lustre.Options("lconf", "config.xml", lconf_options)
3197 config, args = cl.parse(sys.argv[1:])
3198 except Lustre.OptionError, e:
3202 setupModulePath(sys.argv[0])
3204 host = socket.gethostname()
3206 # the PRNG is normally seeded with time(), which is not so good for starting
3207 # time-synchronized clusters
3208 input = open('/dev/urandom', 'r')
3210 print 'Unable to open /dev/urandom!'
3212 seed = input.read(32)
3218 init_select(config.select)
3221 # allow config to be fetched via HTTP, but only with python2
3222 if sys.version[0] != '1' and args[0].startswith('http://'):
3225 config_file = urllib2.urlopen(args[0])
3226 except (urllib2.URLError, socket.error), err:
3227 if hasattr(err, 'args'):
3229 print "Could not access '%s': %s" %(args[0], err)
3231 elif not os.access(args[0], os.R_OK):
3232 print 'File not found or not readable:', args[0]
3236 config_file = open(args[0], 'r')
3238 dom = xml.dom.minidom.parse(config_file)
3240 panic("%s does not appear to be a config file." % (args[0]))
3241 sys.exit(1) # make sure to die here, even in debug mode.
3243 CONFIG_FILE = args[0]
3244 lustreDB = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
3245 if not config.config:
3246 config.config = os.path.basename(args[0]) # use full path?
3247 if config.config[-4:] == '.xml':
3248 config.config = config.config[:-4]
3249 elif config.ldapurl:
3250 if not config.config:
3251 panic("--ldapurl requires --config name")
3252 dn = "config=%s,fs=lustre" % (config.config)
3253 lustreDB = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
3254 elif config.ptldebug or config.subsystem:
3255 sys_set_ptldebug(None)
3256 sys_set_subsystem(None)
3259 print 'Missing config file or ldap URL.'
3260 print 'see lconf --help for command summary'
3263 toplustreDB = lustreDB
3265 ver = lustreDB.get_version()
3267 panic("No version found in config data, please recreate.")
3268 if ver != Lustre.CONFIG_VERSION:
3269 panic("Config version", ver, "does not match lconf version",
3270 Lustre.CONFIG_VERSION)
3274 node_list.append(config.node)
3277 node_list.append(host)
3278 node_list.append('localhost')
3280 debug("configuring for host: ", node_list)
3283 config.debug_path = config.debug_path + '-' + host
3284 config.gdb_script = config.gdb_script + '-' + host
3286 lctl = LCTLInterface('lctl')
3288 if config.lctl_dump:
3289 lctl.use_save_file(config.lctl_dump)
3292 if not (config.record_device and config.record_log):
3293 panic("When recording, both --record_log and --record_device must be specified.")
3294 lctl.clear_log(config.record_device, config.record_log)
3295 lctl.record(config.record_device, config.record_log)
3297 doHost(lustreDB, node_list)
3299 if not config.record:
3304 process_updates(lustreDB, config.record_device, config.record_log)
3306 if __name__ == "__main__":
3309 except Lustre.LconfError, e:
3311 # traceback.print_exc(file=sys.stdout)
3313 except CommandError, e:
3317 if first_cleanup_error:
3318 sys.exit(first_cleanup_error)