3 # Copyright (C) 2002-2003 Cluster File Systems, Inc.
4 # Authors: Robert Read <rread@clusterfs.com>
5 # Mike Shaver <shaver@clusterfs.com>
6 # This file is part of Lustre, http://www.lustre.org.
8 # Lustre is free software; you can redistribute it and/or
9 # modify it under the terms of version 2 of the GNU General Public
10 # License as published by the Free Software Foundation.
12 # Lustre is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Lustre; if not, write to the Free Software
19 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 # lconf - lustre configuration tool
23 # lconf is the main driver script for starting and stopping
24 # lustre filesystem services.
26 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import sys, getopt, types
29 import string, os, stat, popen2, socket, time, random, fcntl, select
30 import re, exceptions, signal, traceback
31 import xml.dom.minidom
33 if sys.version[0] == '1':
34 from FCNTL import F_GETFL, F_SETFL
36 from fcntl import F_GETFL, F_SETFL
38 PYMOD_DIR = "/usr/lib/lustre/python"
40 def development_mode():
41 base = os.path.dirname(sys.argv[0])
42 if os.access(base+"/Makefile", os.R_OK):
46 if development_mode():
47 sys.path.append('../utils')
49 sys.path.append(PYMOD_DIR)
55 DEFAULT_TCPBUF = 8388608
58 # Maximum number of devices to search for.
59 # (the /dev/loop* nodes need to be created beforehand)
60 MAX_LOOP_DEVICES = 256
61 PORTALS_DIR = '../portals'
63 # Needed to call lconf --record
66 # Please keep these in sync with the values in portals/kp30.h
78 "warning" : (1 << 10),
82 "portals" : (1 << 14),
84 "dlmtrace" : (1 << 16),
88 "rpctrace" : (1 << 20),
89 "vfstrace" : (1 << 21),
93 "console" : (1 << 25),
99 "undefined" : (1 << 0),
104 "obdclass" : (1 << 5),
109 "portals" : (1 << 10),
111 "pinger" : (1 << 12),
112 "filter" : (1 << 13),
117 "ptlrouter" : (1 << 18),
121 "confobd" : (1 << 22),
# Exit status of the first cleanup step that failed; 0 means no failure yet.
first_cleanup_error = 0

def cleanup_error(rc):
    """Record *rc* as the overall cleanup status, keeping only the first error."""
    global first_cleanup_error
    if not first_cleanup_error:
        first_cleanup_error = rc
134 # ============================================================
135 # debugging and error funcs
def fixme(msg = "this feature"):
    """Abort with a LconfError reporting that *msg* is not implemented.

    Uses the call form of raise (as the rest of this file does, e.g. in
    panic) instead of the legacy ``raise Class, arg`` statement form,
    which is a syntax error under Python 3.
    """
    raise Lustre.LconfError(msg + ' not implemented yet.')
141 msg = string.join(map(str,args))
142 if not config.noexec:
143 raise Lustre.LconfError(msg)
148 msg = string.join(map(str,args))
153 print string.strip(s)
157 msg = string.join(map(str,args))
160 # ack, python's builtin int() does not support '0x123' syntax.
161 # eval can do it, although what a hack!
165 return eval(s, {}, {})
168 except SyntaxError, e:
169 raise ValueError("not a number")
171 raise ValueError("not a number")
173 # ============================================================
174 # locally defined exceptions
175 class CommandError (exceptions.Exception):
176 def __init__(self, cmd_name, cmd_err, rc=None):
177 self.cmd_name = cmd_name
178 self.cmd_err = cmd_err
183 if type(self.cmd_err) == types.StringType:
185 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
187 print "! %s: %s" % (self.cmd_name, self.cmd_err)
188 elif type(self.cmd_err) == types.ListType:
190 print "! %s (error %d):" % (self.cmd_name, self.rc)
192 print "! %s:" % (self.cmd_name)
193 for s in self.cmd_err:
194 print "> %s" %(string.strip(s))
199 # ============================================================
200 # handle daemons, like the acceptor
202 """ Manage starting and stopping a daemon. Assumes daemon manages
203 it's own pid file. """
205 def __init__(self, cmd):
211 log(self.command, "already running.")
213 self.path = find_prog(self.command)
215 panic(self.command, "not found.")
216 ret, out = runcmd(self.path +' '+ self.command_line())
218 raise CommandError(self.path, out, ret)
222 pid = self.read_pidfile()
225 log ("killing process", pid)
228 log("was unable to find pid of " + self.command)
230 log("unable to kill", self.command, e)
231 time.sleep(5) # let daemon die
233 log("unable to kill", self.command)
236 pid = self.read_pidfile()
242 log("was unable to find pid of " + self.command)
249 def read_pidfile(self):
251 fp = open(self.pidfile(), 'r')
261 def clean_pidfile(self):
262 """ Remove a stale pidfile """
263 log("removing stale pidfile:", self.pidfile())
265 os.unlink(self.pidfile())
267 log(self.pidfile(), e)
269 class AcceptorHandler(DaemonHandler):
270 def __init__(self, port, net_type):
271 DaemonHandler.__init__(self, "acceptor")
276 return "/var/run/%s-%d.pid" % (self.command, self.port)
def command_line(self):
    """Build the acceptor's argument string: "<flags> <port>"."""
    args = [str(self.flags), str(self.port)]
    return ' '.join(args)
283 # start the acceptors
285 if config.lctl_dump or config.record:
287 for port in acceptors.keys():
288 daemon = acceptors[port]
289 if not daemon.running():
292 def run_one_acceptor(port):
293 if config.lctl_dump or config.record:
295 if acceptors.has_key(port):
296 daemon = acceptors[port]
297 if not daemon.running():
300 panic("run_one_acceptor: No acceptor defined for port:", port)
302 def stop_acceptor(port):
303 if acceptors.has_key(port):
304 daemon = acceptors[port]
309 # ============================================================
310 # handle lctl interface
313 Manage communication with lctl
316 def __init__(self, cmd):
318 Initialize close by finding the lctl binary.
320 self.lctl = find_prog(cmd)
322 self.record_device = ''
325 debug('! lctl not found')
328 raise CommandError('lctl', "unable to find lctl binary.")
def use_save_file(self, file):
    """Divert lctl output to *file*: run() prepends a 'dump' directive so
    subsequent commands are recorded to this file rather than applied."""
    self.save_file = file
def record(self, dev_name, logname):
    """Begin recording subsequent commands into config log *logname* on device *dev_name*."""
    log("Recording log", logname, "on", dev_name)
    self.record_log = logname
    self.record_device = dev_name
def end_record(self):
    """Finish the current config-log recording session and clear its state."""
    log("End recording log", self.record_log, "on", self.record_device)
    self.record_log = None
    self.record_device = None
def set_nonblock(self, fd):
    """Switch file descriptor *fd* into non-blocking mode."""
    flags = fcntl.fcntl(fd, F_GETFL)
    fcntl.fcntl(fd, F_SETFL, flags | os.O_NDELAY)
350 the cmds are written to stdin of lctl
351 lctl doesn't return errors when run in script mode, so
353 should modify command line to accept multiple commands, or
354 create complex command line options
358 cmds = '\n dump ' + self.save_file + '\n' + cmds
359 elif self.record_device:
363 %s""" % (self.record_device, self.record_log, cmds)
365 debug("+", cmd_line, cmds)
366 if config.noexec: return (0, [])
368 child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
369 child.tochild.write(cmds + "\n")
370 child.tochild.close()
371 # print "LCTL:", cmds
373 # From "Python Cookbook" from O'Reilly
374 outfile = child.fromchild
375 outfd = outfile.fileno()
376 self.set_nonblock(outfd)
377 errfile = child.childerr
378 errfd = errfile.fileno()
379 self.set_nonblock(errfd)
381 outdata = errdata = ''
384 ready = select.select([outfd,errfd],[],[]) # Wait for input
385 if outfd in ready[0]:
386 outchunk = outfile.read()
387 if outchunk == '': outeof = 1
388 outdata = outdata + outchunk
389 if errfd in ready[0]:
390 errchunk = errfile.read()
391 if errchunk == '': erreof = 1
392 errdata = errdata + errchunk
393 if outeof and erreof: break
394 # end of "borrowed" code
397 if os.WIFEXITED(ret):
398 rc = os.WEXITSTATUS(ret)
401 if rc or len(errdata):
402 raise CommandError(self.lctl, errdata, rc)
405 def runcmd(self, *args):
407 run lctl using the command line
409 cmd = string.join(map(str,args))
410 debug("+", self.lctl, cmd)
411 rc, out = run(self.lctl, cmd)
413 raise CommandError(self.lctl, out, rc)
416 def clear_log(self, dev, log):
417 """ clear an existing log """
422 quit """ % (dev, log)
425 def root_squash(self, name, uid, nid):
429 quit""" % (name, uid, nid)
432 def network(self, net, nid):
437 quit """ % (net, nid)
441 def add_interface(self, net, ip, netmask = ""):
442 """ add an interface """
446 quit """ % (net, ip, netmask)
449 # delete an interface
450 def del_interface(self, net, ip):
451 """ delete an interface """
458 # create a new connection
459 def add_uuid(self, net_type, uuid, nid):
460 cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type)
463 def add_peer(self, net_type, nid, hostaddr, port):
464 if net_type in ('tcp','openib','ra') and not config.lctl_dump:
469 nid, hostaddr, port )
471 elif net_type in ('iib',) and not config.lctl_dump:
478 elif net_type in ('vib',) and not config.lctl_dump:
486 def connect(self, srv):
487 self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
488 if srv.net_type in ('tcp','openib','iib','vib','ra') and not config.lctl_dump:
490 hostaddr = string.split(srv.hostaddr[0], '/')[0]
491 self.add_peer(srv.net_type, srv.nid, hostaddr, srv.port)
494 def recover(self, dev_name, new_conn):
497 recover %s""" %(dev_name, new_conn)
500 # add a route to a range
501 def add_route(self, net, gw, lo, hi):
509 except CommandError, e:
513 def del_route(self, net, gw, lo, hi):
518 quit """ % (net, gw, lo, hi)
521 # add a route to a host
522 def add_route_host(self, net, uuid, gw, tgt):
523 self.add_uuid(net, uuid, tgt)
531 except CommandError, e:
535 # add a route to a range
536 def del_route_host(self, net, uuid, gw, tgt):
542 quit """ % (net, gw, tgt)
546 def del_peer(self, net_type, nid, hostaddr):
547 if net_type in ('tcp',) and not config.lctl_dump:
551 del_peer %s %s single_share
555 elif net_type in ('openib','iib','vib','ra') and not config.lctl_dump:
559 del_peer %s single_share
564 # disconnect one connection
565 def disconnect(self, srv):
566 self.del_uuid(srv.nid_uuid)
567 if srv.net_type in ('tcp','openib','iib','vib','ra') and not config.lctl_dump:
569 hostaddr = string.split(srv.hostaddr[0], '/')[0]
570 self.del_peer(srv.net_type, srv.nid, hostaddr)
572 def del_uuid(self, uuid):
580 def disconnectAll(self, net):
588 def attach(self, type, name, uuid):
591 quit""" % (type, name, uuid)
594 def detach(self, name):
601 def set_security(self, name, key, value):
605 quit""" % (name, key, value)
608 def setup(self, name, setup = ""):
612 quit""" % (name, setup)
615 def add_conn(self, name, conn_uuid):
619 quit""" % (name, conn_uuid)
622 def start(self, name, conf_name):
626 quit""" % (name, conf_name)
629 # create a new device with lctl
630 def newdev(self, type, name, uuid, setup = ""):
632 self.attach(type, name, uuid);
634 self.setup(name, setup)
635 except CommandError, e:
636 self.cleanup(name, uuid, 0)
640 def cleanup(self, name, uuid, force, failover = 0):
641 if failover: force = 1
647 quit""" % (name, ('', 'force')[force],
648 ('', 'failover')[failover])
652 def lov_setup(self, name, uuid, desc_uuid, stripe_cnt,
653 stripe_sz, stripe_off, pattern):
656 lov_setup %s %d %d %d %s
657 quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off, pattern)
660 # add an OSC to a LOV
661 def lov_add_osc(self, name, ost_uuid, index, gen):
663 lov_modify_tgts add %s %s %s %s
664 quit""" % (name, ost_uuid, index, gen)
667 # delete an OSC from a LOV
668 def lov_del_osc(self, name, ost_uuid, index, gen):
670 lov_modify_tgts del %s %s %s %s
671 quit""" % (name, ost_uuid, index, gen)
675 def deactivate(self, name):
683 def lmv_setup(self, name, uuid, desc_uuid):
687 quit""" % (name, uuid, desc_uuid)
690 # add an MDC to an LMV
691 def lmv_add_mdc(self, lmv_name, mdt_uuid):
693 lmv_modify_tgts add %s %s
694 quit""" % (lmv_name, mdt_uuid)
698 def dump(self, dump_file):
701 quit""" % (dump_file)
704 # get list of devices
705 def device_list(self):
706 devices = '/proc/fs/lustre/devices'
708 if os.access(devices, os.R_OK):
710 fp = open(devices, 'r')
718 def lustre_version(self):
719 rc, out = self.runcmd('version')
723 def mount_option(self, profile, osc, mdc, gkc):
725 mount_option %s %s %s %s
726 quit""" % (profile, osc, mdc, gkc)
729 # delete mount options
730 def del_mount_option(self, profile):
736 def set_timeout(self, timeout):
742 def set_lustre_upcall(self, upcall):
747 # ============================================================
748 # Various system-level functions
749 # (ideally moved to their own module)
751 # Run a command and return the output and status.
752 # stderr is sent to /dev/null, could use popen3 to
753 # save it if necessary
756 if config.noexec: return (0, [])
757 f = os.popen(cmd + ' 2>&1')
767 cmd = string.join(map(str,args))
770 # Run a command in the background.
771 def run_daemon(*args):
772 cmd = string.join(map(str,args))
774 if config.noexec: return 0
775 f = os.popen(cmd + ' 2>&1')
783 # Determine full path to use for an external command
784 # searches dirname(argv[0]) first, then PATH
786 syspath = string.split(os.environ['PATH'], ':')
787 cmdpath = os.path.dirname(sys.argv[0])
788 syspath.insert(0, cmdpath);
790 syspath.insert(0, os.path.join(config.portals, 'utils/'))
792 prog = os.path.join(d,cmd)
793 if os.access(prog, os.X_OK):
797 # Recursively look for file starting at base dir
798 def do_find_file(base, mod):
799 fullname = os.path.join(base, mod)
800 if os.access(fullname, os.R_OK):
802 for d in os.listdir(base):
803 dir = os.path.join(base,d)
804 if os.path.isdir(dir):
805 module = do_find_file(dir, mod)
809 # is the path a block device?
816 return stat.S_ISBLK(s[stat.ST_MODE])
818 # find the journal device from mkfs options
824 while i < len(x) - 1:
825 if x[i] == '-J' and x[i+1].startswith('device='):
831 # build fs according to type
833 def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
839 panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
841 # devsize is in 1k, and fs block count is in 4k
842 block_cnt = devsize/4
844 if fstype in ('ext3', 'extN', 'ldiskfs'):
845 # ext3 journal size is in megabytes
846 # but don't set jsize if mkfsoptions indicates a separate journal device
847 if jsize == 0 and jdev(mkfsoptions) == '':
849 if not is_block(dev):
850 ret, out = runcmd("ls -l %s" %dev)
851 devsize = int(string.split(out[0])[4]) / 1024
853 # sfdisk works for symlink, hardlink, and realdev
854 ret, out = runcmd("sfdisk -s %s" %dev)
856 devsize = int(out[0])
858 # sfdisk -s will fail for too large block device,
859 # then, read the size of partition from /proc/partitions
861 # get the realpath of the device
862 # it may be the real device, such as /dev/hda7
863 # or the hardlink created via mknod for a device
864 if 'realpath' in dir(os.path):
865 real_dev = os.path.realpath(dev)
869 while os.path.islink(real_dev) and (link_count < 20):
870 link_count = link_count + 1
871 dev_link = os.readlink(real_dev)
872 if os.path.isabs(dev_link):
875 real_dev = os.path.join(os.path.dirname(real_dev), dev_link)
877 panic("Entountered too many symbolic links resolving block device:", dev)
879 # get the major and minor number of the realpath via ls
880 # it seems python(os.stat) does not return
881 # the st_rdev member of the stat structure
882 ret, out = runcmd("ls -l %s" %real_dev)
883 major = string.split(string.split(out[0])[4], ",")[0]
884 minor = string.split(out[0])[5]
886 # get the devsize from /proc/partitions with the major and minor number
887 ret, out = runcmd("cat /proc/partitions")
890 if string.split(line)[0] == major and string.split(line)[1] == minor:
891 devsize = int(string.split(line)[2])
894 if devsize > 1024 * 1024:
895 jsize = ((devsize / 102400) * 4)
898 if jsize: jopt = "-J size=%d" %(jsize,)
899 if isize: iopt = "-I %d" %(isize,)
900 mkfs = 'mkfs.ext2 -j -b 4096 '
901 if not isblock or config.force:
903 if jdev(mkfsoptions) != '':
904 jmkfs = 'mkfs.ext2 -b 4096 -O journal_dev '
906 jmkfs = jmkfs + '-F '
907 jmkfs = jmkfs + jdev(mkfsoptions)
908 (ret, out) = run (jmkfs)
910 panic("Unable format journal device:", jdev(mkfsoptions), string.join(out))
911 elif fstype == 'reiserfs':
912 # reiserfs journal size is in blocks
913 if jsize: jopt = "--journal_size %d" %(jsize,)
914 mkfs = 'mkreiserfs -ff'
916 panic('unsupported fs type: ', fstype)
918 if config.mkfsoptions != None:
919 mkfs = mkfs + ' ' + config.mkfsoptions
920 if mkfsoptions != None:
921 mkfs = mkfs + ' ' + mkfsoptions
922 (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt)
924 panic("Unable to build fs:", dev, string.join(out))
925 # enable hash tree indexing on the filesystem
926 if fstype in ('ext3', 'extN', 'ldiskfs'):
927 htree = 'echo "feature FEATURE_C5" | debugfs -w'
928 (ret, out) = run (htree, dev)
930 panic("Unable to enable htree:", dev)
932 # some systems use /dev/loopN, some /dev/loop/N
936 if not os.access(loop + str(0), os.R_OK):
938 if not os.access(loop + str(0), os.R_OK):
939 panic ("can't access loop devices")
942 # find loop device assigned to the file
943 def find_assigned_loop(file):
945 for n in xrange(0, MAX_LOOP_DEVICES):
947 if os.access(dev, os.R_OK):
948 (stat, out) = run('losetup', dev)
949 if out and stat == 0:
950 m = re.search(r'\((.*)\)', out[0])
951 if m and file == m.group(1):
955 # find free loop device
956 def find_free_loop(file):
959 # find next free loop
960 for n in xrange(0, MAX_LOOP_DEVICES):
962 if os.access(dev, os.R_OK):
963 (stat, out) = run('losetup', dev)
968 # create file if necessary and assign the first free loop device
969 def init_loop(file, size, fstype, journal_size, inode_size,
970 mkfsoptions, reformat, autoformat, backfstype, backfile):
973 realfstype = backfstype
974 if is_block(backfile):
975 if reformat or (need_format(realfstype, backfile) and autoformat == 'yes'):
976 mkfs(realfile, size, realfstype, journal_size, inode_size, mkfsoptions, isblock=0)
982 dev = find_assigned_loop(realfile)
984 print 'WARNING: file', realfile, 'already mapped to', dev
987 if reformat or not os.access(realfile, os.R_OK | os.W_OK):
988 (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, realfile))
990 panic("Unable to create backing store:", realfile)
991 mkfs(realfile, size, realfstype, journal_size, inode_size,
992 mkfsoptions, isblock=0)
994 dev = find_free_loop(realfile)
996 print "attach " + realfile + " <-> " + dev
997 run('losetup', dev, realfile)
1000 print "out of loop devices"
1003 # undo loop assignment
1004 def clean_loop(dev, fstype, backfstype, backdev):
1005 if fstype == 'smfs':
1009 if not is_block(realfile):
1010 dev = find_assigned_loop(realfile)
1012 print "detach " + dev + " <-> " + realfile
1013 ret, out = run('losetup -d', dev)
1015 log('unable to clean loop device', dev, 'for file', realfile)
1018 # finalizes the passed device
1019 def clean_dev(dev, fstype, backfstype, backdev):
1020 if fstype == 'smfs' or not is_block(dev):
1021 clean_loop(dev, fstype, backfstype, backdev)
1023 # determine if dev is formatted as a <fstype> filesystem
1024 def need_format(fstype, dev):
1025 # FIXME don't know how to implement this
1028 # initialize a block device if needed
1029 def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
1030 inode_size, mkfsoptions, backfstype, backdev):
1034 if fstype == 'smfs' or not is_block(dev):
1035 dev = init_loop(dev, size, fstype, journal_size, inode_size,
1036 mkfsoptions, reformat, autoformat, backfstype, backdev)
1037 elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
1038 mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
1041 # panic("device:", dev,
1042 # "not prepared, and autoformat is not set.\n",
1043 # "Rerun with --reformat option to format ALL filesystems")
1048 """lookup IP address for an interface"""
1049 rc, out = run("/sbin/ifconfig", iface)
1052 addr = string.split(out[1])[1]
1053 ip = string.split(addr, ':')[1]
def def_mount_options(fstype, target):
    """Return the default mount options for *fstype* on *target* ('mds' or 'ost').

    ext3/ldiskfs always gets errors=remount-ro; OSTs additionally get
    kernel-branch-specific options (asyncdel on 2.4, extents/mballoc on
    2.6).  Filesystem types without defaults yield an empty string.
    """
    if fstype == 'ext3' or fstype == 'ldiskfs':
        mountfsoptions = "errors=remount-ro"
        if target == 'ost':
            # Query the kernel branch once instead of per comparison.
            branch = sys_get_branch()
            if branch == '2.4':
                mountfsoptions = "%s,asyncdel" % (mountfsoptions)
            elif branch == '2.6':
                mountfsoptions = "%s,extents,mballoc" % (mountfsoptions)
        return mountfsoptions
    # Unsupported fstype: return an explicit empty option string instead
    # of falling off the end of the function.
    return ""
1067 def sys_get_elan_position_file():
1068 procfiles = ["/proc/elan/device0/position",
1069 "/proc/qsnet/elan4/device0/position",
1070 "/proc/qsnet/elan3/device0/position"]
1072 if os.access(p, os.R_OK):
1076 def sys_get_local_nid(net_type, wildcard, cluster_id):
1077 """Return the local nid."""
1079 if sys_get_elan_position_file():
1080 local = sys_get_local_address('elan', '*', cluster_id)
1082 local = sys_get_local_address(net_type, wildcard, cluster_id)
1085 def sys_get_local_address(net_type, wildcard, cluster_id):
1086 """Return the local address for the network type."""
1088 if net_type in ('tcp','openib','iib','vib','ra'):
1090 iface, star = string.split(wildcard, ':')
1091 local = if2addr(iface)
1093 panic ("unable to determine ip for:", wildcard)
1095 host = socket.gethostname()
1096 local = socket.gethostbyname(host)
1097 elif net_type == 'elan':
1098 # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()'
1099 f = sys_get_elan_position_file()
1101 panic ("unable to determine local Elan ID")
1104 lines = fp.readlines()
1108 if a[0] == 'NodeId':
1112 nid = my_int(cluster_id) + my_int(elan_id)
1113 local = "%d" % (nid)
1114 except ValueError, e:
1118 elif net_type == 'lo':
1119 fixme("automatic local address for loopback")
1120 elif net_type == 'gm':
1121 fixme("automatic local address for GM")
1125 def sys_get_branch():
1126 """Returns kernel release"""
1128 fp = open('/proc/sys/kernel/osrelease')
1129 lines = fp.readlines()
1133 version = string.split(l)
1134 a = string.split(version[0], '.')
1135 return a[0] + '.' + a[1]
1140 # XXX: instead of device_list, ask for $name and see what we get
1141 def is_prepared(name):
1142 """Return true if a device exists for the name"""
1143 if config.lctl_dump:
1145 if (config.noexec or config.record) and config.cleanup:
1148 # expect this format:
1149 # 1 UP ldlm ldlm ldlm_UUID 2
1150 out = lctl.device_list()
1152 if name == string.split(s)[3]:
1154 except CommandError, e:
1158 def net_is_prepared():
1159 """If the any device exists, then assume that all networking
1160 has been configured"""
1161 out = lctl.device_list()
1164 def fs_is_mounted(path):
1165 """Return true if path is a mounted lustre filesystem"""
1167 fp = open('/proc/mounts')
1168 lines = fp.readlines()
1172 if a[1] == path and a[2] == 'lustre_lite':
1178 def kmod_find(src_dir, dev_dir, modname):
1179 modbase = src_dir +'/'+ dev_dir +'/'+ modname
1180 for modext in '.ko', '.o':
1181 module = modbase + modext
1183 if os.access(module, os.R_OK):
1189 def kmod_info(modname):
1190 """Returns reference count for passed module name."""
1192 fp = open('/proc/modules')
1193 lines = fp.readlines()
1196 # please forgive my tired fingers for this one
1197 ret = filter(lambda word, mod = modname: word[0] == mod,
1198 map(lambda line: string.split(line), lines))
1202 except Exception, e:
1206 """Presents kernel module"""
1207 def __init__(self, src_dir, dev_dir, name):
1208 self.src_dir = src_dir
1209 self.dev_dir = dev_dir
1212 # FIXME we ignore the failure of loading gss module, because we might
1213 # don't need it at all.
1216 log ('loading module:', self.name, 'srcdir',
1217 self.src_dir, 'devdir', self.dev_dir)
1219 module = kmod_find(self.src_dir, self.dev_dir,
1221 if not module and self.name != 'ptlrpcs_gss':
1222 panic('module not found:', self.name)
1223 (rc, out) = run('/sbin/insmod', module)
1225 if self.name == 'ptlrpcs_gss':
1226 print "Warning: not support gss security!"
1228 raise CommandError('insmod', out, rc)
1230 (rc, out) = run('/sbin/modprobe', self.name)
1232 if self.name == 'ptlrpcs_gss':
1233 print "Warning: not support gss security!"
1235 raise CommandError('modprobe', out, rc)
1239 log('unloading module:', self.name)
1240 (rc, out) = run('/sbin/rmmod', self.name)
1242 log('unable to unload module:', self.name +
1243 "(" + self.refcount() + ")")
1247 """Returns module info if any."""
1248 return kmod_info(self.name)
1251 """Returns 1 if module is loaded. Otherwise 0 is returned."""
1258 """Returns module refcount."""
1265 """Returns 1 if module is used, otherwise 0 is returned."""
1271 if users and users != '(unused)' and users != '-':
1279 """Returns 1 if module is busy, otherwise 0 is returned."""
1280 if self.loaded() and (self.used() or self.refcount() != '0'):
1286 """Manage kernel modules"""
def __init__(self, lustre_dir, portals_dir):
    # Roots of the lustre and portals source trees, used to locate modules.
    self.lustre_dir = lustre_dir
    self.portals_dir = portals_dir
    # Ordered list of kmod objects; modules are loaded in append order.
    self.kmodule_list = []
1292 def find_module(self, modname):
1293 """Find module by module name"""
1294 for mod in self.kmodule_list:
1295 if mod.name == modname:
1299 def add_portals_module(self, dev_dir, modname):
1300 """Append a module to list of modules to load."""
1302 mod = self.find_module(modname)
1304 mod = kmod(self.portals_dir, dev_dir, modname)
1305 self.kmodule_list.append(mod)
1307 def add_lustre_module(self, dev_dir, modname):
1308 """Append a module to list of modules to load."""
1310 mod = self.find_module(modname)
1312 mod = kmod(self.lustre_dir, dev_dir, modname)
1313 self.kmodule_list.append(mod)
1315 def load_modules(self):
1316 """Load all the modules in the list in the order they appear."""
1317 for mod in self.kmodule_list:
1318 if mod.loaded() and not config.noexec:
1322 def cleanup_modules(self):
1323 """Unload the modules in the list in reverse order."""
1324 rev = self.kmodule_list
1327 if (not mod.loaded() or mod.busy()) and not config.noexec:
1330 if mod.name == 'portals' and config.dump:
1331 lctl.dump(config.dump)
1334 # ============================================================
1335 # Classes to prepare and cleanup the various objects
1338 """ Base class for the rest of the modules. The default cleanup method is
1339 defined here, as well as some utilitiy funcs.
1341 def __init__(self, module_name, db):
1343 self.module_name = module_name
1344 self.name = self.db.getName()
1345 self.uuid = self.db.getUUID()
def info(self, *args):
    """Print a status line tagged with this module's type, name and uuid."""
    msg = string.join(map(str,args))
    print self.module_name + ":", self.name, self.uuid, msg
1354 """ default cleanup, used for most modules """
1357 lctl.cleanup(self.name, self.uuid, config.force)
1358 except CommandError, e:
1359 log(self.module_name, "cleanup failed: ", self.name)
1363 def add_module(self, manager):
1364 """Adds all needed modules in the order they appear."""
1367 def safe_to_clean(self):
def safe_to_clean_modules(self):
    """By default, modules may be unloaded whenever the module itself is safe to clean."""
    return self.safe_to_clean()
1373 class Network(Module):
1374 def __init__(self,db):
1375 Module.__init__(self, 'NETWORK', db)
1376 self.net_type = self.db.get_val('nettype')
1377 self.nid = self.db.get_val('nid', '*')
1378 self.cluster_id = self.db.get_val('clusterid', "0")
1379 self.port = self.db.get_val_int('port', 0)
1382 self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
1384 panic("unable to set nid for", self.net_type, self.nid, cluster_id)
1385 self.generic_nid = 1
1386 debug("nid:", self.nid)
1388 self.generic_nid = 0
1390 self.nid_uuid = self.nid_to_uuid(self.nid)
1391 self.hostaddr = self.db.get_hostaddr()
1392 if len(self.hostaddr) == 0:
1393 self.hostaddr.append(self.nid)
1394 if '*' in self.hostaddr[0]:
1395 self.hostaddr[0] = sys_get_local_address(self.net_type, self.hostaddr[0], self.cluster_id)
1396 if not self.hostaddr[0]:
1397 panic("unable to set hostaddr for", self.net_type, self.hostaddr[0], self.cluster_id)
1398 debug("hostaddr:", self.hostaddr[0])
def add_module(self, manager):
    """Queue the portals kernel modules required for this network type.

    Bug fix: the 'vib' branch called self.add_portals_module(), a method
    Network does not have (it would raise AttributeError at runtime);
    it now delegates to *manager* like every other branch.
    """
    manager.add_portals_module("libcfs", 'libcfs')
    manager.add_portals_module("portals", 'portals')
    if node_needs_router():
        manager.add_portals_module("router", 'kptlrouter')
    # One NAL (network abstraction layer) module per network type.
    nal_modules = {
        'tcp':    ("knals/socknal",   'ksocknal'),
        'elan':   ("knals/qswnal",    'kqswnal'),
        'gm':     ("knals/gmnal",     'kgmnal'),
        'openib': ("knals/openibnal", 'kopenibnal'),
        'iib':    ("knals/iibnal",    'kiibnal'),
        'vib':    ("knals/vibnal",    'kvibnal'),
        'lo':     ("knals/lonal",     'klonal'),
        'ra':     ("knals/ranal",     'kranal'),
    }
    if self.net_type in nal_modules:
        dev_dir, modname = nal_modules[self.net_type]
        manager.add_portals_module(dev_dir, modname)
def nid_to_uuid(self, nid):
    """Build the canonical UUID string used to identify a network id."""
    return "NID_" + str(nid) + "_UUID"
1427 if not config.record and net_is_prepared():
1429 self.info(self.net_type, self.nid, self.port)
1430 if not (config.record and self.generic_nid):
1431 lctl.network(self.net_type, self.nid)
1432 if self.net_type == 'tcp':
1434 for hostaddr in self.db.get_hostaddr():
1435 ip = string.split(hostaddr, '/')[0]
1436 if len(string.split(hostaddr, '/')) == 2:
1437 netmask = string.split(hostaddr, '/')[1]
1440 lctl.add_interface(self.net_type, ip, netmask)
1441 if self.net_type == 'elan':
1443 if self.port and node_is_router():
1444 run_one_acceptor(self.port)
1445 self.connect_peer_gateways()
1447 def connect_peer_gateways(self):
1448 for router in self.db.lookup_class('node'):
1449 if router.get_val_int('router', 0):
1450 for netuuid in router.get_networks():
1451 net = self.db.lookup(netuuid)
1453 if (gw.cluster_id == self.cluster_id and
1454 gw.net_type == self.net_type):
1455 if gw.nid != self.nid:
1458 def disconnect_peer_gateways(self):
1459 for router in self.db.lookup_class('node'):
1460 if router.get_val_int('router', 0):
1461 for netuuid in router.get_networks():
1462 net = self.db.lookup(netuuid)
1464 if (gw.cluster_id == self.cluster_id and
1465 gw.net_type == self.net_type):
1466 if gw.nid != self.nid:
1469 except CommandError, e:
1470 print "disconnect failed: ", self.name
def safe_to_clean(self):
    """Networking may be torn down only when no lustre devices remain configured."""
    return not net_is_prepared()
1478 self.info(self.net_type, self.nid, self.port)
1480 stop_acceptor(self.port)
1481 if node_is_router():
1482 self.disconnect_peer_gateways()
1483 if self.net_type == 'tcp':
1484 for hostaddr in self.db.get_hostaddr():
1485 ip = string.split(hostaddr, '/')[0]
1486 lctl.del_interface(self.net_type, ip)
1488 def correct_level(self, level, op=None):
1491 class RouteTable(Module):
def __init__(self,db):
    # Register under the 'ROUTES' module tag for logging/cleanup.
    Module.__init__(self, 'ROUTES', db)
1495 def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
1497 # only setup connections for tcp, openib, and iib NALs
1499 if not net_type in ('tcp','openib','iib','vib','ra'):
1502 # connect to target if route is to single node and this node is the gw
1503 if lo == hi and local_interface(net_type, gw_cluster_id, gw):
1504 if not local_cluster(net_type, tgt_cluster_id):
1505 panic("target", lo, " not on the local cluster")
1506 srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
1507 # connect to gateway if this node is not the gw
1508 elif (local_cluster(net_type, gw_cluster_id)
1509 and not local_interface(net_type, gw_cluster_id, gw)):
1510 srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
1515 panic("no server for nid", lo)
1518 return Network(srvdb)
1521 if not config.record and net_is_prepared():
1524 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1525 lctl.add_route(net_type, gw, lo, hi)
1526 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
def safe_to_clean(self):
    """Routes may be removed only when no lustre devices remain configured."""
    return not net_is_prepared()
1534 if net_is_prepared():
1535 # the network is still being used, don't clean it up
1537 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1538 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1541 lctl.disconnect(srv)
1542 except CommandError, e:
1543 print "disconnect failed: ", self.name
1548 lctl.del_route(net_type, gw, lo, hi)
1549 except CommandError, e:
1550 print "del_route failed: ", self.name
1554 class Management(Module):
def __init__(self, db):
    """Bind the management service module to its configuration db entry."""
    Module.__init__(self, 'MGMT', db)
def add_module(self, manager):
    """Queue the kernel modules the management service needs, in load order."""
    for subdir, modname in (('lvfs', 'lvfs'),
                            ('obdclass', 'obdclass'),
                            ('ptlrpc', 'ptlrpc'),
                            ('mgmt', 'mgmt_svc')):
        manager.add_lustre_module(subdir, modname)
1565 if not config.record and is_prepared(self.name):
1568 lctl.newdev("mgmt", self.name, self.uuid)
1570 def safe_to_clean(self):
1574 if is_prepared(self.name):
1575 Module.cleanup(self)
1577 def correct_level(self, level, op=None):
1580 # This is only needed to load the modules; the LDLM device
1581 # is now created automatically.
# LDLM module loader (the "class LDLM(Module):" header line is missing
# from this excerpt). Only add_module does real work.
1583 def __init__(self,db):
1584 Module.__init__(self, 'LDLM', db)
1586 def add_module(self, manager):
1587 manager.add_lustre_module('lvfs', 'lvfs')
1588 manager.add_lustre_module('obdclass', 'obdclass')
1589 manager.add_lustre_module('sec', 'ptlrpcs')
1590 manager.add_lustre_module('ptlrpc', 'ptlrpc')
1591 manager.add_lustre_module('sec/gss', 'ptlrpcs_gss')
1599 def correct_level(self, level, op=None):
# LOV client device (the "class LOV" header line is missing from this
# excerpt; interior lines are also missing — code kept byte-identical).
# Reads striping parameters from the config DB and builds one OSC per
# LOV target.
1603 def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
1604 Module.__init__(self, 'LOV', db)
1605 if name_override != None:
1606 self.name = "lov_%s" % name_override
1607 self.mds_uuid = self.db.get_first_ref('mds')
1608 self.stripe_sz = self.db.get_val_int('stripesize', 1048576)
1609 self.stripe_off = self.db.get_val_int('stripeoffset', 0)
1610 self.pattern = self.db.get_val_int('stripepattern', 0)
1611 self.devlist = self.db.get_lov_tgts('lov_tgt')
1612 self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
# The descriptor keeps the original uuid; the device itself gets a fresh
# client uuid so multiple clients of the same LOV do not collide.
1615 self.desc_uuid = self.uuid
1616 self.uuid = generate_client_uuid(self.name)
1617 self.fs_name = fs_name
1619 self.config_only = 1
1621 self.config_only = None
1622 mds = self.db.lookup(self.mds_uuid)
1623 self.mds_name = mds.getName()
1624 for (obd_uuid, index, gen, active) in self.devlist:
1627 self.obdlist.append(obd_uuid)
1628 obd = self.db.lookup(obd_uuid)
1629 osc = get_osc(obd, self.uuid, fs_name)
1631 self.osclist.append((osc, index, gen, active))
1633 panic('osc not found:', obd_uuid)
# prepare() (def header missing): set up the LOV device, then prepare and
# attach each member OSC.
1642 if not config.record and is_prepared(self.name):
1644 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
1645 self.stripe_off, self.pattern, self.devlist,
1647 lctl.lov_setup(self.name, self.uuid, self.desc_uuid, self.stripe_cnt,
1648 self.stripe_sz, self.stripe_off, self.pattern)
1649 for (osc, index, gen, active) in self.osclist:
1650 target_uuid = osc.target_uuid
1652 # Only ignore connect failures with --force, which
1653 # isn't implemented here yet.
1655 osc.prepare(ignore_connect_failure=0)
1656 except CommandError, e:
1657 print "Error preparing OSC %s\n" % osc.uuid
1659 lctl.lov_add_osc(self.name, target_uuid, index, gen)
# cleanup() (def header missing): tear down member OSCs, then the device;
# a config_only LOV must never be cleaned.
1662 for (osc, index, gen, active) in self.osclist:
1663 target_uuid = osc.target_uuid
1665 if is_prepared(self.name):
1666 Module.cleanup(self)
1667 if self.config_only:
1668 panic("Can't clean up config_only LOV ", self.name)
1670 def add_module(self, manager):
1671 if self.config_only:
1672 panic("Can't load modules for config_only LOV ", self.name)
1673 for (osc, index, gen, active) in self.osclist:
1674 osc.add_module(manager)
1676 manager.add_lustre_module('lov', 'lov')
1678 def correct_level(self, level, op=None):
# LMV client device (the "class LMV" header line is missing from this
# excerpt). Mirrors LOV but aggregates MDCs instead of OSCs.
1682 def __init__(self, db, uuid, fs_name, name_override = None):
1683 Module.__init__(self, 'LMV', db)
1684 if name_override != None:
1685 self.name = "lmv_%s" % name_override
1687 self.devlist = self.db.get_lmv_tgts('lmv_tgt')
1688 if self.devlist == None:
# fall back to plain MDS refs when no explicit lmv_tgt list exists
1689 self.devlist = self.db.get_refs('mds')
1692 self.desc_uuid = self.uuid
1694 self.fs_name = fs_name
1695 for mds_uuid in self.devlist:
1696 mds = self.db.lookup(mds_uuid)
1698 panic("MDS not found!")
1699 mdc = MDC(mds, self.uuid, fs_name)
1701 self.mdclist.append(mdc)
1703 panic('mdc not found:', mds_uuid)
# prepare() (def header missing).
# NOTE(review): this guard reads "if config.record and is_prepared(...)"
# whereas the parallel guards elsewhere (e.g. LOV, line 1642) test
# "if not config.record and is_prepared(...)" — the missing 'not' looks
# like a defect; confirm against the upstream lconf.py.
1706 if config.record and is_prepared(self.name):
1710 lctl.lmv_setup(self.name, self.uuid, self.desc_uuid)
1711 for mdc in self.mdclist:
1713 # Only ignore connect failures with --force, which
1714 # isn't implemented here yet.
1715 mdc.prepare(ignore_connect_failure=0)
1716 except CommandError, e:
1717 print "Error preparing LMV %s\n" % mdc.uuid
1719 lctl.lmv_add_mdc(self.name, mdc.target_uuid)
# cleanup() (def header missing): tear down member MDCs, then the device.
1722 for mdc in self.mdclist:
1724 if is_prepared(self.name):
1725 Module.cleanup(self)
1727 def add_module(self, manager):
1728 for mdc in self.mdclist:
1729 mdc.add_module(manager)
1731 manager.add_lustre_module('lmv', 'lmv')
1733 def correct_level(self, level, op=None):
# GKD: GKS (security key service) device (the class header line is missing
# from this excerpt). Creates the "gks" device plus a shared 'GKT' device.
1737 def __init__(self,db):
1738 Module.__init__(self, 'GKD', db)
1739 target_uuid = self.db.get_first_ref('target')
1740 self.target = self.db.lookup(target_uuid)
1741 self.name = self.target.getName()
1743 active_uuid = get_active_target(self.target)
1745 panic("No target device found:", target_uuid)
1746 if active_uuid == self.uuid:
1751 self.uuid = target_uuid
# prepare() (def header missing): only the active node starts the service.
1754 if is_prepared(self.name):
1757 debug(self.uuid, "not active")
1761 lctl.newdev("gks", self.name, self.uuid, setup ="")
1762 if not is_prepared('GKT'):
1763 lctl.newdev("gkt", 'GKT', 'GKT_UUID', setup ="")
# cleanup() (def header missing): best-effort; CommandError is logged,
# not propagated.
1767 debug(self.uuid, "not active")
1770 if is_prepared(self.name):
1772 lctl.cleanup(self.name, self.uuid, config.force,
1774 except CommandError, e:
1775 log(self.module_name, "cleanup failed: ", self.name)
1778 Module.cleanup(self)
1779 if is_prepared('GKT'):
1781 lctl.cleanup("GKT", "GKT_UUID", config.force,
1783 except CommandError, e:
1784 print "cleanup failed: ", self.name
1788 def add_module(self, manager):
1790 manager.add_lustre_module('sec/gks', 'gks')
1791 manager.add_lustre_module('sec/gks', 'gkc')
1793 def correct_level(self, level, op=None):
# CONFDEV: the "confobd" device that owns an MDS/OST backing device and
# records the target's configuration llogs onto it.
# NOTE(review): this excerpt is a line-number-prefixed paste with interior
# lines missing (embedded numbers jump); code is kept byte-identical.
1796 class CONFDEV(Module):
1797 def __init__(self, db, name, target_uuid, uuid):
1798 Module.__init__(self, 'CONFDEV', db)
1799 self.devpath = self.db.get_val('devpath','')
1800 self.backdevpath = self.db.get_val('devpath','')
1801 self.size = self.db.get_val_int('devsize', 0)
1802 self.journal_size = self.db.get_val_int('journalsize', 0)
1803 self.fstype = self.db.get_val('fstype', '')
1804 self.backfstype = self.db.get_val('backfstype', '')
1805 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1806 self.mountfsoptions = self.db.get_val('mountfsoptions', '')
1807 self.target = self.db.lookup(target_uuid)
1808 self.name = "conf_%s" % self.target.getName()
1809 self.client_uuids = self.target.get_refs('client')
1810 self.fs_uuid = self.db.get_first_ref('filesystem')
1811 self.obdtype = self.db.get_val('obdtype', '')
# Security flavors: config-file values may be overridden (or, for
# deny_sec, appended to) by command-line options.
1813 self.mds_sec = self.db.get_val('mds_sec', '')
1814 self.oss_sec = self.db.get_val('oss_sec', '')
1815 self.deny_sec = self.db.get_val('deny_sec', '')
1817 if config.mds_mds_sec:
1818 self.mds_sec = config.mds_mds_sec
1819 if config.mds_oss_sec:
1820 self.oss_sec = config.mds_oss_sec
1821 if config.mds_deny_sec:
1823 self.deny_sec = "%s,%s" %(self.deny_sec, config.mds_deny_sec)
1825 self.deny_sec = config.mds_deny_sec
1827 if self.obdtype == None:
1828 self.obdtype = 'dumb'
1830 self.conf_name = name
1831 self.conf_uuid = uuid
1832 self.realdev = self.devpath
# An LMV ref, when present, supplies the client list instead of the target.
1837 lmv_uuid = self.db.get_first_ref('lmv')
1838 if lmv_uuid != None:
1839 self.lmv = self.db.lookup(lmv_uuid)
1840 if self.lmv != None:
1841 self.client_uuids = self.lmv.get_refs('client')
# MDS targets default autoformat to "no"; OST targets (the else path,
# partly missing from this excerpt) default it to "yes".
1843 if self.target.get_class() == 'mds':
1844 if self.target.get_val('failover', 0):
1845 self.failover_mds = 'f'
1847 self.failover_mds = 'n'
1848 self.format = self.db.get_val('autoformat', "no")
1850 self.format = self.db.get_val('autoformat', "yes")
1851 self.osdtype = self.db.get_val('osdtype')
1852 ost = self.db.lookup(target_uuid)
1853 if ost.get_val('failover', 0):
1854 self.failover_ost = 'f'
1856 self.failover_ost = 'n'
1858 self.inode_size = self.get_inode_size()
1860 if self.lmv != None:
1861 client_uuid = self.name + "_lmv_UUID"
1862 self.master = LMV(self.lmv, client_uuid,
1863 self.conf_name, self.conf_name)
# Pick an MDS inode size: explicit config value wins; otherwise derive it
# from the default stripe count of the associated LOV.
1865 def get_inode_size(self):
1866 inode_size = self.db.get_val_int('inodesize', 0)
1867 if inode_size == 0 and self.target.get_class() == 'mds':
1869 # default inode size for case when neither LOV nor
1870 # LMV is accessible.
1871 self.inode_size = 256
1873 # find the LOV for this MDS
1874 lovconfig_uuid = self.target.get_first_ref('lovconfig')
1875 if lovconfig_uuid or self.lmv != None:
1876 if self.lmv != None:
1877 lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
1878 lovconfig = self.lmv.lookup(lovconfig_uuid)
1879 lov_uuid = lovconfig.get_first_ref('lov')
1880 if lov_uuid == None:
1881 panic(self.target.getName() + ": No LOV found for lovconfig ",
1884 lovconfig = self.target.lookup(lovconfig_uuid)
1885 lov_uuid = lovconfig.get_first_ref('lov')
1886 if lov_uuid == None:
1887 panic(self.target.getName() + ": No LOV found for lovconfig ",
1889 if self.lmv != None:
1890 lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
1891 lovconfig = self.lmv.lookup(lovconfig_uuid)
1892 lov_uuid = lovconfig.get_first_ref('lov')
1894 lov = LOV(self.db.lookup(lov_uuid), lov_uuid, self.name,
1897 # default stripe count controls default inode_size
1898 if lov.stripe_cnt > 0:
1899 stripe_count = lov.stripe_cnt
1901 stripe_count = len(lov.devlist)
# Threshold values (assignments missing from this excerpt) scale the
# inode size up with the stripe count.
1902 if stripe_count > 77:
1904 elif stripe_count > 35:
1906 elif stripe_count > 13:
1908 #elif stripe_count > 3:
# Compose the mount option string: global defaults, then command-line
# overrides, then per-device options, then smfs backing-store options,
# then MDS-specific ACL/xattr options.
1915 def get_mount_options(self, blkdev):
1916 options = def_mount_options(self.fstype,
1917 self.target.get_class())
1919 if config.mountfsoptions:
1921 options = "%s,%s" %(options, config.mountfsoptions)
1923 options = config.mountfsoptions
1924 if self.mountfsoptions:
1925 options = "%s,%s" %(options, self.mountfsoptions)
1927 if self.mountfsoptions:
1929 options = "%s,%s" %(options, self.mountfsoptions)
1931 options = self.mountfsoptions
1933 if self.fstype == 'smfs':
1935 options = "%s,type=%s,dev=%s" %(options, self.backfstype,
1938 options = "type=%s,dev=%s" %(self.backfstype,
1941 if self.target.get_class() == 'mds':
1943 options = "%s,acl,user_xattr,iopen_nopriv" %(options)
1945 options = "iopen_nopriv"
# prepare() (def header missing): format/attach the block device and
# create the confobd device on it.
1950 if is_prepared(self.name):
1953 blkdev = block_dev(self.devpath, self.size, self.fstype,
1954 config.reformat, self.format, self.journal_size,
1955 self.inode_size, self.mkfsoptions, self.backfstype,
1958 if self.fstype == 'smfs':
1963 mountfsoptions = self.get_mount_options(blkdev)
1965 self.info(self.target.get_class(), realdev, mountfsoptions,
1966 self.fstype, self.size, self.format)
1968 lctl.newdev("confobd", self.name, self.uuid,
1969 setup ="%s %s %s" %(realdev, self.fstype,
1972 self.mountfsoptions = mountfsoptions
1973 self.realdev = realdev
1975 def add_module(self, manager):
1976 manager.add_lustre_module('obdclass', 'confobd')
1978 # this method checks if current OBD belong to the same FS as passed
1979 # mount point uuid. If not - do not write mountpoint and echo client
1980 # to log, it is not needed, but take damn long time (WB test case)
1982 def belong_to_fs(self, mtpt_uuid):
1983 mtpt = self.db.lookup(mtpt_uuid)
1984 fs_uuid = mtpt.get_first_ref('filesystem')
# An unset/empty fs_uuid on this OBD matches every mountpoint.
1986 if not self.fs_uuid or self.fs_uuid == "" or fs_uuid == self.fs_uuid:
# Record the target's configuration llogs: the -conf log with the device
# setup, per-client mount logs, and matching -clean logs.
1991 def write_conf(self):
1992 if self.target.get_class() == 'ost':
1994 lctl.clear_log(self.name, self.target.getName() + '-conf')
1995 lctl.record(self.name, self.target.getName() + '-conf')
1996 lctl.newdev(self.osdtype, self.conf_name, self.conf_uuid,
1997 setup ="%s %s %s %s" %(self.realdev, self.fstype,
1999 self.mountfsoptions))
2001 lctl.clear_log(self.name, 'OSS-conf')
2002 lctl.record(self.name, 'OSS-conf')
2003 lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
2008 if self.target.get_class() == 'mds':
2009 if self.master != None:
2010 master_name = self.master.name
2012 master_name = 'dumb'
2015 lctl.clear_log(self.name, self.target.getName() + '-conf')
2016 lctl.record(self.name, self.target.getName() + '-conf')
2017 lctl.attach("mds", self.conf_name, self.conf_uuid)
2019 lctl.set_security(self.conf_name, "mds_sec", self.mds_sec)
2021 lctl.set_security(self.conf_name, "oss_sec", self.oss_sec)
2023 for flavor in string.split(self.deny_sec, ','):
2024 lctl.set_security(self.conf_name, "deny_sec", flavor)
2025 lctl.newdev("mds", self.conf_name, self.conf_uuid,
2026 setup ="%s %s %s %s %s %s" %(self.realdev, self.fstype,
2027 self.conf_name, self.mountfsoptions,
2028 master_name, self.obdtype))
2032 if not self.client_uuids:
2036 client_uuid = self.conf_name + "_lmv_UUID"
2037 lmv = VMDC(self.lmv, client_uuid, self.conf_name, self.conf_name);
2041 for uuid in self.client_uuids:
2042 log("recording client:", uuid)
2043 client_uuid = generate_client_uuid(self.name)
2044 client = VOSC(self.db.lookup(uuid), client_uuid,
2045 self.target.getName(), self.name)
2047 lctl.clear_log(self.name, self.target.getName())
2048 lctl.record(self.name, self.target.getName())
2050 lctl.mount_option(self.target.getName(), client.get_name(), "", "")
2052 process_updates(self.db, self.name, self.target.getName(), lmv, client)
2054 lctl.clear_log(self.name, self.target.getName() + '-clean')
2055 lctl.record(self.name, self.target.getName() + '-clean')
2057 lctl.del_mount_option(self.target.getName())
2065 # record logs for each client
2067 config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
2069 config_options = CONFIG_FILE
2071 for node_db in self.db.lookup_class('node'):
2072 client_name = node_db.getName()
2073 for prof_uuid in node_db.get_refs('profile'):
2074 prof_db = node_db.lookup(prof_uuid)
2075 # refactor this into a function to test "clientness"
2077 for ref_class, ref_uuid in prof_db.get_all_refs():
2078 if ref_class in ('mountpoint','echoclient') and self.belong_to_fs(ref_uuid):
2079 debug("recording:", client_name)
2080 log("recording mountpoint:", ref_uuid)
# Re-invoke lconf itself in --record mode for this client node,
# preserving -n (noexec) when the current run has it set.
2081 old_noexec = config.noexec
2083 noexec_opt = ('', '-n')
2084 ret, out = run (sys.argv[0],
2085 noexec_opt[old_noexec == 1],
2086 " -v --record --nomod",
2087 "--record_log", client_name,
2088 "--record_device", self.name,
2089 "--node", client_name,
2092 for s in out: log("record> ", string.strip(s))
2093 ret, out = run (sys.argv[0],
2094 noexec_opt[old_noexec == 1],
2095 "--cleanup -v --record --nomod",
2096 "--record_log", client_name + "-clean",
2097 "--record_device", self.name,
2098 "--node", client_name,
2101 for s in out: log("record> ", string.strip(s))
2102 config.noexec = old_noexec
# start() (def header missing): replay the recorded -conf logs; an OST
# also starts the shared 'OSS' device.
2106 lctl.start(self.name, self.conf_name)
2107 except CommandError, e:
2109 if self.target.get_class() == 'ost':
2110 if not is_prepared('OSS'):
2112 lctl.start(self.name, 'OSS')
2113 except CommandError, e:
# cleanup() (def header missing): best-effort teardown of the confobd
# device and its backing block device.
2117 if is_prepared(self.name):
2119 lctl.cleanup(self.name, self.uuid, 0, 0)
2120 clean_dev(self.devpath, self.fstype,
2121 self.backfstype, self.backdevpath)
2122 except CommandError, e:
2123 log(self.module_name, "cleanup failed: ", self.name)
2126 Module.cleanup(self)
# MDSDEV: an MDS target device; delegates the backing device and config
# log recording to an embedded CONFDEV.
# NOTE(review): excerpt has missing interior lines; code kept byte-identical.
2128 class MDSDEV(Module):
2129 def __init__(self,db):
2130 Module.__init__(self, 'MDSDEV', db)
2131 self.devpath = self.db.get_val('devpath','')
2132 self.backdevpath = self.db.get_val('devpath','')
2133 self.size = self.db.get_val_int('devsize', 0)
2134 self.journal_size = self.db.get_val_int('journalsize', 0)
2135 self.fstype = self.db.get_val('fstype', '')
2136 self.backfstype = self.db.get_val('backfstype', '')
2137 self.nspath = self.db.get_val('nspath', '')
2138 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
2139 self.mountfsoptions = self.db.get_val('mountfsoptions', '')
2140 self.obdtype = self.db.get_val('obdtype', '')
2141 self.root_squash = self.db.get_val('root_squash', '')
2142 self.no_root_squash = self.db.get_val('no_root_squash', '')
2144 target_uuid = self.db.get_first_ref('target')
2145 self.target = self.db.lookup(target_uuid)
2146 self.name = self.target.getName()
2150 lmv_uuid = self.db.get_first_ref('lmv')
2151 if lmv_uuid != None:
2152 self.lmv = self.db.lookup(lmv_uuid)
2154 active_uuid = get_active_target(self.target)
2156 panic("No target device found:", target_uuid)
2157 if active_uuid == self.uuid:
2159 group = self.target.get_val('group')
2160 if config.group and config.group != group:
2165 self.uuid = target_uuid
2168 if self.lmv != None:
2169 client_uuid = self.name + "_lmv_UUID"
2170 self.master = LMV(self.lmv, client_uuid,
2171 self.name, self.name)
2173 self.confobd = CONFDEV(self.db, self.name,
2174 target_uuid, self.uuid)
2176 def add_module(self, manager):
2178 manager.add_lustre_module('mdc', 'mdc')
2179 manager.add_lustre_module('osc', 'osc')
2180 manager.add_lustre_module('ost', 'ost')
2181 manager.add_lustre_module('lov', 'lov')
2182 manager.add_lustre_module('mds', 'mds')
2184 if self.fstype == 'smfs' or self.fstype == 'ldiskfs':
2185 manager.add_lustre_module(self.fstype, self.fstype)
2188 manager.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
2190 # if fstype is smfs, then we should also take care about backing
2192 if self.fstype == 'smfs':
2193 manager.add_lustre_module(self.backfstype, self.backfstype)
2194 manager.add_lustre_module('lvfs', 'fsfilt_%s' % (self.backfstype))
# The 'snap' mount option needs the snapshot fsfilt modules and is only
# valid on smfs.
2196 for option in string.split(self.mountfsoptions, ','):
2197 if option == 'snap':
2198 if not self.fstype == 'smfs':
2199 panic("mountoptions has 'snap', but fstype is not smfs.")
2200 manager.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.fstype))
2201 manager.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.backfstype))
2204 if self.master != None:
2205 self.master.add_module(manager)
2207 # add CONFDEV modules
2208 if self.confobd != None:
2209 self.confobd.add_module(manager)
# Record config logs via the confobd; prepares it temporarily when this
# is a standalone --write_conf run.
2211 def write_conf(self):
2212 if config.write_conf:
2214 debug(self.uuid, "not active")
2216 self.confobd.write_conf()
2218 if is_prepared(self.name):
2221 debug(self.uuid, "not active")
2224 self.confobd.prepare()
2225 self.confobd.write_conf()
2226 self.confobd.cleanup()
# prepare() (def header missing): bring up confobd, the LMV master, the
# shared MDT device, and apply root-squash settings.
2229 if is_prepared(self.name):
2232 debug(self.uuid, "not active")
2236 self.confobd.prepare()
2238 self.confobd.write_conf()
2241 if self.master != None:
2242 self.master.prepare()
2244 if not config.record:
2245 self.confobd.start()
2247 if not is_prepared('MDT'):
2248 lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
2250 if development_mode():
2251 # set lsd upcall path
2252 procentry = "/proc/fs/lustre/mds/lsd_upcall"
2253 upcall = os.path.abspath(os.path.dirname(sys.argv[0]) + "/lsd_upcall")
2254 if not (os.access(procentry, os.R_OK) and os.access(upcall, os.R_OK)):
2255 print "MDS Warning: failed to set lsd cache upcall"
2257 run("echo ", upcall, " > ", procentry)
2258 # set lacl upcall path
2259 procentry = "/proc/fs/lustre/mds/lacl_upcall"
2260 upcall = os.path.abspath(os.path.dirname(sys.argv[0]) + "/lacl_upcall")
2261 if not (os.access(procentry, os.R_OK) and os.access(upcall, os.R_OK)):
2262 print "MDS Warning: failed to set remote acl upcall"
2264 run("echo ", upcall, " > ", procentry)
# Command-line root_squash settings default to the per-device values.
2266 if config.root_squash == None:
2267 config.root_squash = self.root_squash
2268 if config.no_root_squash == None:
2269 config.no_root_squash = self.no_root_squash
2270 if config.root_squash:
2271 if config.no_root_squash:
2272 nsnid = config.no_root_squash
2275 lctl.root_squash(self.name, config.root_squash, nsnid)
# True while any 'mds' device is still listed by lctl.
# NOTE(review): "msd" is presumably a typo for "mds", but the name is
# used consistently below, so it is left as-is.
2277 def msd_remaining(self):
2278 out = lctl.device_list()
2280 if string.split(s)[2] in ('mds',):
2283 def safe_to_clean(self):
2286 def safe_to_clean_modules(self):
2287 return not self.msd_remaining()
# cleanup() (def header missing): best-effort teardown of the MDS device,
# the LMV master, the shared MDT, and the confobd.
2291 debug(self.uuid, "not active")
2294 if is_prepared(self.name):
2296 lctl.cleanup(self.name, self.uuid, config.force,
2298 except CommandError, e:
2299 log(self.module_name, "cleanup failed: ", self.name)
2302 Module.cleanup(self)
2304 if self.master != None:
2305 self.master.cleanup()
2306 if not self.msd_remaining() and is_prepared('MDT'):
2308 lctl.cleanup("MDT", "MDT_UUID", config.force,
2310 except CommandError, e:
2311 print "cleanup failed: ", self.name
2316 self.confobd.cleanup()
2318 def correct_level(self, level, op=None):
2319 #if self.master != None:
# OSD: an OST target device (the class header line is missing from this
# excerpt). Like MDSDEV it delegates to an embedded CONFDEV, except for
# the in-memory 'obdecho' osdtype which needs no backing device.
2324 def __init__(self, db):
2325 Module.__init__(self, 'OSD', db)
2326 self.osdtype = self.db.get_val('osdtype')
2327 self.devpath = self.db.get_val('devpath', '')
2328 self.backdevpath = self.db.get_val('devpath', '')
2329 self.size = self.db.get_val_int('devsize', 0)
2330 self.journal_size = self.db.get_val_int('journalsize', 0)
2331 self.inode_size = self.db.get_val_int('inodesize', 0)
2332 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
2333 self.mountfsoptions = self.db.get_val('mountfsoptions', '')
2334 self.fstype = self.db.get_val('fstype', '')
2335 self.backfstype = self.db.get_val('backfstype', '')
2336 self.nspath = self.db.get_val('nspath', '')
2337 target_uuid = self.db.get_first_ref('target')
2338 ost = self.db.lookup(target_uuid)
2339 self.name = ost.getName()
2340 self.format = self.db.get_val('autoformat', 'yes')
2341 if ost.get_val('failover', 0):
2342 self.failover_ost = 'f'
2344 self.failover_ost = 'n'
2346 self.deny_sec = self.db.get_val('deny_sec', '')
2348 if config.ost_deny_sec:
2350 self.deny_sec = "%s,%s" %(self.deny_sec, config.ost_deny_sec)
2352 self.deny_sec = config.ost_deny_sec
2354 active_uuid = get_active_target(ost)
2356 panic("No target device found:", target_uuid)
2357 if active_uuid == self.uuid:
2359 group = ost.get_val('group')
2360 if config.group and config.group != group:
2365 self.uuid = target_uuid
2366 self.confobd = CONFDEV(self.db, self.name,
2367 target_uuid, self.uuid)
2369 def add_module(self, manager):
2372 manager.add_lustre_module('ost', 'ost')
2374 if self.fstype == 'smfs' or self.fstype == 'ldiskfs':
2375 manager.add_lustre_module(self.fstype, self.fstype)
2378 manager.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
2380 if self.fstype == 'smfs':
2381 manager.add_lustre_module(self.backfstype, self.backfstype)
2382 manager.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.backfstype))
# NOTE(review): this iterates the raw option STRING character by
# character, so `option == 'snap'` can never be true; the MDSDEV
# counterpart (line 2196) uses string.split(self.mountfsoptions, ',').
# Looks like a genuine defect — confirm against upstream before fixing.
2384 for option in self.mountfsoptions:
2385 if option == 'snap':
2386 if not self.fstype == 'smfs':
2387 panic("mountoptions with snap, but fstype is not smfs\n")
2388 manager.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.fstype))
2389 manager.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.backfstype))
2391 manager.add_lustre_module(self.osdtype, self.osdtype)
2393 # add CONFDEV modules
2394 if self.confobd != None:
2395 self.confobd.add_module(manager)
# prepare() (def header missing): obdecho gets a bare device; real OSTs
# go through the confobd, then apply deny_sec flavors.
2398 if is_prepared(self.name):
2401 debug(self.uuid, "not active")
2406 if self.osdtype == 'obdecho':
2407 self.info(self.osdtype)
2408 lctl.newdev("obdecho", self.name, self.uuid)
2409 if not is_prepared('OSS'):
2410 lctl.newdev("ost", 'OSS', 'OSS_UUID', setup="")
2412 self.confobd.prepare()
2414 self.confobd.write_conf()
2415 if not config.record:
2416 self.confobd.start()
2419 for flavor in string.split(self.deny_sec, ','):
2420 lctl.set_security(self.name, "deny_sec", flavor)
2422 def write_conf(self):
2423 if is_prepared(self.name):
2426 debug(self.uuid, "not active")
2430 if self.osdtype != 'obdecho':
2431 self.confobd.prepare()
2432 self.confobd.write_conf()
2433 if not config.write_conf:
2434 self.confobd.start()
2435 self.confobd.cleanup()
# True while any obdfilter/obdecho device is still listed by lctl.
2437 def osd_remaining(self):
2438 out = lctl.device_list()
2440 if string.split(s)[2] in ('obdfilter', 'obdecho'):
2443 def safe_to_clean(self):
2446 def safe_to_clean_modules(self):
2447 return not self.osd_remaining()
# cleanup() (def header missing): best-effort teardown of the OST device,
# the shared OSS, and the confobd.
2451 debug(self.uuid, "not active")
2454 if is_prepared(self.name):
2457 lctl.cleanup(self.name, self.uuid, config.force,
2459 except CommandError, e:
2460 log(self.module_name, "cleanup failed: ", self.name)
2463 if not self.osd_remaining() and is_prepared('OSS'):
2465 lctl.cleanup("OSS", "OSS_UUID", config.force,
2467 except CommandError, e:
2468 print "cleanup failed: ", self.name
2472 if self.osdtype != 'obdecho':
2474 self.confobd.cleanup()
2476 def correct_level(self, level, op=None):
2479 # Generic client module, used by OSC and MDC
# NOTE(review): excerpt has missing interior lines; code kept byte-identical.
2480 class Client(Module):
2481 def __init__(self, tgtdb, uuid, module, fs_name,
2482 self_name=None, module_dir=None):
2483 self.target_name = tgtdb.getName()
2484 self.target_uuid = tgtdb.getUUID()
2485 self.module_dir = module_dir
2486 self.backup_targets = []
2487 self.module = module
2490 self.module = module
2491 self.module_name = string.upper(module)
2492 self.fs_name = fs_name
# Default device name encodes module, host, target, and fs; an explicit
# self_name overrides it.
2494 self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
2495 self.target_name, fs_name)
2497 self.name = self_name
2498 if not self.module_dir:
2499 self.module_dir = module
2501 self.tgt_dev_uuid = get_active_target(tgtdb)
2502 if not self.tgt_dev_uuid:
2503 panic("No target device found for target(1):", self.target_name)
2507 self.lookup_server(tgtdb, self.tgt_dev_uuid)
2508 self.lookup_backup_targets()
2510 def add_module(self, manager):
2511 manager.add_lustre_module(self.module_dir, self.module)
2513 def lookup_server(self, db, srv_uuid):
2514 """ Lookup a server's network information """
2515 self._server_nets = get_ost_net(db, srv_uuid)
2516 if len(self._server_nets) == 0:
2517 panic ("Unable to find a server for:", srv_uuid)
2522 def get_servers(self):
2523 return self._server_nets
2525 def lookup_backup_targets(self):
2526 """ Lookup alternative network information """
# Scan every profile for osd/mdsdev entries that serve the same target
# through a different device — those are failover candidates.
2527 prof_list = toplustreDB.get_refs('profile')
2528 for prof_uuid in prof_list:
2529 prof_db = toplustreDB.lookup(prof_uuid)
2531 panic("profile:", prof_uuid, "not found.")
2532 for ref_class, ref_uuid in prof_db.get_all_refs():
2533 if ref_class in ('osd', 'mdsdev'):
2534 devdb = toplustreDB.lookup(ref_uuid)
2535 uuid = devdb.get_first_ref('target')
2536 if self.target_uuid == uuid and self.tgt_dev_uuid != ref_uuid:
2537 self.backup_targets.append(ref_uuid)
# Connect to the target's server (adding routes when it is not local),
# create the client device, then register failover connections.
2539 def prepare(self, ignore_connect_failure = 0):
2540 self.info(self.target_uuid)
2541 if not config.record and is_prepared(self.name):
2544 srv = choose_local_server(self.get_servers())
2548 routes = find_route(self.get_servers())
2549 if len(routes) == 0:
2550 panic ("no route to", self.target_uuid)
2551 for (srv, r) in routes:
2552 lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
2553 except CommandError, e:
2554 if not ignore_connect_failure:
2558 if self.target_uuid in config.inactive and self.permits_inactive():
2559 debug("%s inactive" % self.target_uuid)
2560 inactive_p = "inactive"
2562 debug("%s active" % self.target_uuid)
2564 lctl.newdev(self.module, self.name, self.uuid,
2565 setup ="%s %s %s" % (self.target_uuid, srv.nid_uuid,
2567 for tgt_dev_uuid in self.backup_targets:
2568 this_nets = get_ost_net(toplustreDB, tgt_dev_uuid)
2569 if len(this_nets) == 0:
2570 panic ("Unable to find a server for:", tgt_dev_uuid)
2571 srv = choose_local_server(this_nets)
2575 routes = find_route(this_nets);
2576 if len(routes) == 0:
2577 panic("no route to", tgt_dev_uuid)
2578 for (srv, r) in routes:
# BUG(review): 'r[0]. srv.nid_uuid' — the '.' should be ','; compare the
# correct 4-argument call at line 2552. As written this raises
# AttributeError at runtime instead of passing r[0] and srv.nid_uuid.
2579 lctl.add_route_host(r[0]. srv.nid_uuid, r[1], r[3])
2581 lctl.add_conn(self.name, srv.nid_uuid);
# cleanup() (def header missing): disconnect primary and backup servers
# and remove any routes; failures are logged, not fatal.
2584 if is_prepared(self.name):
2585 Module.cleanup(self)
2587 srv = choose_local_server(self.get_servers())
2589 lctl.disconnect(srv)
2591 for (srv, r) in find_route(self.get_servers()):
2592 lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
2593 except CommandError, e:
2594 log(self.module_name, "cleanup failed: ", self.name)
2598 for tgt_dev_uuid in self.backup_targets:
2599 this_net = get_ost_net(toplustreDB, tgt_dev_uuid)
2600 srv = choose_local_server(this_net)
2602 lctl.disconnect(srv)
2604 for (srv, r) in find_route(this_net):
# BUG(review): same '.' for ',' defect as line 2579; compare line 2592.
2605 lctl.del_route_host(r[0]. srv.nid_uuid, r[1], r[3])
2607 def correct_level(self, level, op=None):
2610 def deactivate(self):
2612 lctl.deactivate(self.name)
2613 except CommandError, e:
2614 log(self.module_name, "deactivate failed: ", self.name)
# GKC: Client subclass for the 'gkc' (security key client) module
# (the class header line is missing from this excerpt).
2619 def __init__(self, db, uuid, fs_name):
2620 Client.__init__(self, db, uuid, 'gkc', fs_name)
2622 def permits_inactive(self):
# MDC: Client subclass for the 'mdc' (metadata client) module
# (the class header line is missing from this excerpt).
2626 def __init__(self, db, uuid, fs_name):
2627 Client.__init__(self, db, uuid, 'mdc', fs_name)
2629 def permits_inactive(self):
# OSC: Client subclass for the 'osc' (object storage client) module
# (the class header line is missing from this excerpt).
2633 def __init__(self, db, uuid, fs_name):
2634 Client.__init__(self, db, uuid, 'osc', fs_name)
2636 def permits_inactive(self):
# CMOBD: cache-miss OBD pairing a master device (lov/ost/mds/lmv) with a
# cache device (ost/mds).
# NOTE(review): excerpt has missing interior lines; code kept byte-identical.
2639 class CMOBD(Module):
2640 def __init__(self, db):
2641 Module.__init__(self, 'CMOBD', db)
2642 self.name = self.db.getName();
2643 self.uuid = generate_client_uuid(self.name)
2644 self.master_uuid = self.db.get_first_ref('masterobd')
2645 self.cache_uuid = self.db.get_first_ref('cacheobd')
2647 master_obd = self.db.lookup(self.master_uuid)
2649 panic('master obd not found:', self.master_uuid)
2651 cache_obd = self.db.lookup(self.cache_uuid)
2653 panic('cache obd not found:', self.cache_uuid)
2658 master_class = master_obd.get_class()
2659 cache_class = cache_obd.get_class()
2661 if master_class == 'lov':
2662 client_uuid = "%s_lov_master_UUID" % (self.name)
2663 self.master = LOV(master_obd, client_uuid, self.name,
2664 "master_%s" % (self.name));
2665 elif master_class == 'ost':
2666 client_uuid = "%s_ost_master_UUID" % (self.name)
# NOTE(review): elsewhere get_osc is called as get_osc(obd, uuid, fs_name)
# (line 1629); here master_uuid is passed in the fs_name position —
# verify the intended argument order against get_osc's definition.
2667 self.master = get_osc(master_obd, client_uuid, self.master_uuid)
2668 elif master_class == 'mds':
2669 client_uuid = "%s_mds_master_UUID" % (self.name)
2670 self.master = get_mdc(master_obd, self.master_uuid, client_uuid)
2671 elif master_class == 'lmv':
2672 client_uuid = "%s_lmv_master_UUID" % (self.name)
2673 self.master = LMV(master_obd, client_uuid, self.name,
2674 "master_%s" % (self.name));
2676 panic("unknown master obd class '%s'" %(master_class))
2678 if cache_class == 'ost':
2679 self.cache = get_osc(cache_obd, cache_obd.getUUID(),
2681 elif cache_class == 'mds':
2682 self.cache = get_mdc(cache_obd, self.cache_uuid,
2683 cache_obd.getUUID())
2685 panic("invalid cache obd class '%s'" %(cache_class))
# prepare() (def header missing): bring up the master side, then create
# the cmobd device binding master and cache.
2688 if not config.record and is_prepared(self.name):
2690 self.info(self.master_uuid, self.cache_uuid)
2691 self.master.prepare()
2692 lctl.newdev("cmobd", self.name, self.uuid,
2693 setup ="%s %s" %(self.master.uuid,
2702 def get_master_name(self):
2703 return self.master.name
2705 def get_cache_name(self):
2706 return self.cache.name
# cleanup() (def header missing).
2709 if is_prepared(self.name):
2710 Module.cleanup(self)
2712 self.master.cleanup()
2714 def add_module(self, manager):
2715 manager.add_lustre_module('smfs', 'smfs')
2716 manager.add_lustre_module('cmobd', 'cmobd')
2717 self.master.add_module(manager)
2719 def correct_level(self, level, op=None):
# COBD: caching OBD pairing a master device with a cache device, each of
# which may be an ost/lov, mds, or lmv (the class header line is missing
# from this excerpt; interior lines are also missing).
2723 def __init__(self, db, uuid, name):
2724 Module.__init__(self, 'COBD', db)
2725 self.name = self.db.getName();
2726 self.uuid = generate_client_uuid(self.name)
2727 self.master_uuid = self.db.get_first_ref('masterobd')
2728 self.cache_uuid = self.db.get_first_ref('cacheobd')
2730 master_obd = self.db.lookup(self.master_uuid)
2732 panic('master obd not found:', self.master_uuid)
2734 cache_obd = self.db.lookup(self.cache_uuid)
2736 panic('cache obd not found:', self.cache_uuid)
2741 master_class = master_obd.get_class()
2742 cache_class = cache_obd.get_class()
2744 if master_class == 'ost' or master_class == 'lov':
2745 client_uuid = "%s_lov_master_UUID" % (self.name)
2746 self.master = LOV(master_obd, client_uuid, name,
2747 "master_%s" % (self.name));
2748 elif master_class == 'mds':
2749 self.master = get_mdc(db, self.master_uuid, name)
2750 elif master_class == 'lmv':
2751 client_uuid = "%s_lmv_master_UUID" % (self.name)
2752 self.master = LMV(master_obd, client_uuid, self.name,
2753 "master_%s" % (self.name));
2755 panic("unknown master obd class '%s'" %(master_class))
2757 if cache_class == 'ost' or cache_class == 'lov':
2758 client_uuid = "%s_lov_cache_UUID" % (self.name)
2759 self.cache = LOV(cache_obd, client_uuid, name,
2760 "cache_%s" % (self.name));
2761 elif cache_class == 'mds':
2762 self.cache = get_mdc(db, self.cache_uuid, name)
2763 elif cache_class == 'lmv':
2764 client_uuid = "%s_lmv_cache_UUID" % (self.name)
2765 self.cache = LMV(cache_obd, client_uuid, self.name,
2766 "cache_%s" % (self.name));
2768 panic("unknown cache obd class '%s'" %(cache_class))
2776 def get_master_name(self):
2777 return self.master.name
2779 def get_cache_name(self):
2780 return self.cache.name
# prepare() (def header missing): bring up both sides, then create the
# cobd device binding them by name.
2783 if not config.record and is_prepared(self.name):
2785 self.master.prepare()
2786 self.cache.prepare()
2787 self.info(self.master_uuid, self.cache_uuid)
2788 lctl.newdev("cobd", self.name, self.uuid,
2789 setup ="%s %s" %(self.master.name,
# cleanup() (def header missing) tears down both master and cache.
2793 if is_prepared(self.name):
2794 Module.cleanup(self)
2795 self.master.cleanup()
2796 self.cache.cleanup()
# NOTE(review): only the master's modules are added here even though
# cleanup handles the cache too — presumably the cache side's modules are
# loaded elsewhere; confirm before changing.
2798 def add_module(self, manager):
2799 manager.add_lustre_module('cobd', 'cobd')
2800 self.master.add_module(manager)
# NOTE(review): truncated extract; comments only.  The "class VOSC(Module):"
# header (~2803) is missing from this view.  VOSC wraps whichever
# object-storage client class matches the db entry (LOV, COBD or OSC)
# and forwards everything to it.
2802 # virtual interface for OSC and LOV
2804 def __init__(self, db, client_uuid, name, name_override = None):
2805 Module.__init__(self, 'VOSC', db)
2806 if db.get_class() == 'lov':
2807 self.osc = LOV(db, client_uuid, name, name_override)
2809 elif db.get_class() == 'cobd':
2810 self.osc = COBD(db, client_uuid, name)
# 2813 looks like the body of a missing "else:" branch
2813 self.osc = OSC(db, client_uuid, name)
# bodies of the (missing) get_uuid / get_name defs -- delegation getters
2817 return self.osc.get_uuid()
2820 return self.osc.get_name()
def add_module(self, manager):
    """Delegate module registration to the wrapped OSC/LOV/COBD client."""
    backend = self.osc
    backend.add_module(manager)
def correct_level(self, level, op=None):
    """Delegate level correction to the wrapped OSC/LOV/COBD client."""
    backend = self.osc
    return backend.correct_level(level, op)
# NOTE(review): truncated extract; comments only.  The "class VMDC(Module):"
# header (~2835) is missing from this view.  VMDC wraps whichever metadata
# client class matches the db entry (LMV, COBD or MDC) and forwards to it.
2834 # virtual interface for MDC and LMV
2836 def __init__(self, db, client_uuid, name, name_override = None):
2837 Module.__init__(self, 'VMDC', db)
2838 if db.get_class() == 'lmv':
2839 self.mdc = LMV(db, client_uuid, name, name_override)
2840 elif db.get_class() == 'cobd':
2841 self.mdc = COBD(db, client_uuid, name)
# 2843 looks like the body of a missing "else:" branch
2843 self.mdc = MDC(db, client_uuid, name)
# bodies of the (missing) get_uuid / get_name defs -- delegation getters
2846 return self.mdc.uuid
2849 return self.mdc.name
def add_module(self, manager):
    """Delegate module registration to the wrapped MDC/LMV/COBD client."""
    backend = self.mdc
    backend.add_module(manager)
def correct_level(self, level, op=None):
    """Delegate level correction to the wrapped MDC/LMV/COBD client."""
    backend = self.mdc
    return backend.correct_level(level, op)
# NOTE(review): truncated extract; comments only.  ECHO_CLIENT wraps the
# obdecho test client; it attaches a VOSC stack to the referenced obd.
2863 class ECHO_CLIENT(Module):
2864 def __init__(self,db):
2865 Module.__init__(self, 'ECHO_CLIENT', db)
2866 self.obd_uuid = self.db.get_first_ref('obd')
2867 obd = self.db.lookup(self.obd_uuid)
2868 self.uuid = generate_client_uuid(self.name)
2869 self.osc = VOSC(obd, self.uuid, self.name)
# prepare (its "def" at ~2871 is missing): set up the OSC stack then the
# echo_client device
2872 if not config.record and is_prepared(self.name):
2875 self.osc.prepare() # XXX This is so cheating. -p
2876 self.info(self.obd_uuid)
2878 lctl.newdev("echo_client", self.name, self.uuid,
2879 setup = self.osc.get_name())
# cleanup (its "def" at ~2881 is missing); an "self.osc.cleanup()" call
# (~2884-2885) appears to be missing
2882 if is_prepared(self.name):
2883 Module.cleanup(self)
def add_module(self, manager):
    """Register the echo client's modules: the wrapped OSC stack's
    modules first, then the obdecho kernel module itself."""
    stack = self.osc
    stack.add_module(manager)
    manager.add_lustre_module('obdecho', 'obdecho')
# NOTE(review): truncated extract; comments only.
# ECHO_CLIENT.correct_level -- its body (~2891) is missing here.
2890 def correct_level(self, level, op=None):
# Build a pseudo-random client UUID seeded with up to 19 chars of the
# given name; a "name," argument line (~2895) appears to be missing
# between the first random component and the next.
2893 def generate_client_uuid(name):
2894 client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
2896 int(random.random() * 1048576),
2897 int(random.random() * 1048576))
# clamp to the 36-character UUID field size
2898 return client_uuid[:36]
# NOTE(review): truncated extract (numbering gaps = missing lines);
# comments only.  Mountpoint drives the client-side "mount -t lustre_lite"
# setup: it builds the VOSC/VMDC client stacks for the filesystem and
# mounts/unmounts the path.
2900 class Mountpoint(Module):
2901 def __init__(self, db):
2902 Module.__init__(self, 'MTPT', db)
2903 self.path = self.db.get_val('path')
2904 self.clientoptions = self.db.get_val('clientoptions', '')
2905 self.fs_uuid = self.db.get_first_ref('filesystem')
2906 fs = self.db.lookup(self.fs_uuid)
# prefer an LMV reference; fall back to a plain MDS
2907 self.mds_uuid = fs.get_first_ref('lmv')
2908 if not self.mds_uuid:
2909 self.mds_uuid = fs.get_first_ref('mds')
2910 self.obd_uuid = fs.get_first_ref('obd')
2911 self.gks_uuid = fs.get_first_ref('gks')
2912 client_uuid = generate_client_uuid(self.name)
# security flavors: node-config defaults, overridable from the command
# line; the surrounding if/else lines are missing, hence the repetition
2914 self.oss_sec = self.db.get_val('oss_sec','null')
2915 self.mds_sec = self.db.get_val('mds_sec','null')
2917 self.mds_sec = config.mds_sec
2919 self.oss_sec = config.oss_sec
2921 self.oss_sec = self.db.get_val('oss_sec','null')
2922 self.mds_sec = self.db.get_val('mds_sec','null')
2924 self.mds_sec = config.mds_sec
2926 self.oss_sec = config.oss_sec
# panic() guards ("if not ost:" / "if not mds:") appear to be missing
2928 ost = self.db.lookup(self.obd_uuid)
2930 panic("no ost: ", self.obd_uuid)
2932 mds = self.db.lookup(self.mds_uuid)
2934 panic("no mds: ", self.mds_uuid)
2936 self.vosc = VOSC(ost, client_uuid, self.name, self.name)
2937 self.vmdc = VMDC(mds, client_uuid, self.name, self.name)
# GSS key client, presumably only when gks_uuid is set -- TODO confirm
2940 self.gkc = get_gkc(db, client_uuid, self.name, self.gks_uuid)
# prepare (its "def" line is missing): set up the stacks and run mount
2943 if not config.record and fs_is_mounted(self.path):
2944 log(self.path, "already mounted.")
2953 self.info(self.path, self.mds_uuid, self.obd_uuid)
2954 if config.record or config.lctl_dump:
2956 lctl.mount_option(local_node_name, self.vosc.get_name(),
2957 self.vmdc.get_name(), self.gkc.get_name())
2959 lctl.mount_option(local_node_name, self.vosc.get_name(),
2960 self.vmdc.get_name(), "")
# merge --clientoptions into the config-file options
2963 if config.clientoptions:
2964 if self.clientoptions:
2965 self.clientoptions = self.clientoptions + ',' + config.clientoptions
2967 self.clientoptions = config.clientoptions
2968 if self.clientoptions:
2969 self.clientoptions = ',' + self.clientoptions
2970 # Linux kernel will deal with async and not pass it to ll_fill_super,
2971 # so replace it with Lustre async
2972 self.clientoptions = string.replace(self.clientoptions, "async", "lasync")
2975 gkc_name = self.gkc.get_name();
2978 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s,gkc=%s,mds_sec=%s,oss_sec=%s%s %s %s" % \
2979 (self.vosc.get_name(), self.vmdc.get_name(), gkc_name, self.mds_sec,
2980 self.oss_sec, self.clientoptions, config.config, self.path)
2981 log("mount -t lustre_lite -o osc=%s,mdc=%s,gkc=%s,mds_sec=%s,oss_sec=%s%s %s %s" % \
2982 (self.vosc.get_name(), self.vmdc.get_name(), gkc_name, self.mds_sec,
2983 self.oss_sec, self.clientoptions, config.config, self.path))
2984 run("mkdir", self.path)
# the run(cmd) call and its return-code check (~2985-2988) are missing
2989 panic("mount failed:", self.path, ":", string.join(val))
# cleanup (its "def" line is missing): umount, forcing with --force
2992 self.info(self.path, self.mds_uuid,self.obd_uuid)
2994 if config.record or config.lctl_dump:
2995 lctl.del_mount_option(local_node_name)
2997 if fs_is_mounted(self.path):
2999 (rc, out) = run("umount", "-f", self.path)
3001 (rc, out) = run("umount", self.path)
3003 raise CommandError('umount', out, rc)
3005 if fs_is_mounted(self.path):
3006 panic("fs is still mounted:", self.path)
3013 def add_module(self, manager):
3014 self.vosc.add_module(manager)
3015 self.vmdc.add_module(manager)
3016 manager.add_lustre_module('llite', 'llite')
# gkc module, presumably only when gks_uuid is set -- TODO confirm
3018 manager.add_lustre_module('sec/gks', 'gkc')
# correct_level body (~3021) is missing
3020 def correct_level(self, level, op=None):
# NOTE(review): truncated extract; comments only.
3023 # ============================================================
3024 # misc query functions
# Return the list of Network objects for the node hosting the given osd.
# The "if not node:" guard for the panic below appears to be missing.
3026 def get_ost_net(self, osd_uuid):
3030 osd = self.lookup(osd_uuid)
3031 node_uuid = osd.get_first_ref('node')
3032 node = self.lookup(node_uuid)
3034 panic("unable to find node for osd_uuid:", osd_uuid,
3035 " node_ref:", node_uuid)
3036 for net_uuid in node.get_networks():
3037 db = node.lookup(net_uuid)
3038 srv_list.append(Network(db))
# the order of initialization is based on level.
# Map a service class to its startup level; the "ret = NN" lines for each
# branch are missing from this extract.  NOTE(review): 'cobd' appears in
# two branches (with 'lmv' and with 'cmobd') -- the second can never
# match; looks like a latent duplicate, verify against upstream.
3041 # the order of iniitailization is based on level.
3042 def getServiceLevel(self):
3043 type = self.get_class()
3045 if type in ('network',):
3047 elif type in ('routetbl',):
3049 elif type in ('ldlm',):
3051 elif type in ('osd',):
3053 elif type in ('mdsdev',):
3055 elif type in ('lmv', 'cobd',):
3057 elif type in ('gkd',):
3059 elif type in ('cmobd', 'cobd',):
3061 elif type in ('mountpoint', 'echoclient'):
3064 panic("Unknown type: ", type)
# clamp to the --minlevel/--maxlevel window (the "ret = 0" body is missing)
3066 if ret < config.minlevel or ret > config.maxlevel:
3071 # return list of services in a profile. list is a list of tuples
3072 # [(level, db_object),]
3073 def getServices(self):
3075 for ref_class, ref_uuid in self.get_all_refs():
3076 servdb = self.lookup(ref_uuid)
# presumably "if servdb:" guarded; the else branch hits the panic below
3078 level = getServiceLevel(servdb)
3080 list.append((level, servdb))
3082 panic('service not found: ' + ref_uuid)
# NOTE(review): truncated extract; comments only.  The "return" lines and
# "if not ...:" guards of these three factory helpers are missing.
3088 ############################################################
3090 # FIXME: clean this mess up!
3092 # OSC is no longer in the xml, so we have to fake it.
3093 # this is getting ugly and begging for another refactoring
3094 def get_osc(db, ost_uuid, fs_name):
3095 osc = OSC(db, ost_uuid, fs_name)
3098 def get_mdc(db, mdt_uuid, fs_name):
3099 mdt_db = db.lookup(mdt_uuid);
3101 error("no mdt:", mdt_uuid)
3102 mdc = MDC(mdt_db, mdt_uuid, fs_name)
3105 def get_gkc(db, uuid, fs_name, gks_uuid):
3106 gks_db = db.lookup(gks_uuid);
3108 error("no gks:", gks_uuid)
3109 gkc = GKC(gks_db, uuid, fs_name)
# NOTE(review): truncated extract; comments only.
3112 ############################################################
3113 # routing ("rooting")
3115 # list of (nettype, cluster_id, nid)
# Populate the global local_clusters list from this node's networks;
# "srv" is presumably bound by a missing "srv = Network(net)" line
# (~3122) -- TODO confirm.  Also registers a socket acceptor per port.
3118 def find_local_clusters(node_db):
3119 global local_clusters
3120 for netuuid in node_db.get_networks():
3121 net = node_db.lookup(netuuid)
3123 debug("add_local", netuuid)
3124 local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
3126 if not acceptors.has_key(srv.port):
3127 acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type)
3129 # This node is a gateway.
# body (~3132, presumably "return is_router") is missing
3131 def node_is_router():
3134 # If there are any routers found in the config, then this will be true
3135 # and all nodes will load kptlrouter.
def node_needs_router():
    """True when kptlrouter must be loaded: a router exists somewhere in
    the config (needs_router) or this node itself routes (is_router)."""
    if needs_router:
        return needs_router
    return is_router
# NOTE(review): truncated extract; comments only.  The docstring's closing
# quotes (~3149), the loop header binding "router" (~3153) and the
# "needs_router = 1" line are among the missing lines.
3140 # list of (nettype, gw, tgt_cluster_id, lo, hi)
3141 # Currently, these local routes are only added to kptlrouter route
3142 # table if they are needed to connect to a specific server. This
3143 # should be changed so all available routes are loaded, and the
3144 # ptlrouter can make all the decisions.
3147 def find_local_routes(lustre):
3148 """ Scan the lustre config looking for routers . Build list of
3150 global local_routes, needs_router
3152 list = lustre.lookup_class('node')
3154 if router.get_val_int('router', 0):
3156 for (local_type, local_cluster_id, local_nid) in local_clusters:
3158 for netuuid in router.get_networks():
3159 db = router.lookup(netuuid)
# a router network matches when it shares our nettype and cluster id
3160 if (local_type == db.get_val('nettype') and
3161 local_cluster_id == db.get_val('clusterid')):
3162 gw = db.get_val('nid')
3165 debug("find_local_routes: gw is", gw)
3166 for route in router.get_local_routes(local_type, gw):
3167 local_routes.append(route)
3168 debug("find_local_routes:", local_routes)
# NOTE(review): truncated extract; comments only.  Most "return" lines of
# these small query helpers are missing.
# Pick the first server reachable on one of our local clusters.
3171 def choose_local_server(srv_list):
3172 for srv in srv_list:
3173 if local_cluster(srv.net_type, srv.cluster_id):
# Membership test against the global local_clusters list.
3176 def local_cluster(net_type, cluster_id):
3177 for cluster in local_clusters:
3178 if net_type == cluster[0] and cluster_id == cluster[1]:
# Exact (nettype, cluster, nid) membership test.
3182 def local_interface(net_type, cluster_id, nid):
3183 for cluster in local_clusters:
3184 if (net_type == cluster[0] and cluster_id == cluster[1]
3185 and nid == cluster[2]):
# Collect (server, route) pairs whose nid range covers the target;
# "to" is presumably bound by a missing "to = srv.nid" line (~3195).
3189 def find_route(srv_list):
3191 frm_type = local_clusters[0][0]
3192 for srv in srv_list:
3193 debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
3194 to_type = srv.net_type
3196 cluster_id = srv.cluster_id
3197 debug ('looking for route to', to_type, to)
3198 for r in local_routes:
3199 debug("find_route: ", r)
# r = (nettype, gw, tgt_cluster_id, lo, hi): match range and cluster
3200 if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
3201 result.append((srv, r))
# Resolve a failover target: --select override first, then the 'active' ref.
3204 def get_active_target(db):
3205 target_uuid = db.getUUID()
3206 target_name = db.getName()
3207 node_name = get_select(target_name)
3209 tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
3211 tgt_dev_uuid = db.get_first_ref('active')
# Find the Network object carrying the given nid uuid; "net" is
# presumably bound by a missing "net = Network(n)" line (~3216).
3214 def get_server_by_nid_uuid(db, nid_uuid):
3215 for n in db.lookup_class("network"):
3217 if net.nid_uuid == nid_uuid:
# NOTE(review): truncated extract; comments only.  This is the body of
# newService(db) -- its "def" line (~3224) is missing.  It maps a config
# db class to the matching Module subclass instance; most constructor
# lines for the branches are also missing.
3221 ############################################################
3225 type = db.get_class()
3226 debug('Service:', type, db.getName(), db.getUUID())
3231 n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
3232 elif type == 'network':
3234 elif type == 'routetbl':
3238 elif type == 'cobd':
3239 n = COBD(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
3240 elif type == 'cmobd':
3242 elif type == 'mdsdev':
3244 elif type == 'mountpoint':
3246 elif type == 'echoclient':
3253 panic ("unknown service type:", type)
# NOTE(review): truncated extract; comments only.
3257 # Prepare the system to run lustre using a particular profile
3258 # in the configuration.
3259 # * load the modules
3260 # * setup networking for the current node
3261 # * make sure partitions are in place and prepared
3262 # * initialize devices with lctl
3263 # Levels is important, and needs to be enforced.
# Run `operation` over every service of every profile; the call
# "operation(services)" line is missing from this extract.
3264 def for_each_profile(db, prof_list, operation):
3265 for prof_uuid in prof_list:
3266 prof_db = db.lookup(prof_uuid)
3268 panic("profile:", prof_uuid, "not found.")
3269 services = getServices(prof_db)
# Walk the XML: find the filesystem that references `uuid` via `tag`,
# then the mountpoint that references that filesystem, and return its
# name (the return/break lines are missing here).
3272 def get_fs_name(db, rec, tag, uuid):
3273 # FIXME: better way to find the mountpoint?
3274 filesystems = db.root_node.getElementsByTagName('filesystem')
3276 for fs in filesystems:
3277 ref = fs.getElementsByTagName(tag)
3278 if ref[0].getAttribute('uuidref') == uuid:
3279 fsuuid = fs.getAttribute('uuid')
3283 panic("malformed xml: uuid '" + uuid + "' referenced in '" + \
3284 rec.nodeName + "' record is not used by any filesystems.")
3286 mtpts = db.root_node.getElementsByTagName('mountpoint')
# NOTE(review): loop variable here looks like it should iterate mtpts,
# not reuse "fs" -- the loop header line (~3288) is missing; verify.
3289 ref = fs.getElementsByTagName('filesystem_ref')
3290 if ref[0].getAttribute('uuidref') == fsuuid:
3291 fs_name = fs.getAttribute('name')
3295 panic("malformed xml: '" + rec.nodeName + \
3296 "' record references uuid '" + uuid + \
3297 "', which references filesystem uuid '" + fsuuid + \
3298 "', which does not reference a mountpoint.")
# NOTE(review): truncated extract; comments only.
# Resolve (lov_name, lov_uuid, osc) for an update record, either from a
# live LOV object or from the record's own attributes.
3302 def magic_get_osc(db, rec, lov):
3304 lov_uuid = lov.get_uuid()
3305 fs_name = lov.osc.fs_name
3306 lov_name = lov.osc.name
# else-branch: no live LOV, recover names from the record's attributes
3308 lov_uuid = rec.getAttribute('lov_uuidref')
3309 fs_name = get_fs_name(db, rec, 'obd_ref', lov_uuid)
3310 lov_name = "lov_" + fs_name
3312 print "lov_uuid: " + lov_uuid + "; lov_name: " + lov_name
3314 ost_uuid = rec.getAttribute('ost_uuidref')
3316 if rec.nodeName == 'lov_delete':
3318 # Use the update as a subtree in case a new OST is created with the
3319 # same name as the one that we deleted or other info about the OSS
3320 # has changed since the delete.
3321 # XXX - Not sure if this is the way this is supposed to be done.
3323 info = rec.parentNode.getElementsByTagName('info')
3325 print "delete record missing info !"
3326 tgtdb = Lustre.LustreDB_XML(info[0], info[0])
3330 obd = tgtdb.lookup(ost_uuid)
3332 panic("malformed xml: '" + rec.nodeName + \
3333 "' record references ost uuid '" + ost_uuid + \
3334 "' which cannot be found.")
3335 osc = get_osc(obd, lov_uuid, fs_name)
# NOTE(review): panic argument "obd_uuid" is not defined in the visible
# lines (ost_uuid would be expected) -- verify against upstream
3337 panic('osc not found:', obd_uuid)
3338 return lov_name, lov_uuid, osc
# Resolve (lmv_name, lmv_uuid, mdc) the same way for metadata records.
3340 def magic_get_mdc(db, rec, lmv):
3342 lmv_uuid = lmv.mdc.uuid
3343 fs_name = lmv.mdc.fs_name
3344 lmv_name = lmv.mdc.name
3346 lmv_uuid = rec.getAttribute('lmv_uuidref')
3347 fs_name = get_fs_name(db, rec, 'mds_ref', lmv_uuid)
3348 lmv_name = "lmv_" + fs_name
3350 mdt_uuid = rec.getAttribute('mdt_uuidref')
3352 mds = db.lookup(mdt_uuid)
3355 panic("MDS not found!")
3357 mdc = MDC(mds, lmv_uuid, fs_name)
3359 panic('mdc not found:', mdt_uuid)
3360 return lmv_name, lmv_uuid, mdc
# NOTE(review): truncated extract; comments only.
3362 # write logs for update records. sadly, logs of all types -- and updates in
3363 # particular -- are something of an afterthought. lconf needs rewritten with
3364 # these as core concepts. so this is a pretty big hack.
# Replay one <update> element: lmv_add / lov_add / lov_deactivate /
# lov_delete records are turned into lctl commands.
3365 def process_update_record(db, update, lmv, lov):
3366 for rec in update.childNodes:
3367 if rec.nodeType != rec.ELEMENT_NODE:
3370 if rec.nodeName == 'info':
3373 log("found " + rec.nodeName + " record in update version " +
3374 str(update.getAttribute('version')))
3376 if rec.nodeName == 'lmv_add':
3377 lmv_uuid = rec.getAttribute('lmv_uuidref')
3378 mdt_uuid = rec.getAttribute('mdt_uuidref')
3379 if not lmv_uuid or not mdt_uuid:
3380 panic("malformed xml: '" + rec.nodeName + \
3381 "' record requires lmv_uuid and mdt_uuid.")
3383 lmv_name, lmv_uuid, mdc = magic_get_mdc(db, rec, lmv)
3386 # Only ignore connect failures with --force, which
3387 # isn't implemented here yet.
3388 mdc.prepare(ignore_connect_failure=0)
# NOTE(review): this is the MDC branch but the message formats
# "osc.uuid" -- likely should be mdc.uuid; verify against upstream
3389 except CommandError, e:
3390 print "Error preparing MDC %s\n" % osc.uuid
3393 lctl.lmv_add_mdc(lmv_name, mdt_uuid)
3396 if rec.nodeName != 'lov_add' and rec.nodeName != 'lov_delete' and \
3397 rec.nodeName != 'lov_deactivate':
3398 panic("unrecognized update record type '" + rec.nodeName + "'.")
3400 lov_uuid = rec.getAttribute('lov_uuidref')
3401 ost_uuid = rec.getAttribute('ost_uuidref')
3402 index = rec.getAttribute('index')
3403 gen = rec.getAttribute('generation')
3405 if not lov_uuid or not ost_uuid or not index or not gen:
3406 panic("malformed xml: '" + rec.nodeName + "' record requires lov_uuid, ost_uuid, index, and generation.")
3408 lov_name, lov_uuid, osc = magic_get_osc(db, rec, lov)
3410 # ------------------------------------------------------------- add
3411 if rec.nodeName == 'lov_add':
3413 # Only ignore connect failures with --force, which
3414 # isn't implemented here yet.
3415 osc.prepare(ignore_connect_failure=0)
3416 except CommandError, e:
3417 print "Error preparing OSC %s\n" % osc.uuid
3420 lctl.lov_add_osc(lov_name, ost_uuid, index, gen)
3422 # ------------------------------------------------------ deactivate
3423 elif rec.nodeName == 'lov_deactivate':
3426 except CommandError, e:
3427 print "Error deactivating OSC %s\n" % osc.uuid
3430 # ---------------------------------------------------------- delete
3431 elif rec.nodeName == 'lov_delete':
3432 lctl.lov_del_osc(lov_name, ost_uuid, index, gen)
3438 except CommandError, e:
3439 print "Error cleaning up OSC %s\n" % osc.uuid
# NOTE(review): truncated extract; comments only.
# Replay every non-empty <update> element into its own versioned config
# log ("<log_name>-<version>") on the record device; only runs under
# --write_conf or --record.  The lctl.end_record() call is missing here.
3442 def process_updates(db, log_device, log_name, lmv = None, lov = None):
3443 if not config.write_conf and not config.record:
3448 updates = db.root_node.getElementsByTagName('update')
3450 if not u.childNodes:
3451 log("ignoring empty update record (version " +
3452 str(u.getAttribute('version')) + ")")
3455 version = u.getAttribute('version')
3456 real_name = "%s-%s" % (log_name, version)
3457 lctl.clear_log(log_device, real_name)
3458 lctl.record(log_device, real_name)
3460 process_update_record(db, u, lmv, lov)
# NOTE(review): truncated extract; comments only.
# --write_conf pass: only MDS and OSD services get their config written.
3464 def doWriteconf(services):
3466 if s[1].get_class() == 'mdsdev' or s[1].get_class() == 'osd':
3467 n = newService(s[1])
3469 if not config.nosetup:
# Setup pass: instantiate every service, sort by (corrected) level, then
# prepare each; most loop headers and the sort calls are missing here.
3472 def doSetup(services):
3477 n = newService(s[1])
3479 slist.append((n.level, n))
3482 nl = n[1].correct_level(n[0])
3483 nlist.append((nl, n[1]))
3487 lctl.clear_log(config.record_device, config.record_log)
3488 lctl.record(config.record_device, config.record_log)
3490 # ugly hack, only need to run lctl commands for --dump
3491 if config.lctl_dump or config.record:
3492 sys_set_timeout(timeout)
3493 sys_set_lustre_upcall(lustre_upcall)
# after preparing a mountpoint under --record, replay its update logs
3497 if config.record and n[1].module_name == 'MTPT':
3503 process_updates(n[1].db, config.record_device, config.record_log,
# NOTE(review): truncated extract; comments only.
# Register every service's kernel modules, then load them.
3506 def doLoadModules(services):
3510 # adding all needed modules from all services
3512 n = newService(s[1])
3513 n.add_module(mod_manager)
3515 # loading all registered modules
3516 mod_manager.load_modules()
# Same registration, then unload -- but only modules safe to clean.
3518 def doUnloadModules(services):
3522 # adding all needed modules from all services
3524 n = newService(s[1])
3525 if n.safe_to_clean_modules():
3526 n.add_module(mod_manager)
3528 # unloading all registered modules
3529 mod_manager.cleanup_modules()
# Cleanup pass: like doSetup but in reverse level order (the sort/reverse
# lines are missing from this extract); honors safe_to_clean().
3531 def doCleanup(services):
3537 n = newService(s[1])
3539 slist.append((n.level, n))
3542 nl = n[1].correct_level(n[0])
3543 nlist.append((nl, n[1]))
3548 lctl.clear_log(config.record_device, config.record_log)
3549 lctl.record(config.record_device, config.record_log)
3552 if n[1].safe_to_clean():
# NOTE(review): truncated extract; comments only.
# Top-level per-host driver: look up this host's node entry, read its
# tunables, then run the profile passes for write_conf / recover /
# cleanup / normal setup.
3560 def doHost(lustreDB, hosts):
3561 global is_router, local_node_name, lustre_upcall, timeout
# try each candidate hostname until a node entry matches
3564 node_db = lustreDB.lookup_name(h, 'node')
3568 panic('No host entry found.')
3570 local_node_name = node_db.get_val('name', 0)
3571 is_router = node_db.get_val_int('router', 0)
3572 lustre_upcall = node_db.get_val('lustreUpcall', '')
3573 portals_upcall = node_db.get_val('portalsUpcall', '')
3574 timeout = node_db.get_val_int('timeout', 0)
3575 ptldebug = node_db.get_val('ptldebug', '')
3576 subsystem = node_db.get_val('subsystem', '')
3578 find_local_clusters(node_db)
# presumably guarded by "if not is_router:" on a missing line
3580 find_local_routes(lustreDB)
3582 # Two step process: (1) load modules, (2) setup lustre
3583 # if not cleaning, load modules first.
3584 prof_list = node_db.get_refs('profile')
3586 if config.write_conf:
3587 for_each_profile(node_db, prof_list, doLoadModules)
3589 for_each_profile(node_db, prof_list, doWriteconf)
3590 for_each_profile(node_db, prof_list, doUnloadModules)
3593 elif config.recover:
3594 if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
3595 raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
3596 "--client_uuid <UUID> --conn_uuid <UUID>")
3597 doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
3599 elif config.cleanup:
3601 # the command line can override this value
3603 # ugly hack, only need to run lctl commands for --dump
3604 if config.lctl_dump or config.record:
3605 for_each_profile(node_db, prof_list, doCleanup)
# else-branch of the dump check: apply tunables, then clean for real
3608 sys_set_timeout(timeout)
3609 sys_set_ptldebug(ptldebug)
3610 sys_set_subsystem(subsystem)
3611 sys_set_lustre_upcall(lustre_upcall)
3612 sys_set_portals_upcall(portals_upcall)
3614 for_each_profile(node_db, prof_list, doCleanup)
3615 for_each_profile(node_db, prof_list, doUnloadModules)
# normal setup path
3619 # ugly hack, only need to run lctl commands for --dump
3620 if config.lctl_dump or config.record:
3621 for_each_profile(node_db, prof_list, doSetup)
3625 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
3626 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
3628 for_each_profile(node_db, prof_list, doLoadModules)
3630 sys_set_debug_path()
3631 sys_set_ptldebug(ptldebug)
3632 sys_set_subsystem(subsystem)
3633 script = config.gdb_script
3634 run(lctl.lctl, ' modules >', script)
# presumably guarded by "if config.gdb:" on a missing line
3636 log ("The GDB module script is in", script)
3637 # pause, so user has time to break and
3640 sys_set_timeout(timeout)
3641 sys_set_lustre_upcall(lustre_upcall)
3642 sys_set_portals_upcall(portals_upcall)
3644 for_each_profile(node_db, prof_list, doSetup)
# NOTE(review): truncated extract; comments only.
# Reconnect a client to the active failover target: resolve the active
# target uuid, pick a locally reachable server, drop the old connection
# (best effort) and issue lctl recover.
3647 def doRecovery(lustreDB, lctl, tgt_uuid, client_uuid, nid_uuid):
3648 tgt = lustreDB.lookup(tgt_uuid)
3650 raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
3651 new_uuid = get_active_target(tgt)
3653 raise Lustre.LconfError("doRecovery: no active target found for: " +
3655 net = choose_local_server(get_ost_net(lustreDB, new_uuid))
3657 raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
3659 log("Reconnecting", tgt_uuid, " to ", net.nid_uuid);
3661 oldnet = get_server_by_nid_uuid(lustreDB, nid_uuid)
# disconnect failures are logged but deliberately not fatal
3664 lctl.disconnect(oldnet)
3665 except CommandError, e:
3666 log("recover: disconnect", nid_uuid, "failed: ")
3671 except CommandError, e:
3672 log("recover: connect failed")
3675 lctl.recover(client_uuid, net.nid_uuid)
# NOTE(review): truncated extract; comments only.
# Derive config.lustre / config.portals module search paths from the
# lconf binary location (development mode) or from the command line.
3678 def setupModulePath(cmd, portals_dir = PORTALS_DIR):
3679 base = os.path.dirname(cmd)
3680 if development_mode():
3681 if not config.lustre:
3682 debug('using objdir module paths')
3683 config.lustre = (os.path.join(base, ".."))
3684 # normalize the portals dir, using command line arg if set
3686 portals_dir = config.portals
3687 dir = os.path.join(config.lustre, portals_dir)
3688 config.portals = dir
3689 debug('config.portals', config.portals)
3690 elif config.lustre and config.portals:
3692 # if --lustre and --portals, normalize portals
3693 # can ignore PORTALS_DIR here, since it is probably useless here
3694 config.portals = os.path.join(config.lustre, config.portals)
3695 debug('config.portals B', config.portals)
# NOTE(review): truncated extract; comments only.  Write `val` to a
# /proc/sys entry; the write/close lines and the (presumable) --noexec
# guard are missing from this extract.
3697 def sysctl(path, val):
3698 debug("+ sysctl", path, val)
3702 fp = open(os.path.join('/proc/sys', path), 'w')
def sys_set_debug_path():
    """Point the portals debug dump at the configured --debug_path."""
    path = config.debug_path
    sysctl('portals/debug_path', path)
# NOTE(review): truncated extract; comments only.  Both setters pick the
# most specific command-line override; the "elif config.upcall:" lines
# between the branches are missing.
3711 def sys_set_lustre_upcall(upcall):
3712 # the command overrides the value in the node config
3713 if config.lustre_upcall:
3714 upcall = config.lustre_upcall
3716 upcall = config.upcall
3718 lctl.set_lustre_upcall(upcall)
3720 def sys_set_portals_upcall(upcall):
3721 # the command overrides the value in the node config
3722 if config.portals_upcall:
3723 upcall = config.portals_upcall
3725 upcall = config.upcall
3727 sysctl('portals/upcall', upcall)
def sys_set_timeout(timeout):
    """Apply the lustre recovery timeout; --timeout overrides the node
    config, and non-positive/None values are ignored."""
    override = config.timeout
    if override and override > 0:
        timeout = override
    if timeout is not None and timeout > 0:
        lctl.set_timeout(timeout)
# NOTE(review): truncated extract; comments only.
# On 2.6 kernels, raise vm.min_free_kbytes on small-memory boxes so
# socknal skb allocation does not starve; the meminfo parsing loop
# headers and the "memtotal = a[1]" binding are missing from this view.
3736 def sys_tweak_socknal ():
3737 # reserve at least 8MB, or we run out of RAM in skb_alloc under read
3738 if sys_get_branch() == '2.6':
3739 fp = open('/proc/meminfo')
3740 lines = fp.readlines()
3745 if a[0] == 'MemTotal:':
3747 debug("memtotal" + memtotal)
3748 if int(memtotal) < 262144:
3749 minfree = int(memtotal) / 16
3752 debug("+ minfree ", minfree)
3753 sysctl("vm/min_free_kbytes", minfree)
3754 if config.single_socket:
3755 sysctl("socknal/typed", 0)
# Poke the Quadrics Elan interrupt tunables where writable.
3757 def sys_optimize_elan ():
3758 procfiles = ["/proc/elan/config/eventint_punt_loops",
3759 "/proc/qsnet/elan3/config/eventint_punt_loops",
3760 "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]
# "for p in procfiles:" (~3761) is missing
3762 if os.access(p, os.W_OK):
3763 run ("echo 1 > " + p)
# Evaluate the symbolic --ptldebug expression against ptldebug_names and
# write the resulting mask; the try/panic framing lines are missing.
3765 def sys_set_ptldebug(ptldebug):
3767 ptldebug = config.ptldebug
3770 val = eval(ptldebug, ptldebug_names)
3771 val = "0x%x" % (val & 0xffffffffL)
3772 sysctl('portals/debug', val)
3773 except NameError, e:
# Same for the --subsystem debug mask.
3776 def sys_set_subsystem(subsystem):
3777 if config.subsystem:
3778 subsystem = config.subsystem
3781 val = eval(subsystem, subsystem_names)
3782 val = "0x%x" % (val & 0xffffffffL)
3783 sysctl('portals/subsystem_debug', val)
3784 except NameError, e:
# Raise a net.core *mem_max value to at least `max`; the read/compare
# lines (~3789-3795) are missing from this extract.
3787 def sys_set_netmem_max(path, max):
3788 debug("setting", path, "to at least", max)
3796 fp = open(path, 'w')
3797 fp.write('%d\n' %(max))
def sys_make_devices():
    """Create the portals and obd character device nodes if absent."""
    wanted = (('/dev/portals', 'mknod /dev/portals c 10 240'),
              ('/dev/obd', 'mknod /dev/obd c 10 241'))
    for node, mknod_cmd in wanted:
        if not os.access(node, os.R_OK):
            run(mknod_cmd)
# NOTE(review): truncated extract; comments only.  Several "return" lines
# and loop bodies of these small helpers are missing.
3806 # Add dir to the global PATH, if not already there.
3807 def add_to_path(new_dir):
3808 syspath = string.split(os.environ['PATH'], ':')
3809 if new_dir in syspath:
3811 os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
# Default debug dump location; prefers the /r chroot when present.
3813 def default_debug_path():
3814 path = '/tmp/lustre-log'
3815 if os.path.isdir('/r'):
# Default gdb script location; same /r preference.
3820 def default_gdb_script():
3821 script = '/tmp/ogdb'
3822 if os.path.isdir('/r'):
3823 return '/r' + script
3827 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
3828 # ensure basic elements are in the system path
3829 def sanitise_path():
3830 for dir in DEFAULT_PATH:
3833 # global hack for the --select handling
# Parse --select "service=node,..." args into the tgt_select mapping.
3835 def init_select(args):
3836 # args = [service=nodeA,service2=nodeB service3=nodeC]
3839 list = string.split(arg, ',')
3841 srv, node = string.split(entry, '=')
3842 tgt_select[srv] = node
# Return the selected node for a service, if any was given.
3844 def get_select(srv):
3845 if tgt_select.has_key(srv):
3846 return tgt_select[srv]
# NOTE(review): truncated extract; comments only.  Command-line option
# table for Lustre.Options; the list literal's opening line and several
# entries/closing tokens are missing from this view.
3850 FLAG = Lustre.Options.FLAG
3851 PARAM = Lustre.Options.PARAM
3852 INTPARAM = Lustre.Options.INTPARAM
3853 PARAMLIST = Lustre.Options.PARAMLIST
3855 ('verbose,v', "Print system commands as they are run"),
3856 ('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM),
3857 ('config', "Cluster config name used for LDAP query", PARAM),
3858 ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
3859 ('node', "Load config for <nodename>", PARAM),
3860 ('sec',"security flavor <null|krb5i|krb5p> between this client with mds", PARAM),
3861 ('mds_sec',"security flavor <null|krb5i|krb5p> between this client with mds", PARAM),
3862 ('oss_sec',"security flavor <null|krb5i|krb5p> between this client with ost", PARAM),
3863 ('mds_mds_sec',"security flavor <null|krb5i|krb5p> between this mds with other mds", PARAM),
3864 ('mds_oss_sec',"security flavor <null|krb5i|krb5p> between this mds with ost", PARAM),
3865 ('mds_deny_sec', "security flavor <null|krb5i|krb5p> denied by this mds", PARAM),
3866 ('ost_deny_sec', "security flavor <null|krb5i|krb5p> denied by this ost", PARAM),
3867 ('cleanup,d', "Cleans up config. (Shutdown)"),
3868 ('force,f', "Forced unmounting and/or obd detach during cleanup",
3870 ('single_socket', "socknal option: only use one socket instead of bundle",
3872 ('failover',"""Used to shut down without saving state.
3873 This will allow this node to "give up" a service to a
3874 another node for failover purposes. This will not
3875 be a clean shutdown.""",
3877 ('gdb', """Prints message after creating gdb module script
3878 and sleeps for 5 seconds."""),
3879 ('noexec,n', """Prints the commands and steps that will be run for a
3880 config without executing them. This can used to check if a
3881 config file is doing what it should be doing"""),
3882 ('nomod', "Skip load/unload module step."),
3883 ('nosetup', "Skip device setup/cleanup step."),
3884 ('reformat', "Reformat all devices (without question)"),
3885 ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
3886 ('mountfsoptions', "Additional options for mount fs command line", PARAM),
3887 ('clientoptions', "Additional options for Lustre", PARAM),
3888 ('dump', "Dump the kernel debug log to file before portals is unloaded",
3890 ('write_conf', "Save all the client config information on mds."),
3891 ('record', "Write config information on mds."),
3892 ('record_log', "Name of config record log.", PARAM),
3893 ('record_device', "MDS device name that will record the config commands",
3895 ('root_squash', "MDS squash root to appointed uid",
3897 ('no_root_squash', "Don't squash root for appointed nid",
3899 ('minlevel', "Minimum level of services to configure/cleanup",
3901 ('maxlevel', """Maximum level of services to configure/cleanup
3902 Levels are aproximatly like:
3907 70 - mountpoint, echo_client, osc, mdc, lov""",
3909 ('lustre', """Base directory of lustre sources. This parameter will
3910 cause lconf to load modules from a source tree.""", PARAM),
3911 ('portals', """Portals source directory.  If this is a relative path,
3912 then it is assumed to be relative to lustre. """, PARAM),
3913 ('timeout', "Set recovery timeout", INTPARAM),
3914 ('upcall', "Set both portals and lustre upcall script", PARAM),
3915 ('lustre_upcall', "Set lustre upcall script", PARAM),
3916 ('portals_upcall', "Set portals upcall script", PARAM),
3917 ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
3918 ('ptldebug', "Set the portals debug level",  PARAM),
3919 ('subsystem', "Set the portals debug subsystem",  PARAM),
3920 ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
3921 ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
3922 # Client recovery options
3923 ('recover', "Recover a device"),
3924 ('group', "The group of devices to configure or cleanup", PARAM),
3925 ('tgt_uuid', "The failed target (required for recovery)", PARAM),
3926 ('client_uuid', "The failed client (required for recovery)", PARAM),
3927 ('conn_uuid', "The failed connection (required for recovery)", PARAM),
3929 ('inactive', """The name of an inactive service, to be ignored during
3930 mounting (currently OST-only). Can be repeated.""",
# NOTE(review): truncated extract; comments only.  Body of main() -- its
# "def main():" line (~3934) is missing, as are many control-flow lines.
3935 global lctl, config, toplustreDB, CONFIG_FILE, mod_manager
3937 # in the upcall this is set to SIG_IGN
3938 signal.signal(signal.SIGCHLD, signal.SIG_DFL)
3940 cl = Lustre.Options("lconf", "config.xml", lconf_options)
3942 config, args = cl.parse(sys.argv[1:])
3943 except Lustre.OptionError, e:
3947 setupModulePath(sys.argv[0])
3949 host = socket.gethostname()
3951 # the PRNG is normally seeded with time(), which is not so good for starting
3952 # time-synchronized clusters
3953 input = open('/dev/urandom', 'r')
3955 print 'Unable to open /dev/urandom!'
3957 seed = input.read(32)
# the input.close() and random.seed(seed) calls (~3958-3961) are missing
3963 init_select(config.select)
3966 # allow config to be fetched via HTTP, but only with python2
3967 if sys.version[0] != '1' and args[0].startswith('http://'):
3970 config_file = urllib2.urlopen(args[0])
3971 except (urllib2.URLError, socket.error), err:
3972 if hasattr(err, 'args'):
3974 print "Could not access '%s': %s" %(args[0], err)
3976 elif not os.access(args[0], os.R_OK):
3977 print 'File not found or readable:', args[0]
3981 config_file = open(args[0], 'r')
3983 dom = xml.dom.minidom.parse(config_file)
3985 panic("%s does not appear to be a config file." % (args[0]))
3986 sys.exit(1) # make sure to die here, even in debug mode.
3988 CONFIG_FILE = args[0]
3989 lustreDB = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
# derive a default --config name from the file name
3990 if not config.config:
3991 config.config = os.path.basename(args[0])# use full path?
3992 if config.config[-4:] == '.xml':
3993 config.config = config.config[:-4]
3994 elif config.ldapurl:
3995 if not config.config:
3996 panic("--ldapurl requires --config name")
3997 dn = "config=%s,fs=lustre" % (config.config)
3998 lustreDB = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
3999 elif config.ptldebug or config.subsystem:
4000 sys_set_ptldebug(None)
4001 sys_set_subsystem(None)
# presumably sys.exit(0) here, then the no-config error branch
4004 print 'Missing config file or ldap URL.'
4005 print 'see lconf --help for command summary'
4008 toplustreDB = lustreDB
4010 ver = lustreDB.get_version()
4012 panic("No version found in config data, please recreate.")
4013 if ver != Lustre.CONFIG_VERSION:
4014 panic("Config version", ver, "does not match lconf version",
4015 Lustre.CONFIG_VERSION)
# candidate node names: --node override, else hostname + localhost
4019 node_list.append(config.node)
4022 node_list.append(host)
4023 node_list.append('localhost')
4025 debug("configuring for host: ", node_list)
# presumably only when more than one host candidate -- TODO confirm
4028 config.debug_path = config.debug_path + '-' + host
4029 config.gdb_script = config.gdb_script + '-' + host
4031 lctl = LCTLInterface('lctl')
4033 if config.lctl_dump:
4034 lctl.use_save_file(config.lctl_dump)
# presumably "if config.record:" guards this check
4037 if not (config.record_device and config.record_log):
4038 panic("When recording, both --record_log and --record_device must be specified.")
4040 # init module manager
4041 mod_manager = kmod_manager(config.lustre, config.portals)
4043 doHost(lustreDB, node_list)
# NOTE(review): truncated extract; comments only.  Script entry point --
# the main() call and the exception-handling bodies are missing.
4047 if __name__ == "__main__":
4050 except Lustre.LconfError, e:
4052 # traceback.print_exc(file=sys.stdout)
4054 except CommandError, e:
# exit non-zero if any cleanup step failed earlier
4058 if first_cleanup_error:
4059 sys.exit(first_cleanup_error)