3 # Copyright (C) 2002-2003 Cluster File Systems, Inc.
4 # Authors: Robert Read <rread@clusterfs.com>
5 # Mike Shaver <shaver@clusterfs.com>
6 # This file is part of Lustre, http://www.lustre.org.
8 # Lustre is free software; you can redistribute it and/or
9 # modify it under the terms of version 2 of the GNU General Public
10 # License as published by the Free Software Foundation.
12 # Lustre is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Lustre; if not, write to the Free Software
19 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 # lconf - lustre configuration tool
23 # lconf is the main driver script for starting and stopping
24 # lustre filesystem services.
26 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import sys, getopt, types
29 import string, os, stat, popen2, socket, time, random, fcntl, select
30 import re, exceptions, signal, traceback
31 import xml.dom.minidom
33 if sys.version[0] == '1':
34 from FCNTL import F_GETFL, F_SETFL
36 from fcntl import F_GETFL, F_SETFL
38 PYMOD_DIR = "/usr/lib/lustre/python"
40 def development_mode():
41 base = os.path.dirname(sys.argv[0])
42 if os.access(base+"/Makefile", os.R_OK):
46 if development_mode():
47 sys.path.append('../utils')
49 sys.path.append(PYMOD_DIR)
55 DEFAULT_TCPBUF = 8388608
58 # Maximum number of devices to search for.
59 # (the /dev/loop* nodes need to be created beforehand)
60 MAX_LOOP_DEVICES = 256
61 PORTALS_DIR = '../portals'
63 # Needed to call lconf --record
66 # Please keep these in sync with the values in portals/kp30.h
78 "warning" : (1 << 10),
82 "portals" : (1 << 14),
84 "dlmtrace" : (1 << 16),
88 "rpctrace" : (1 << 20),
89 "vfstrace" : (1 << 21),
93 "console" : (1 << 25),
99 "undefined" : (1 << 0),
104 "obdclass" : (1 << 5),
109 "portals" : (1 << 10),
111 "pinger" : (1 << 12),
112 "filter" : (1 << 13),
117 "ptlrouter" : (1 << 18),
121 "confobd" : (1 << 22),
first_cleanup_error = 0

def cleanup_error(rc):
    """Record *rc* as the overall cleanup status.

    Only the first non-zero return code is kept; later errors are ignored
    so the exit status reflects the earliest failure during cleanup.
    """
    global first_cleanup_error
    if first_cleanup_error == 0:
        first_cleanup_error = rc
134 # ============================================================
135 # debugging and error funcs
def fixme(msg = "this feature"):
    """Abort with an LconfError noting that *msg* is not implemented.

    Uses the exception-instance form of raise (consistent with the
    panic/LconfError call form used elsewhere in this file) instead of
    the legacy 'raise Class, arg' comma syntax.
    """
    raise Lustre.LconfError(msg + ' not implemented yet.')
141 msg = string.join(map(str,args))
142 if not config.noexec:
143 raise Lustre.LconfError(msg)
148 msg = string.join(map(str,args))
153 print string.strip(s)
157 msg = string.join(map(str,args))
160 # ack, python's builtin int() does not support '0x123' syntax.
161 # eval can do it, although what a hack!
165 return eval(s, {}, {})
168 except SyntaxError, e:
169 raise ValueError("not a number")
171 raise ValueError("not a number")
173 # ============================================================
174 # locally defined exceptions
175 class CommandError (exceptions.Exception):
176 def __init__(self, cmd_name, cmd_err, rc=None):
177 self.cmd_name = cmd_name
178 self.cmd_err = cmd_err
183 if type(self.cmd_err) == types.StringType:
185 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
187 print "! %s: %s" % (self.cmd_name, self.cmd_err)
188 elif type(self.cmd_err) == types.ListType:
190 print "! %s (error %d):" % (self.cmd_name, self.rc)
192 print "! %s:" % (self.cmd_name)
193 for s in self.cmd_err:
194 print "> %s" %(string.strip(s))
199 # ============================================================
200 # handle daemons, like the acceptor
202 """ Manage starting and stopping a daemon. Assumes daemon manages
203 its own pid file. """
205 def __init__(self, cmd):
211 log(self.command, "already running.")
213 self.path = find_prog(self.command)
215 panic(self.command, "not found.")
216 ret, out = runcmd(self.path +' '+ self.command_line())
218 raise CommandError(self.path, out, ret)
222 pid = self.read_pidfile()
225 log ("killing process", pid)
228 log("was unable to find pid of " + self.command)
230 log("unable to kill", self.command, e)
231 time.sleep(5) # let daemon die
233 log("unable to kill", self.command)
236 pid = self.read_pidfile()
242 log("was unable to find pid of " + self.command)
249 def read_pidfile(self):
251 fp = open(self.pidfile(), 'r')
261 def clean_pidfile(self):
262 """ Remove a stale pidfile """
263 log("removing stale pidfile:", self.pidfile())
265 os.unlink(self.pidfile())
267 log(self.pidfile(), e)
269 class AcceptorHandler(DaemonHandler):
270 def __init__(self, port, net_type):
271 DaemonHandler.__init__(self, "acceptor")
276 return "/var/run/%s-%d.pid" % (self.command, self.port)
278 def command_line(self):
279 return string.join(map(str,(self.flags, self.port)))
283 # start the acceptors
285 if config.lctl_dump or config.record:
287 for port in acceptors.keys():
288 daemon = acceptors[port]
289 if not daemon.running():
292 def run_one_acceptor(port):
293 if config.lctl_dump or config.record:
295 if acceptors.has_key(port):
296 daemon = acceptors[port]
297 if not daemon.running():
300 panic("run_one_acceptor: No acceptor defined for port:", port)
302 def stop_acceptor(port):
303 if acceptors.has_key(port):
304 daemon = acceptors[port]
309 # ============================================================
310 # handle lctl interface
313 Manage communication with lctl
316 def __init__(self, cmd):
318 Initialize close by finding the lctl binary.
320 self.lctl = find_prog(cmd)
322 self.record_device = ''
325 debug('! lctl not found')
328 raise CommandError('lctl', "unable to find lctl binary.")
    def use_save_file(self, file):
        """Switch lctl into dump mode: with save_file set, subsequent command
        batches are prefixed with a 'dump <file>' directive instead of being
        executed directly against the kernel."""
        self.save_file = file
333 def record(self, dev_name, logname):
334 log("Recording log", logname, "on", dev_name)
335 self.record_device = dev_name
336 self.record_log = logname
338 def end_record(self):
339 log("End recording log", self.record_log, "on", self.record_device)
340 self.record_device = None
341 self.record_log = None
343 def set_nonblock(self, fd):
344 fl = fcntl.fcntl(fd, F_GETFL)
345 fcntl.fcntl(fd, F_SETFL, fl | os.O_NDELAY)
350 the cmds are written to stdin of lctl
351 lctl doesn't return errors when run in script mode, so
353 should modify command line to accept multiple commands, or
354 create complex command line options
358 cmds = '\n dump ' + self.save_file + '\n' + cmds
359 elif self.record_device:
363 %s""" % (self.record_device, self.record_log, cmds)
365 debug("+", cmd_line, cmds)
366 if config.noexec: return (0, [])
368 child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
369 child.tochild.write(cmds + "\n")
370 child.tochild.close()
371 # print "LCTL:", cmds
373 # From "Python Cookbook" from O'Reilly
374 outfile = child.fromchild
375 outfd = outfile.fileno()
376 self.set_nonblock(outfd)
377 errfile = child.childerr
378 errfd = errfile.fileno()
379 self.set_nonblock(errfd)
381 outdata = errdata = ''
384 ready = select.select([outfd,errfd],[],[]) # Wait for input
385 if outfd in ready[0]:
386 outchunk = outfile.read()
387 if outchunk == '': outeof = 1
388 outdata = outdata + outchunk
389 if errfd in ready[0]:
390 errchunk = errfile.read()
391 if errchunk == '': erreof = 1
392 errdata = errdata + errchunk
393 if outeof and erreof: break
394 # end of "borrowed" code
397 if os.WIFEXITED(ret):
398 rc = os.WEXITSTATUS(ret)
401 if rc or len(errdata):
402 raise CommandError(self.lctl, errdata, rc)
405 def runcmd(self, *args):
407 run lctl using the command line
409 cmd = string.join(map(str,args))
410 debug("+", self.lctl, cmd)
411 rc, out = run(self.lctl, cmd)
413 raise CommandError(self.lctl, out, rc)
416 def clear_log(self, dev, log):
417 """ clear an existing log """
422 quit """ % (dev, log)
425 def root_squash(self, name, uid, nid):
429 quit""" % (name, uid, nid)
432 def network(self, net, nid):
437 quit """ % (net, nid)
441 def add_interface(self, net, ip, netmask = ""):
442 """ add an interface """
446 quit """ % (net, ip, netmask)
449 # delete an interface
450 def del_interface(self, net, ip):
451 """ delete an interface """
458 # create a new connection
459 def add_uuid(self, net_type, uuid, nid):
460 cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type)
463 def add_peer(self, net_type, nid, hostaddr, port):
464 if net_type in ('tcp','openib','ra') and not config.lctl_dump:
469 nid, hostaddr, port )
471 elif net_type in ('iib',) and not config.lctl_dump:
478 elif net_type in ('vib',) and not config.lctl_dump:
486 def connect(self, srv):
487 self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
488 if srv.net_type in ('tcp','openib','iib','vib','ra') and not config.lctl_dump:
490 hostaddr = string.split(srv.hostaddr[0], '/')[0]
491 self.add_peer(srv.net_type, srv.nid, hostaddr, srv.port)
494 def recover(self, dev_name, new_conn):
497 recover %s""" %(dev_name, new_conn)
500 # add a route to a range
501 def add_route(self, net, gw, lo, hi):
509 except CommandError, e:
513 def del_route(self, net, gw, lo, hi):
518 quit """ % (net, gw, lo, hi)
521 # add a route to a host
522 def add_route_host(self, net, uuid, gw, tgt):
523 self.add_uuid(net, uuid, tgt)
531 except CommandError, e:
535 # add a route to a range
536 def del_route_host(self, net, uuid, gw, tgt):
542 quit """ % (net, gw, tgt)
546 def del_peer(self, net_type, nid, hostaddr):
547 if net_type in ('tcp',) and not config.lctl_dump:
551 del_peer %s %s single_share
555 elif net_type in ('openib','iib','vib','ra') and not config.lctl_dump:
559 del_peer %s single_share
564 # disconnect one connection
565 def disconnect(self, srv):
566 self.del_uuid(srv.nid_uuid)
567 if srv.net_type in ('tcp','openib','iib','vib','ra') and not config.lctl_dump:
569 hostaddr = string.split(srv.hostaddr[0], '/')[0]
570 self.del_peer(srv.net_type, srv.nid, hostaddr)
572 def del_uuid(self, uuid):
580 def disconnectAll(self, net):
588 def attach(self, type, name, uuid):
591 quit""" % (type, name, uuid)
594 def detach(self, name):
601 def set_security(self, name, key, value):
605 quit""" % (name, key, value)
608 def setup(self, name, setup = ""):
612 quit""" % (name, setup)
615 def add_conn(self, name, conn_uuid):
619 quit""" % (name, conn_uuid)
622 def start(self, name, conf_name):
626 quit""" % (name, conf_name)
629 # create a new device with lctl
630 def newdev(self, type, name, uuid, setup = ""):
632 self.attach(type, name, uuid);
634 self.setup(name, setup)
635 except CommandError, e:
636 self.cleanup(name, uuid, 0)
640 def cleanup(self, name, uuid, force, failover = 0):
641 if failover: force = 1
647 quit""" % (name, ('', 'force')[force],
648 ('', 'failover')[failover])
652 def lov_setup(self, name, uuid, desc_uuid, stripe_cnt,
653 stripe_sz, stripe_off, pattern):
656 lov_setup %s %d %d %d %s
657 quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off, pattern)
660 # add an OSC to a LOV
661 def lov_add_osc(self, name, ost_uuid, index, gen):
663 lov_modify_tgts add %s %s %s %s
664 quit""" % (name, ost_uuid, index, gen)
667 # delete an OSC from a LOV
668 def lov_del_osc(self, name, ost_uuid, index, gen):
670 lov_modify_tgts del %s %s %s %s
671 quit""" % (name, ost_uuid, index, gen)
675 def deactivate(self, name):
683 def lmv_setup(self, name, uuid, desc_uuid):
687 quit""" % (name, uuid, desc_uuid)
690 # add an MDC to an LMV
691 def lmv_add_mdc(self, lmv_name, mdt_uuid):
693 lmv_modify_tgts add %s %s
694 quit""" % (lmv_name, mdt_uuid)
698 def dump(self, dump_file):
701 quit""" % (dump_file)
704 # get list of devices
705 def device_list(self):
706 devices = '/proc/fs/lustre/devices'
708 if os.access(devices, os.R_OK):
710 fp = open(devices, 'r')
718 def lustre_version(self):
719 rc, out = self.runcmd('version')
723 def mount_option(self, profile, osc, mdc, gkc):
725 mount_option %s %s %s %s
726 quit""" % (profile, osc, mdc, gkc)
729 # delete mount options
730 def del_mount_option(self, profile):
736 def set_timeout(self, timeout):
742 def set_lustre_upcall(self, upcall):
747 # ============================================================
748 # Various system-level functions
749 # (ideally moved to their own module)
751 # Run a command and return the output and status.
752 # stderr is sent to /dev/null, could use popen3 to
753 # save it if necessary
756 if config.noexec: return (0, [])
757 f = os.popen(cmd + ' 2>&1')
767 cmd = string.join(map(str,args))
770 # Run a command in the background.
771 def run_daemon(*args):
772 cmd = string.join(map(str,args))
774 if config.noexec: return 0
775 f = os.popen(cmd + ' 2>&1')
783 # Determine full path to use for an external command
784 # searches dirname(argv[0]) first, then PATH
786 syspath = string.split(os.environ['PATH'], ':')
787 cmdpath = os.path.dirname(sys.argv[0])
788 syspath.insert(0, cmdpath);
790 syspath.insert(0, os.path.join(config.portals, 'utils/'))
792 prog = os.path.join(d,cmd)
793 if os.access(prog, os.X_OK):
797 # Recursively look for file starting at base dir
798 def do_find_file(base, mod):
799 fullname = os.path.join(base, mod)
800 if os.access(fullname, os.R_OK):
802 for d in os.listdir(base):
803 dir = os.path.join(base,d)
804 if os.path.isdir(dir):
805 module = do_find_file(dir, mod)
809 # is the path a block device?
816 return stat.S_ISBLK(s[stat.ST_MODE])
818 # find the journal device from mkfs options
824 while i < len(x) - 1:
825 if x[i] == '-J' and x[i+1].startswith('device='):
831 # build fs according to type
833 def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
839 panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
841 # devsize is in 1k, and fs block count is in 4k
842 block_cnt = devsize/4
844 if fstype in ('ext3', 'extN', 'ldiskfs'):
845 # ext3 journal size is in megabytes
846 # but don't set jsize if mkfsoptions indicates a separate journal device
847 if jsize == 0 and jdev(mkfsoptions) == '':
849 if not is_block(dev):
850 ret, out = runcmd("ls -l %s" %dev)
851 devsize = int(string.split(out[0])[4]) / 1024
853 # sfdisk works for symlink, hardlink, and realdev
854 ret, out = runcmd("sfdisk -s %s" %dev)
856 devsize = int(out[0])
858 # sfdisk -s will fail for too large block device,
859 # then, read the size of partition from /proc/partitions
861 # get the realpath of the device
862 # it may be the real device, such as /dev/hda7
863 # or the hardlink created via mknod for a device
864 if 'realpath' in dir(os.path):
865 real_dev = os.path.realpath(dev)
869 while os.path.islink(real_dev) and (link_count < 20):
870 link_count = link_count + 1
871 dev_link = os.readlink(real_dev)
872 if os.path.isabs(dev_link):
875 real_dev = os.path.join(os.path.dirname(real_dev), dev_link)
877 panic("Entountered too many symbolic links resolving block device:", dev)
879 # get the major and minor number of the realpath via ls
880 # it seems python(os.stat) does not return
881 # the st_rdev member of the stat structure
882 ret, out = runcmd("ls -l %s" %real_dev)
883 major = string.split(string.split(out[0])[4], ",")[0]
884 minor = string.split(out[0])[5]
886 # get the devsize from /proc/partitions with the major and minor number
887 ret, out = runcmd("cat /proc/partitions")
890 if string.split(line)[0] == major and string.split(line)[1] == minor:
891 devsize = int(string.split(line)[2])
894 if devsize > 1024 * 1024:
895 jsize = ((devsize / 102400) * 4)
898 if jsize: jopt = "-J size=%d" %(jsize,)
899 if isize: iopt = "-I %d" %(isize,)
900 mkfs = 'mkfs.ext2 -j -b 4096 '
901 if not isblock or config.force:
903 if jdev(mkfsoptions) != '':
904 jmkfs = 'mkfs.ext2 -b 4096 -O journal_dev '
906 jmkfs = jmkfs + '-F '
907 jmkfs = jmkfs + jdev(mkfsoptions)
908 (ret, out) = run (jmkfs)
910 panic("Unable format journal device:", jdev(mkfsoptions), string.join(out))
911 elif fstype == 'reiserfs':
912 # reiserfs journal size is in blocks
913 if jsize: jopt = "--journal_size %d" %(jsize,)
914 mkfs = 'mkreiserfs -ff'
916 panic('unsupported fs type: ', fstype)
918 if config.mkfsoptions != None:
919 mkfs = mkfs + ' ' + config.mkfsoptions
920 if mkfsoptions != None:
921 mkfs = mkfs + ' ' + mkfsoptions
922 (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt)
924 panic("Unable to build fs:", dev, string.join(out))
925 # enable hash tree indexing on fsswe
926 if fstype in ('ext3', 'extN', 'ldiskfs'):
927 htree = 'echo "feature FEATURE_C5" | debugfs -w'
928 (ret, out) = run (htree, dev)
930 panic("Unable to enable htree:", dev)
932 # some systems use /dev/loopN, some /dev/loop/N
936 if not os.access(loop + str(0), os.R_OK):
938 if not os.access(loop + str(0), os.R_OK):
939 panic ("can't access loop devices")
942 # find loop device assigned to the file
943 def find_assigned_loop(file):
945 for n in xrange(0, MAX_LOOP_DEVICES):
947 if os.access(dev, os.R_OK):
948 (stat, out) = run('losetup', dev)
949 if out and stat == 0:
950 m = re.search(r'\((.*)\)', out[0])
951 if m and file == m.group(1):
955 # find free loop device
956 def find_free_loop(file):
959 # find next free loop
960 for n in xrange(0, MAX_LOOP_DEVICES):
962 if os.access(dev, os.R_OK):
963 (stat, out) = run('losetup', dev)
968 # create file if necessary and assign the first free loop device
969 def init_loop(file, size, fstype, journal_size, inode_size,
970 mkfsoptions, reformat, autoformat, backfstype, backfile):
973 realfstype = backfstype
974 if is_block(backfile):
975 if reformat or (need_format(realfstype, backfile) and autoformat == 'yes'):
976 mkfs(realfile, size, realfstype, journal_size, inode_size, mkfsoptions, isblock=0)
982 dev = find_assigned_loop(realfile)
984 print 'WARNING: file', realfile, 'already mapped to', dev
987 if reformat or not os.access(realfile, os.R_OK | os.W_OK):
988 (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, realfile))
990 panic("Unable to create backing store:", realfile)
991 mkfs(realfile, size, realfstype, journal_size, inode_size,
992 mkfsoptions, isblock=0)
994 dev = find_free_loop(realfile)
996 print "attach " + realfile + " <-> " + dev
997 run('losetup', dev, realfile)
1000 print "out of loop devices"
1003 # undo loop assignment
1004 def clean_loop(dev, fstype, backfstype, backdev):
1005 if fstype == 'smfs':
1009 if not is_block(realfile):
1010 dev = find_assigned_loop(realfile)
1012 print "detach " + dev + " <-> " + realfile
1013 ret, out = run('losetup -d', dev)
1015 log('unable to clean loop device', dev, 'for file', realfile)
1018 # finalizes passed device
def clean_dev(dev, fstype, backfstype, backdev):
    """Finalize *dev*: smfs and non-block (file-backed) devices have a loop
    mapping that must be torn down; real block devices need nothing."""
    if fstype == 'smfs' or not is_block(dev):
        clean_loop(dev, fstype, backfstype, backdev)
1023 # determine if dev is formatted as a <fstype> filesystem
1024 def need_format(fstype, dev):
1025 # FIXME don't know how to implement this
1028 # initialize a block device if needed
1029 def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
1030 inode_size, mkfsoptions, backfstype, backdev):
1034 if fstype == 'smfs' or not is_block(dev):
1035 dev = init_loop(dev, size, fstype, journal_size, inode_size,
1036 mkfsoptions, reformat, autoformat, backfstype, backdev)
1037 elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
1038 mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
1041 # panic("device:", dev,
1042 # "not prepared, and autoformat is not set.\n",
1043 # "Rerun with --reformat option to format ALL filesystems")
1048 """lookup IP address for an interface"""
1049 rc, out = run("/sbin/ifconfig", iface)
1052 addr = string.split(out[1])[1]
1053 ip = string.split(addr, ':')[1]
def def_mount_options(fstype, target):
    """Return default mount options for *fstype* and *target* ('mds' or 'ost')."""
    if fstype == 'ext3' or fstype == 'ldiskfs':
        opts = "errors=remount-ro"
        # OSTs get kernel-branch-specific extras: async deletes on 2.4,
        # extents + mballoc on 2.6.
        if target == 'ost' and sys_get_branch() == '2.4':
            opts = opts + ",asyncdel"
        if target == 'ost' and sys_get_branch() == '2.6':
            opts = opts + ",extents,mballoc"
        return opts
1067 def sys_get_elan_position_file():
1068 procfiles = ["/proc/elan/device0/position",
1069 "/proc/qsnet/elan4/device0/position",
1070 "/proc/qsnet/elan3/device0/position"]
1072 if os.access(p, os.R_OK):
1076 def sys_get_local_nid(net_type, wildcard, cluster_id):
1077 """Return the local nid."""
1079 if sys_get_elan_position_file():
1080 local = sys_get_local_address('elan', '*', cluster_id)
1082 local = sys_get_local_address(net_type, wildcard, cluster_id)
1085 def sys_get_local_address(net_type, wildcard, cluster_id):
1086 """Return the local address for the network type."""
1088 if net_type in ('tcp','openib','iib','vib','ra'):
1090 iface, star = string.split(wildcard, ':')
1091 local = if2addr(iface)
1093 panic ("unable to determine ip for:", wildcard)
1095 host = socket.gethostname()
1096 local = socket.gethostbyname(host)
1097 elif net_type == 'elan':
1098 # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()'
1099 f = sys_get_elan_position_file()
1101 panic ("unable to determine local Elan ID")
1104 lines = fp.readlines()
1108 if a[0] == 'NodeId':
1112 nid = my_int(cluster_id) + my_int(elan_id)
1113 local = "%d" % (nid)
1114 except ValueError, e:
1118 elif net_type == 'lo':
1119 fixme("automatic local address for loopback")
1120 elif net_type == 'gm':
1121 fixme("automatic local address for GM")
1125 def sys_get_branch():
1126 """Returns kernel release"""
1128 fp = open('/proc/sys/kernel/osrelease')
1129 lines = fp.readlines()
1133 version = string.split(l)
1134 a = string.split(version[0], '.')
1135 return a[0] + '.' + a[1]
1140 # XXX: instead of device_list, ask for $name and see what we get
1141 def is_prepared(name):
1142 """Return true if a device exists for the name"""
1143 if config.lctl_dump:
1145 if (config.noexec or config.record) and config.cleanup:
1148 # expect this format:
1149 # 1 UP ldlm ldlm ldlm_UUID 2
1150 out = lctl.device_list()
1152 if name == string.split(s)[3]:
1154 except CommandError, e:
1158 def net_is_prepared():
1159 """If the any device exists, then assume that all networking
1160 has been configured"""
1161 out = lctl.device_list()
1164 def fs_is_mounted(path):
1165 """Return true if path is a mounted lustre filesystem"""
1167 fp = open('/proc/mounts')
1168 lines = fp.readlines()
1172 if a[1] == path and a[2] == 'lustre_lite':
1178 def kmod_find(src_dir, dev_dir, modname):
1179 modbase = src_dir +'/'+ dev_dir +'/'+ modname
1180 for modext in '.ko', '.o':
1181 module = modbase + modext
1183 if os.access(module, os.R_OK):
1189 def kmod_info(modname):
1190 """Returns reference count for passed module name."""
1192 fp = open('/proc/modules')
1193 lines = fp.readlines()
1196 # please forgive my tired fingers for this one
1197 ret = filter(lambda word, mod = modname: word[0] == mod,
1198 map(lambda line: string.split(line), lines))
1202 except Exception, e:
1206 """Presents kernel module"""
1207 def __init__(self, src_dir, dev_dir, name):
1208 self.src_dir = src_dir
1209 self.dev_dir = dev_dir
1212 # FIXME we ignore the failure of loading gss module, because we might
1213 # don't need it at all.
1216 log ('loading module:', self.name, 'srcdir',
1217 self.src_dir, 'devdir', self.dev_dir)
1219 module = kmod_find(self.src_dir, self.dev_dir,
1221 if not module and self.name != 'ptlrpcs_gss':
1222 panic('module not found:', self.name)
1223 (rc, out) = run('/sbin/insmod', module)
1225 if self.name == 'ptlrpcs_gss':
1226 print "Warning: not support gss security!"
1228 raise CommandError('insmod', out, rc)
1230 (rc, out) = run('/sbin/modprobe', self.name)
1232 if self.name == 'ptlrpcs_gss':
1233 print "Warning: not support gss security!"
1235 raise CommandError('modprobe', out, rc)
1239 log('unloading module:', self.name)
1240 (rc, out) = run('/sbin/rmmod', self.name)
1242 log('unable to unload module:', self.name +
1243 "(" + self.refcount() + ")")
1247 """Returns module info if any."""
1248 return kmod_info(self.name)
1251 """Returns 1 if module is loaded. Otherwise 0 is returned."""
1258 """Returns module refcount."""
1265 """Returns 1 if module is used, otherwise 0 is returned."""
1271 if users and users != '(unused)' and users != '-':
1279 """Returns 1 if module is busy, otherwise 0 is returned."""
1280 if self.loaded() and (self.used() or self.refcount() != '0'):
1286 """Manage kernel modules"""
    def __init__(self, lustre_dir, portals_dir):
        """Remember the lustre/portals module source roots and start with an
        empty, ordered list of kernel modules to manage."""
        self.lustre_dir = lustre_dir
        self.portals_dir = portals_dir
        # modules are loaded in append order and unloaded in reverse
        self.kmodule_list = []
1292 def find_module(self, modname):
1293 """Find module by module name"""
1294 for mod in self.kmodule_list:
1295 if mod.name == modname:
1299 def add_portals_module(self, dev_dir, modname):
1300 """Append a module to list of modules to load."""
1302 mod = self.find_module(modname)
1304 mod = kmod(self.portals_dir, dev_dir, modname)
1305 self.kmodule_list.append(mod)
1307 def add_lustre_module(self, dev_dir, modname):
1308 """Append a module to list of modules to load."""
1310 mod = self.find_module(modname)
1312 mod = kmod(self.lustre_dir, dev_dir, modname)
1313 self.kmodule_list.append(mod)
1315 def load_modules(self):
1316 """Load all the modules in the list in the order they appear."""
1317 for mod in self.kmodule_list:
1318 if mod.loaded() and not config.noexec:
1322 def cleanup_modules(self):
1323 """Unload the modules in the list in reverse order."""
1324 rev = self.kmodule_list
1327 if (not mod.loaded() or mod.busy()) and not config.noexec:
1330 if mod.name == 'portals' and config.dump:
1331 lctl.dump(config.dump)
1334 # ============================================================
1335 # Classes to prepare and cleanup the various objects
1338 """ Base class for the rest of the modules. The default cleanup method is
1339 defined here, as well as some utility funcs.
1341 def __init__(self, module_name, db):
1343 self.module_name = module_name
1344 self.name = self.db.getName()
1345 self.uuid = self.db.getUUID()
    def info(self, *args):
        """Print an informational line tagged with this module's name and uuid."""
        msg = string.join(map(str,args))
        print self.module_name + ":", self.name, self.uuid, msg
1354 """ default cleanup, used for most modules """
1357 lctl.cleanup(self.name, self.uuid, config.force)
1358 except CommandError, e:
1359 log(self.module_name, "cleanup failed: ", self.name)
1363 def add_module(self, manager):
1364 """Adds all needed modules in the order they appear."""
1367 def safe_to_clean(self):
    def safe_to_clean_modules(self):
        """By default, modules may be unloaded whenever the device itself is
        safe to clean; subclasses may override either check."""
        return self.safe_to_clean()
1373 class Network(Module):
1374 def __init__(self,db):
1375 Module.__init__(self, 'NETWORK', db)
1376 self.net_type = self.db.get_val('nettype')
1377 self.nid = self.db.get_val('nid', '*')
1378 self.cluster_id = self.db.get_val('clusterid', "0")
1379 self.port = self.db.get_val_int('port', 0)
1382 self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
1384 panic("unable to set nid for", self.net_type, self.nid, cluster_id)
1385 self.generic_nid = 1
1386 debug("nid:", self.nid)
1388 self.generic_nid = 0
1390 self.nid_uuid = self.nid_to_uuid(self.nid)
1391 self.hostaddr = self.db.get_hostaddr()
1392 if len(self.hostaddr) == 0:
1393 self.hostaddr.append(self.nid)
1394 if '*' in self.hostaddr[0]:
1395 self.hostaddr[0] = sys_get_local_address(self.net_type, self.hostaddr[0], self.cluster_id)
1396 if not self.hostaddr[0]:
1397 panic("unable to set hostaddr for", self.net_type, self.hostaddr[0], self.cluster_id)
1398 debug("hostaddr:", self.hostaddr[0])
1400 def add_module(self, manager):
1401 manager.add_portals_module("libcfs", 'libcfs')
1402 manager.add_portals_module("portals", 'portals')
1404 if node_needs_router():
1405 manager.add_portals_module("router", 'kptlrouter')
1406 if self.net_type == 'tcp':
1407 manager.add_portals_module("knals/socknal", 'ksocknal')
1408 if self.net_type == 'elan':
1409 manager.add_portals_module("knals/qswnal", 'kqswnal')
1410 if self.net_type == 'gm':
1411 manager.add_portals_module("knals/gmnal", 'kgmnal')
1412 if self.net_type == 'openib':
1413 manager.add_portals_module("knals/openibnal", 'kopenibnal')
1414 if self.net_type == 'iib':
1415 manager.add_portals_module("knals/iibnal", 'kiibnal')
1416 if self.net_type == 'vib':
1417 self.add_portals_module("knals/vibnal", 'kvibnal')
1418 if self.net_type == 'lo':
1419 manager.add_portals_module("knals/lonal", 'klonal')
1420 if self.net_type == 'ra':
1421 manager.add_portals_module("knals/ranal", 'kranal')
1423 def nid_to_uuid(self, nid):
1424 return "NID_%s_UUID" %(nid,)
1427 if not config.record and net_is_prepared():
1429 self.info(self.net_type, self.nid, self.port)
1430 if not (config.record and self.generic_nid):
1431 lctl.network(self.net_type, self.nid)
1432 if self.net_type == 'tcp':
1434 for hostaddr in self.db.get_hostaddr():
1435 ip = string.split(hostaddr, '/')[0]
1436 if len(string.split(hostaddr, '/')) == 2:
1437 netmask = string.split(hostaddr, '/')[1]
1440 lctl.add_interface(self.net_type, ip, netmask)
1441 if self.net_type == 'elan':
1443 if self.port and node_is_router():
1444 run_one_acceptor(self.port)
1445 self.connect_peer_gateways()
1447 def connect_peer_gateways(self):
1448 for router in self.db.lookup_class('node'):
1449 if router.get_val_int('router', 0):
1450 for netuuid in router.get_networks():
1451 net = self.db.lookup(netuuid)
1453 if (gw.cluster_id == self.cluster_id and
1454 gw.net_type == self.net_type):
1455 if gw.nid != self.nid:
1458 def disconnect_peer_gateways(self):
1459 for router in self.db.lookup_class('node'):
1460 if router.get_val_int('router', 0):
1461 for netuuid in router.get_networks():
1462 net = self.db.lookup(netuuid)
1464 if (gw.cluster_id == self.cluster_id and
1465 gw.net_type == self.net_type):
1466 if gw.nid != self.nid:
1469 except CommandError, e:
1470 print "disconnect failed: ", self.name
    def safe_to_clean(self):
        """Tearing the network down is only safe once no lustre devices
        remain configured."""
        return not net_is_prepared()
1478 self.info(self.net_type, self.nid, self.port)
1480 stop_acceptor(self.port)
1481 if node_is_router():
1482 self.disconnect_peer_gateways()
1483 if self.net_type == 'tcp':
1484 for hostaddr in self.db.get_hostaddr():
1485 ip = string.split(hostaddr, '/')[0]
1486 lctl.del_interface(self.net_type, ip)
1488 def correct_level(self, level, op=None):
1491 class RouteTable(Module):
    def __init__(self,db):
        """Initialize as a 'ROUTES' module over the node's config db."""
        Module.__init__(self, 'ROUTES', db)
1495 def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
1497 # only setup connections for tcp, openib, and iib NALs
1499 if not net_type in ('tcp','openib','iib','vib','ra'):
1502 # connect to target if route is to single node and this node is the gw
1503 if lo == hi and local_interface(net_type, gw_cluster_id, gw):
1504 if not local_cluster(net_type, tgt_cluster_id):
1505 panic("target", lo, " not on the local cluster")
1506 srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
1507 # connect to gateway if this node is not the gw
1508 elif (local_cluster(net_type, gw_cluster_id)
1509 and not local_interface(net_type, gw_cluster_id, gw)):
1510 srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
1515 panic("no server for nid", lo)
1518 return Network(srvdb)
1521 if not config.record and net_is_prepared():
1524 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1525 lctl.add_route(net_type, gw, lo, hi)
1526 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
    def safe_to_clean(self):
        """Routes may be removed only when no lustre devices remain; an
        active device means the network is still in use."""
        return not net_is_prepared()
1534 if net_is_prepared():
1535 # the network is still being used, don't clean it up
1537 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1538 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1541 lctl.disconnect(srv)
1542 except CommandError, e:
1543 print "disconnect failed: ", self.name
1548 lctl.del_route(net_type, gw, lo, hi)
1549 except CommandError, e:
1550 print "del_route failed: ", self.name
1554 class Management(Module):
    def __init__(self, db):
        """Initialize as an 'MGMT' module over the node's config db."""
        Module.__init__(self, 'MGMT', db)
1558 def add_module(self, manager):
1559 manager.add_lustre_module('lvfs', 'lvfs')
1560 manager.add_lustre_module('obdclass', 'obdclass')
1561 manager.add_lustre_module('ptlrpc', 'ptlrpc')
1562 manager.add_lustre_module('mgmt', 'mgmt_svc')
1565 if not config.record and is_prepared(self.name):
1568 lctl.newdev("mgmt", self.name, self.uuid)
1570 def safe_to_clean(self):
1574 if is_prepared(self.name):
1575 Module.cleanup(self)
1577 def correct_level(self, level, op=None):
1580 # This is only needed to load the modules; the LDLM device
1581 # is now created automatically.
# (class header line is elided from this view — these are the LDLM
# module-loader methods; there is no prepare/cleanup device work left.)
1583 def __init__(self,db):
1584 Module.__init__(self, 'LDLM', db)
1586 def add_module(self, manager):
1587 manager.add_lustre_module('lvfs', 'lvfs')
1588 manager.add_lustre_module('obdclass', 'obdclass')
# ptlrpcs / ptlrpcs_gss: the RPC security layers, loaded alongside ptlrpc.
1589 manager.add_lustre_module('sec', 'ptlrpcs')
1590 manager.add_lustre_module('ptlrpc', 'ptlrpc')
1591 manager.add_lustre_module('sec/gss', 'ptlrpcs_gss')
1599 def correct_level(self, level, op=None):
# LOV: client-side logical object volume — aggregates a set of OSC targets
# (one per OST) under a single striping device.
# config_only marks a LOV instantiated purely to write configuration; such
# an instance must never be prepared/cleaned as a live device.
1603 def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
1604 Module.__init__(self, 'LOV', db)
1605 if name_override != None:
1606 self.name = "lov_%s" % name_override
1607 self.mds_uuid = self.db.get_first_ref('mds')
1608 self.stripe_sz = self.db.get_val_int('stripesize', 1048576)
1609 self.stripe_off = self.db.get_val_int('stripeoffset', 0)
1610 self.pattern = self.db.get_val_int('stripepattern', 0)
1611 self.devlist = self.db.get_lov_tgts('lov_tgt')
1612 self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
1615 self.desc_uuid = self.uuid
1616 self.uuid = generate_client_uuid(self.name)
1617 self.fs_name = fs_name
1619 self.config_only = 1
1621 self.config_only = None
1622 mds = self.db.lookup(self.mds_uuid)
1623 self.mds_name = mds.getName()
# Build an OSC client for every (non-elided) LOV target entry.
1624 for (obd_uuid, index, gen, active) in self.devlist:
1627 self.obdlist.append(obd_uuid)
1628 obd = self.db.lookup(obd_uuid)
1629 osc = get_osc(obd, self.uuid, fs_name)
1631 self.osclist.append((osc, index, gen, active))
1633 panic('osc not found:', obd_uuid)
# prepare: set up the LOV device, then prepare and attach each OSC.
1642 if not config.record and is_prepared(self.name):
1644 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
1645 self.stripe_off, self.pattern, self.devlist,
1647 lctl.lov_setup(self.name, self.uuid, self.desc_uuid, self.stripe_cnt,
1648 self.stripe_sz, self.stripe_off, self.pattern)
1649 for (osc, index, gen, active) in self.osclist:
1650 target_uuid = osc.target_uuid
1652 # Only ignore connect failures with --force, which
1653 # isn't implemented here yet.
1655 osc.prepare(ignore_connect_failure=0)
1656 except CommandError, e:
1657 print "Error preparing OSC %s\n" % osc.uuid
1659 lctl.lov_add_osc(self.name, target_uuid, index, gen)
1662 if is_prepared(self.name):
1663 Module.cleanup(self)
1664 for (osc, index, gen, active) in self.osclist:
1665 target_uuid = osc.target_uuid
1667 if self.config_only:
1668 panic("Can't clean up config_only LOV ", self.name)
1670 def add_module(self, manager):
1671 if self.config_only:
1672 panic("Can't load modules for config_only LOV ", self.name)
1673 for (osc, index, gen, active) in self.osclist:
1674 osc.add_module(manager)
1676 manager.add_lustre_module('lov', 'lov')
1678 def correct_level(self, level, op=None):
# LMV: client-side logical metadata volume — aggregates a set of MDC
# clients (one per MDS) the same way LOV aggregates OSCs.
1682 def __init__(self, db, uuid, fs_name, name_override = None):
1683 Module.__init__(self, 'LMV', db)
1684 if name_override != None:
1685 self.name = "lmv_%s" % name_override
1687 self.devlist = self.db.get_lmv_tgts('lmv_tgt')
1688 if self.devlist == None:
# Older configs have no lmv_tgt entries; fall back to raw mds refs.
1689 self.devlist = self.db.get_refs('mds')
1692 self.desc_uuid = self.uuid
1694 self.fs_name = fs_name
1695 for mds_uuid in self.devlist:
1696 mds = self.db.lookup(mds_uuid)
1698 panic("MDS not found!")
1699 mdc = MDC(mds, self.uuid, fs_name)
1701 self.mdclist.append(mdc)
1703 panic('mdc not found:', mds_uuid)
# BUG(review): every sibling prepare() in this file guards with
# "if not config.record and is_prepared(...)"; the missing "not" here
# looks like a defect (would early-out only while recording). The
# surrounding elided lines are needed to fix it safely — confirm
# against the upstream lconf source.
1706 if config.record and is_prepared(self.name):
1710 lctl.lmv_setup(self.name, self.uuid, self.desc_uuid)
1711 for mdc in self.mdclist:
1713 # Only ignore connect failures with --force, which
1714 # isn't implemented here yet.
1715 mdc.prepare(ignore_connect_failure=0)
1716 except CommandError, e:
1717 print "Error preparing LMV %s\n" % mdc.uuid
1719 lctl.lmv_add_mdc(self.name, mdc.target_uuid)
1722 if is_prepared(self.name):
1723 Module.cleanup(self)
1724 for mdc in self.mdclist:
1727 def add_module(self, manager):
1728 for mdc in self.mdclist:
1729 mdc.add_module(manager)
1731 manager.add_lustre_module('lmv', 'lmv')
1733 def correct_level(self, level, op=None):
# GKD: gatekeeper (security key-management) service device. Creates the
# "gks" server device plus a shared "GKT" thread/target device; the class
# header line is elided from this view.
1737 def __init__(self,db):
1738 Module.__init__(self, 'GKD', db)
1739 target_uuid = self.db.get_first_ref('target')
1740 self.target = self.db.lookup(target_uuid)
1741 self.name = self.target.getName()
1743 active_uuid = get_active_target(self.target)
1745 panic("No target device found:", target_uuid)
# Only the node hosting the active target instance runs the service.
1746 if active_uuid == self.uuid:
1751 self.uuid = target_uuid
1754 if is_prepared(self.name):
1757 debug(self.uuid, "not active")
1761 lctl.newdev("gks", self.name, self.uuid, setup ="")
1762 if not is_prepared('GKT'):
1763 lctl.newdev("gkt", 'GKT', 'GKT_UUID', setup ="")
1767 debug(self.uuid, "not active")
1770 if is_prepared(self.name):
1772 lctl.cleanup(self.name, self.uuid, config.force,
1774 except CommandError, e:
1775 log(self.module_name, "cleanup failed: ", self.name)
1778 Module.cleanup(self)
# GKT is shared; tear it down too, tolerating failure.
1779 if is_prepared('GKT'):
1781 lctl.cleanup("GKT", "GKT_UUID", config.force,
1783 except CommandError, e:
1784 print "cleanup failed: ", self.name
1788 def add_module(self, manager):
1790 manager.add_lustre_module('sec/gks', 'gks')
1791 manager.add_lustre_module('sec/gks', 'gkc')
1793 def correct_level(self, level, op=None):
# CONFDEV: the "confobd" configuration device that fronts an MDS or OST
# backing store. It formats/mounts the device, and write_conf() records
# the lctl configuration llogs (server-side and per-client) onto it.
1796 class CONFDEV(Module):
1797 def __init__(self, db, name, target_uuid, uuid):
1798 Module.__init__(self, 'CONFDEV', db)
1799 self.devpath = self.db.get_val('devpath','')
1800 self.backdevpath = self.db.get_val('devpath','')
1801 self.size = self.db.get_val_int('devsize', 0)
1802 self.journal_size = self.db.get_val_int('journalsize', 0)
1803 self.fstype = self.db.get_val('fstype', '')
1804 self.backfstype = self.db.get_val('backfstype', '')
1805 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1806 self.mountfsoptions = self.db.get_val('mountfsoptions', '')
1807 self.target = self.db.lookup(target_uuid)
1808 self.name = "conf_%s" % self.target.getName()
1809 self.client_uuids = self.target.get_refs('client')
1810 self.fs_uuid = self.db.get_first_ref('filesystem')
1811 self.obdtype = self.db.get_val('obdtype', '')
# Security flavors from the config DB; command-line options override,
# and --mds_deny_sec entries are appended to any DB-supplied list.
1813 self.mds_sec = self.db.get_val('mds_sec', '')
1814 self.oss_sec = self.db.get_val('oss_sec', '')
1815 self.deny_sec = self.db.get_val('deny_sec', '')
1817 if config.mds_mds_sec:
1818 self.mds_sec = config.mds_mds_sec
1819 if config.mds_oss_sec:
1820 self.oss_sec = config.mds_oss_sec
1821 if config.mds_deny_sec:
1823 self.deny_sec = "%s,%s" %(self.deny_sec, config.mds_deny_sec)
1825 self.deny_sec = config.mds_deny_sec
1827 if self.obdtype == None:
1828 self.obdtype = 'dumb'
1830 self.conf_name = name
1831 self.conf_uuid = uuid
1832 self.realdev = self.devpath
# If an LMV sits above this target, clients attach via the LMV instead.
1837 lmv_uuid = self.db.get_first_ref('lmv')
1838 if lmv_uuid != None:
1839 self.lmv = self.db.lookup(lmv_uuid)
1840 if self.lmv != None:
1841 self.client_uuids = self.lmv.get_refs('client')
# MDS targets default autoformat to "no"; OST path defaults to "yes".
1843 if self.target.get_class() == 'mds':
1844 if self.target.get_val('failover', 0):
1845 self.failover_mds = 'f'
1847 self.failover_mds = 'n'
1848 self.format = self.db.get_val('autoformat', "no")
1850 self.format = self.db.get_val('autoformat', "yes")
1851 self.osdtype = self.db.get_val('osdtype')
1852 ost = self.db.lookup(target_uuid)
1853 if ost.get_val('failover', 0):
1854 self.failover_ost = 'f'
1856 self.failover_ost = 'n'
1858 self.inode_size = self.get_inode_size()
1860 if self.lmv != None:
1861 client_uuid = self.name + "_lmv_UUID"
1862 self.master = LMV(self.lmv, client_uuid,
1863 self.conf_name, self.conf_name)
# Pick the MDS inode size: explicit config value wins; otherwise derive
# it from the stripe count of the LOV this MDS serves (wider striping
# needs larger inodes to hold the EA), falling back to 256.
1865 def get_inode_size(self):
1866 inode_size = self.db.get_val_int('inodesize', 0)
1867 if inode_size == 0 and self.target.get_class() == 'mds':
1869 # default inode size for case when neither LOV either
1870 # LMV is accessible.
1871 self.inode_size = 256
1873 # find the LOV for this MDS
1874 lovconfig_uuid = self.target.get_first_ref('lovconfig')
1875 if lovconfig_uuid or self.lmv != None:
1876 if self.lmv != None:
1877 lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
1878 lovconfig = self.lmv.lookup(lovconfig_uuid)
1879 lov_uuid = lovconfig.get_first_ref('lov')
1880 if lov_uuid == None:
1881 panic(self.target.getName() + ": No LOV found for lovconfig ",
1884 lovconfig = self.target.lookup(lovconfig_uuid)
1885 lov_uuid = lovconfig.get_first_ref('lov')
1886 if lov_uuid == None:
1887 panic(self.target.getName() + ": No LOV found for lovconfig ",
1889 if self.lmv != None:
1890 lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
1891 lovconfig = self.lmv.lookup(lovconfig_uuid)
1892 lov_uuid = lovconfig.get_first_ref('lov')
1894 lov = LOV(self.db.lookup(lov_uuid), lov_uuid, self.name,
1897 # default stripe count controls default inode_size
1898 if lov.stripe_cnt > 0:
1899 stripe_count = lov.stripe_cnt
1901 stripe_count = len(lov.devlist)
1902 if stripe_count > 77:
1904 elif stripe_count > 35:
1906 elif stripe_count > 13:
1908 #elif stripe_count > 3:
# Compose the mount options string: per-fstype defaults, then
# command-line --mountfsoptions, then per-device options, plus the
# smfs back-device tuple and MDS-specific ACL/xattr options.
1915 def get_mount_options(self, blkdev):
1916 options = def_mount_options(self.fstype,
1917 self.target.get_class())
1919 if config.mountfsoptions:
1921 options = "%s,%s" %(options, config.mountfsoptions)
1923 options = config.mountfsoptions
1924 if self.mountfsoptions:
1925 options = "%s,%s" %(options, self.mountfsoptions)
1927 if self.mountfsoptions:
1929 options = "%s,%s" %(options, self.mountfsoptions)
1931 options = self.mountfsoptions
1933 if self.fstype == 'smfs':
1935 options = "%s,type=%s,dev=%s" %(options, self.backfstype,
1938 options = "type=%s,dev=%s" %(self.backfstype,
1941 if self.target.get_class() == 'mds':
1943 options = "%s,acl,user_xattr,iopen_nopriv" %(options)
1945 options = "iopen_nopriv"
# prepare: format/attach the block device and create the confobd.
1950 if is_prepared(self.name):
1953 blkdev = block_dev(self.devpath, self.size, self.fstype,
1954 config.reformat, self.format, self.journal_size,
1955 self.inode_size, self.mkfsoptions, self.backfstype,
1958 if self.fstype == 'smfs':
1963 mountfsoptions = self.get_mount_options(blkdev)
1965 self.info(self.target.get_class(), realdev, mountfsoptions,
1966 self.fstype, self.size, self.format)
1968 lctl.newdev("confobd", self.name, self.uuid,
1969 setup ="%s %s %s" %(realdev, self.fstype,
# Cache the resolved values for write_conf()'s use.
1972 self.mountfsoptions = mountfsoptions
1973 self.realdev = realdev
1975 def add_module(self, manager):
1976 manager.add_lustre_module('obdclass', 'confobd')
1978 # this method checks if current OBD belong to the same FS as passed
1979 # mount point uuid. If not - do not write mountpoint and echo client
1980 # to log, it is not needed, but take damn long time (WB test case)
1982 def belong_to_fs(self, mtpt_uuid):
1983 mtpt = self.db.lookup(mtpt_uuid)
1984 fs_uuid = mtpt.get_first_ref('filesystem')
1986 if not self.fs_uuid or self.fs_uuid == "" or fs_uuid == self.fs_uuid:
# write_conf: record the server-side config llogs ("<target>-conf",
# OSS/MDS setup) and then re-run lconf with --record for every client
# node profile that mounts this filesystem, capturing per-client logs.
1991 def write_conf(self):
1992 if self.target.get_class() == 'ost':
1994 lctl.clear_log(self.name, self.target.getName() + '-conf')
1995 lctl.record(self.name, self.target.getName() + '-conf')
1996 lctl.newdev(self.osdtype, self.conf_name, self.conf_uuid,
1997 setup ="%s %s %s %s" %(self.realdev, self.fstype,
1999 self.mountfsoptions))
2001 lctl.clear_log(self.name, 'OSS-conf')
2002 lctl.record(self.name, 'OSS-conf')
2003 lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
2008 if self.target.get_class() == 'mds':
2009 if self.master != None:
2010 master_name = self.master.name
2012 master_name = 'dumb'
2015 lctl.clear_log(self.name, self.target.getName() + '-conf')
2016 lctl.record(self.name, self.target.getName() + '-conf')
2017 lctl.attach("mds", self.conf_name, self.conf_uuid)
2019 lctl.set_security(self.conf_name, "mds_sec", self.mds_sec)
2021 lctl.set_security(self.conf_name, "oss_sec", self.oss_sec)
2023 for flavor in string.split(self.deny_sec, ','):
2024 lctl.set_security(self.conf_name, "deny_sec", flavor)
2025 lctl.newdev("mds", self.conf_name, self.conf_uuid,
2026 setup ="%s %s %s %s %s %s" %(self.realdev, self.fstype,
2027 self.conf_name, self.mountfsoptions,
2028 master_name, self.obdtype))
2032 if not self.client_uuids:
2036 client_uuid = self.conf_name + "_lmv_UUID"
2037 lmv = VMDC(self.lmv, client_uuid, self.conf_name, self.conf_name);
# Record mount-option and cleanup logs for each attached client.
2041 for uuid in self.client_uuids:
2042 log("recording client:", uuid)
2043 client_uuid = generate_client_uuid(self.name)
2044 client = VOSC(self.db.lookup(uuid), client_uuid,
2045 self.target.getName(), self.name)
2047 lctl.clear_log(self.name, self.target.getName())
2048 lctl.record(self.name, self.target.getName())
2050 lctl.mount_option(self.target.getName(), client.get_name(), "", "")
2052 process_updates(self.db, self.name, self.target.getName(), lmv, client)
2054 lctl.clear_log(self.name, self.target.getName() + '-clean')
2055 lctl.record(self.name, self.target.getName() + '-clean')
2057 lctl.del_mount_option(self.target.getName())
2065 # record logs for each client
2067 config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
2069 config_options = CONFIG_FILE
2071 for node_db in self.db.lookup_class('node'):
2072 client_name = node_db.getName()
2073 for prof_uuid in node_db.get_refs('profile'):
2074 prof_db = node_db.lookup(prof_uuid)
2075 # refactor this into a funtion to test "clientness"
2077 for ref_class, ref_uuid in prof_db.get_all_refs():
2078 if ref_class in ('mountpoint','echoclient') and self.belong_to_fs(ref_uuid):
2079 debug("recording:", client_name)
2080 log("recording mountpoint:", ref_uuid)
# Re-invoke this script (sys.argv[0]) in --record mode to
# generate the client's setup and cleanup llogs; noexec is
# suspended for the duration and restored afterwards.
2081 old_noexec = config.noexec
2083 noexec_opt = ('', '-n')
2084 ret, out = run (sys.argv[0],
2085 noexec_opt[old_noexec == 1],
2086 " -v --record --nomod",
2087 "--record_log", client_name,
2088 "--record_device", self.name,
2089 "--node", client_name,
2092 for s in out: log("record> ", string.strip(s))
2093 ret, out = run (sys.argv[0],
2094 noexec_opt[old_noexec == 1],
2095 "--cleanup -v --record --nomod",
2096 "--record_log", client_name + "-clean",
2097 "--record_device", self.name,
2098 "--node", client_name,
2101 for s in out: log("record> ", string.strip(s))
2102 config.noexec = old_noexec
# start: replay the recorded "<conf_name>" log; for OSTs also start OSS.
2106 lctl.start(self.name, self.conf_name)
2107 except CommandError, e:
2109 if self.target.get_class() == 'ost':
2110 if not is_prepared('OSS'):
2112 lctl.start(self.name, 'OSS')
2113 except CommandError, e:
2117 if is_prepared(self.name):
2119 lctl.cleanup(self.name, self.uuid, 0, 0)
2120 clean_dev(self.devpath, self.fstype,
2121 self.backfstype, self.backdevpath)
2122 except CommandError, e:
2123 log(self.module_name, "cleanup failed: ", self.name)
2126 Module.cleanup(self)
# MDSDEV: metadata server device. Delegates the on-disk work to a CONFDEV
# ("confobd") instance and handles MDS-specific module loading, MDT thread
# device creation, upcall configuration and root-squash settings.
2128 class MDSDEV(Module):
2129 def __init__(self,db):
2130 Module.__init__(self, 'MDSDEV', db)
2131 self.devpath = self.db.get_val('devpath','')
2132 self.backdevpath = self.db.get_val('devpath','')
2133 self.size = self.db.get_val_int('devsize', 0)
2134 self.journal_size = self.db.get_val_int('journalsize', 0)
2135 self.fstype = self.db.get_val('fstype', '')
2136 self.backfstype = self.db.get_val('backfstype', '')
2137 self.nspath = self.db.get_val('nspath', '')
2138 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
2139 self.mountfsoptions = self.db.get_val('mountfsoptions', '')
2140 self.obdtype = self.db.get_val('obdtype', '')
2141 self.root_squash = self.db.get_val('root_squash', '')
2142 self.no_root_squash = self.db.get_val('no_root_squash', '')
2144 target_uuid = self.db.get_first_ref('target')
2145 self.target = self.db.lookup(target_uuid)
2146 self.name = self.target.getName()
2150 lmv_uuid = self.db.get_first_ref('lmv')
2151 if lmv_uuid != None:
2152 self.lmv = self.db.lookup(lmv_uuid)
2154 active_uuid = get_active_target(self.target)
2156 panic("No target device found:", target_uuid)
# This node only runs the MDS if it hosts the active target instance
# and matches any --group restriction.
2157 if active_uuid == self.uuid:
2159 group = self.target.get_val('group')
2160 if config.group and config.group != group:
2165 self.uuid = target_uuid
2168 if self.lmv != None:
2169 client_uuid = self.name + "_lmv_UUID"
2170 self.master = LMV(self.lmv, client_uuid,
2171 self.name, self.name)
2173 self.confobd = CONFDEV(self.db, self.name,
2174 target_uuid, self.uuid)
2176 def add_module(self, manager):
2178 manager.add_lustre_module('mdc', 'mdc')
2179 manager.add_lustre_module('osc', 'osc')
2180 manager.add_lustre_module('ost', 'ost')
2181 manager.add_lustre_module('lov', 'lov')
2182 manager.add_lustre_module('mds', 'mds')
2184 if self.fstype == 'smfs' or self.fstype == 'ldiskfs':
2185 manager.add_lustre_module(self.fstype, self.fstype)
2188 manager.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
2190 # if fstype is smfs, then we should also take care about backing
2192 if self.fstype == 'smfs':
2193 manager.add_lustre_module(self.backfstype, self.backfstype)
2194 manager.add_lustre_module('lvfs', 'fsfilt_%s' % (self.backfstype))
# snap mount option requires smfs plus the snapshot fsfilt plugins.
2196 for option in string.split(self.mountfsoptions, ','):
2197 if option == 'snap':
2198 if not self.fstype == 'smfs':
2199 panic("mountoptions has 'snap', but fstype is not smfs.")
2200 manager.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.fstype))
2201 manager.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.backfstype))
2204 if self.master != None:
2205 self.master.add_module(manager)
2207 # add CONFDEV modules
2208 if self.confobd != None:
2209 self.confobd.add_module(manager)
# write_conf: run the confobd prepare/record/cleanup cycle when invoked
# with --write_conf; skipped for inactive targets.
2211 def write_conf(self):
2212 if config.write_conf:
2214 debug(self.uuid, "not active")
2216 self.confobd.write_conf()
2218 if is_prepared(self.name):
2221 debug(self.uuid, "not active")
2224 self.confobd.prepare()
2225 self.confobd.write_conf()
2226 self.confobd.cleanup()
# prepare: bring up confobd (and LMV master if any), start the MDS from
# its recorded log, ensure the shared MDT device, install development
# lsd/lacl upcalls, and apply root-squash configuration.
2229 if is_prepared(self.name):
2232 debug(self.uuid, "not active")
2236 self.confobd.prepare()
2238 self.confobd.write_conf()
2241 if self.master != None:
2242 self.master.prepare()
2244 if not config.record:
2245 self.confobd.start()
2247 if not is_prepared('MDT'):
2248 lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
2250 if development_mode():
2251 # set lsd upcall path
2252 procentry = "/proc/fs/lustre/mds/lsd_upcall"
2253 upcall = os.path.abspath(os.path.dirname(sys.argv[0]) + "/lsd_upcall")
2254 if not (os.access(procentry, os.R_OK) and os.access(upcall, os.R_OK)):
2255 print "MDS Warning: failed to set lsd cache upcall"
2257 run("echo ", upcall, " > ", procentry)
2258 # set lacl upcall path
2259 procentry = "/proc/fs/lustre/mds/lacl_upcall"
2260 upcall = os.path.abspath(os.path.dirname(sys.argv[0]) + "/lacl_upcall")
2261 if not (os.access(procentry, os.R_OK) and os.access(upcall, os.R_OK)):
2262 print "MDS Warning: failed to set remote acl upcall"
2264 run("echo ", upcall, " > ", procentry)
# Command-line root-squash settings default to the config DB values.
2266 if config.root_squash == None:
2267 config.root_squash = self.root_squash
2268 if config.no_root_squash == None:
2269 config.no_root_squash = self.no_root_squash
2270 if config.root_squash:
2271 if config.no_root_squash:
2272 nsnid = config.no_root_squash
2275 lctl.root_squash(self.name, config.root_squash, nsnid)
# True while any 'mds' device is still present in lctl device_list.
2277 def msd_remaining(self):
2278 out = lctl.device_list()
2280 if string.split(s)[2] in ('mds',):
2283 def safe_to_clean(self):
2286 def safe_to_clean_modules(self):
2287 return not self.msd_remaining()
2291 debug(self.uuid, "not active")
2294 if is_prepared(self.name):
2296 lctl.cleanup(self.name, self.uuid, config.force,
2298 except CommandError, e:
2299 log(self.module_name, "cleanup failed: ", self.name)
2302 Module.cleanup(self)
2304 if self.master != None:
2305 self.master.cleanup()
# The shared MDT device is torn down only once no MDS devices remain.
2306 if not self.msd_remaining() and is_prepared('MDT'):
2308 lctl.cleanup("MDT", "MDT_UUID", config.force,
2310 except CommandError, e:
2311 print "cleanup failed: ", self.name
2316 self.confobd.cleanup()
2318 def correct_level(self, level, op=None):
2319 #if self.master != None:
# OSD: object storage device (OST backing store). Like MDSDEV, the actual
# device work is delegated to a CONFDEV, except for the 'obdecho' osdtype
# which is a memory-only echo server needing no backing store. The class
# header line is elided from this view.
2324 def __init__(self, db):
2325 Module.__init__(self, 'OSD', db)
2326 self.osdtype = self.db.get_val('osdtype')
2327 self.devpath = self.db.get_val('devpath', '')
2328 self.backdevpath = self.db.get_val('devpath', '')
2329 self.size = self.db.get_val_int('devsize', 0)
2330 self.journal_size = self.db.get_val_int('journalsize', 0)
2331 self.inode_size = self.db.get_val_int('inodesize', 0)
2332 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
2333 self.mountfsoptions = self.db.get_val('mountfsoptions', '')
2334 self.fstype = self.db.get_val('fstype', '')
2335 self.backfstype = self.db.get_val('backfstype', '')
2336 self.nspath = self.db.get_val('nspath', '')
2337 target_uuid = self.db.get_first_ref('target')
2338 ost = self.db.lookup(target_uuid)
2339 self.name = ost.getName()
2340 self.format = self.db.get_val('autoformat', 'yes')
2341 if ost.get_val('failover', 0):
2342 self.failover_ost = 'f'
2344 self.failover_ost = 'n'
2346 self.deny_sec = self.db.get_val('deny_sec', '')
# --ost_deny_sec flavors are appended to any DB-supplied deny list.
2348 if config.ost_deny_sec:
2350 self.deny_sec = "%s,%s" %(self.deny_sec, config.ost_deny_sec)
2352 self.deny_sec = config.ost_deny_sec
2354 active_uuid = get_active_target(ost)
2356 panic("No target device found:", target_uuid)
2357 if active_uuid == self.uuid:
2359 group = ost.get_val('group')
2360 if config.group and config.group != group:
2365 self.uuid = target_uuid
2366 self.confobd = CONFDEV(self.db, self.name,
2367 target_uuid, self.uuid)
2369 def add_module(self, manager):
2372 manager.add_lustre_module('ost', 'ost')
2374 if self.fstype == 'smfs' or self.fstype == 'ldiskfs':
2375 manager.add_lustre_module(self.fstype, self.fstype)
2378 manager.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
2380 if self.fstype == 'smfs':
2381 manager.add_lustre_module(self.backfstype, self.backfstype)
2382 manager.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.backfstype))
# NOTE(review): this iterates the option string character-by-character
# (MDSDEV uses string.split(self.mountfsoptions, ',')), so 'snap' can
# never match a single character — the snap modules are never added on
# this path. Suspected bug; confirm against upstream lconf.
2384 for option in self.mountfsoptions:
2385 if option == 'snap':
2386 if not self.fstype == 'smfs':
2387 panic("mountoptions with snap, but fstype is not smfs\n")
2388 manager.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.fstype))
2389 manager.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.backfstype))
2391 manager.add_lustre_module(self.osdtype, self.osdtype)
2393 # add CONFDEV modules
2394 if self.confobd != None:
2395 self.confobd.add_module(manager)
# prepare: obdecho is created directly; real OSDs go through confobd,
# then the deny_sec flavors are pushed to the running device.
2398 if is_prepared(self.name):
2401 debug(self.uuid, "not active")
2406 if self.osdtype == 'obdecho':
2407 self.info(self.osdtype)
2408 lctl.newdev("obdecho", self.name, self.uuid)
2409 if not is_prepared('OSS'):
2410 lctl.newdev("ost", 'OSS', 'OSS_UUID', setup="")
2412 self.confobd.prepare()
2414 self.confobd.write_conf()
2415 if not config.record:
2416 self.confobd.start()
2419 for flavor in string.split(self.deny_sec, ','):
2420 lctl.set_security(self.name, "deny_sec", flavor)
2422 def write_conf(self):
2423 if is_prepared(self.name):
2426 debug(self.uuid, "not active")
2430 if self.osdtype != 'obdecho':
2431 self.confobd.prepare()
2432 self.confobd.write_conf()
2433 if not config.write_conf:
2434 self.confobd.start()
2435 self.confobd.cleanup()
# True while any obdfilter/obdecho device remains in lctl device_list.
2437 def osd_remaining(self):
2438 out = lctl.device_list()
2440 if string.split(s)[2] in ('obdfilter', 'obdecho'):
2443 def safe_to_clean(self):
2446 def safe_to_clean_modules(self):
2447 return not self.osd_remaining()
2451 debug(self.uuid, "not active")
2454 if is_prepared(self.name):
2457 lctl.cleanup(self.name, self.uuid, config.force,
2459 except CommandError, e:
2460 log(self.module_name, "cleanup failed: ", self.name)
# The shared OSS device is torn down only once no OSDs remain.
2463 if not self.osd_remaining() and is_prepared('OSS'):
2465 lctl.cleanup("OSS", "OSS_UUID", config.force,
2467 except CommandError, e:
2468 print "cleanup failed: ", self.name
2472 if self.osdtype != 'obdecho':
2474 self.confobd.cleanup()
2476 def correct_level(self, level, op=None):
2479 # Generic client module, used by OSC and MDC
2480 class Client(Module):
2481 def __init__(self, tgtdb, uuid, module, fs_name,
2482 self_name=None, module_dir=None):
2483 self.target_name = tgtdb.getName()
2484 self.target_uuid = tgtdb.getUUID()
2485 self.module_dir = module_dir
2486 self.backup_targets = []
2487 self.module = module
2490 self.module = module
2491 self.module_name = string.upper(module)
2492 self.fs_name = fs_name
# Default device name encodes module, host, target and fs; a caller
# may override it via self_name.
2494 self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
2495 self.target_name, fs_name)
2497 self.name = self_name
2498 if not self.module_dir:
2499 self.module_dir = module
2501 self.tgt_dev_uuid = get_active_target(tgtdb)
2502 if not self.tgt_dev_uuid:
2503 panic("No target device found for target(1):", self.target_name)
2507 self.lookup_server(tgtdb, self.tgt_dev_uuid)
2508 self.lookup_backup_targets()
2510 def add_module(self, manager):
2511 manager.add_lustre_module(self.module_dir, self.module)
2513 def lookup_server(self, db, srv_uuid):
2514 """ Lookup a server's network information """
2515 self._server_nets = get_ost_net(db, srv_uuid)
2516 if len(self._server_nets) == 0:
2517 panic ("Unable to find a server for:", srv_uuid)
2522 def get_servers(self):
2523 return self._server_nets
# Find failover replicas: any osd/mdsdev profile entry that serves the
# same target uuid through a different device.
2525 def lookup_backup_targets(self):
2526 """ Lookup alternative network information """
2527 prof_list = toplustreDB.get_refs('profile')
2528 for prof_uuid in prof_list:
2529 prof_db = toplustreDB.lookup(prof_uuid)
2531 panic("profile:", prof_uuid, "not found.")
2532 for ref_class, ref_uuid in prof_db.get_all_refs():
2533 if ref_class in ('osd', 'mdsdev'):
2534 devdb = toplustreDB.lookup(ref_uuid)
2535 uuid = devdb.get_first_ref('target')
2536 if self.target_uuid == uuid and self.tgt_dev_uuid != ref_uuid:
2537 self.backup_targets.append(ref_uuid)
# prepare: connect to the (possibly routed) server, create the client
# device, then register a connection for every backup target.
2539 def prepare(self, ignore_connect_failure = 0):
2540 self.info(self.target_uuid)
2541 if not config.record and is_prepared(self.name):
2544 srv = choose_local_server(self.get_servers())
2548 routes = find_route(self.get_servers())
2549 if len(routes) == 0:
2550 panic ("no route to", self.target_uuid)
2551 for (srv, r) in routes:
2552 lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
2553 except CommandError, e:
2554 if not ignore_connect_failure:
# --inactive targets are created deactivated when the module allows it.
2558 if self.target_uuid in config.inactive and self.permits_inactive():
2559 debug("%s inactive" % self.target_uuid)
2560 inactive_p = "inactive"
2562 debug("%s active" % self.target_uuid)
2564 lctl.newdev(self.module, self.name, self.uuid,
2565 setup ="%s %s %s" % (self.target_uuid, srv.nid_uuid,
2567 for tgt_dev_uuid in self.backup_targets:
2568 this_nets = get_ost_net(toplustreDB, tgt_dev_uuid)
2569 if len(this_nets) == 0:
2570 panic ("Unable to find a server for:", tgt_dev_uuid)
2571 srv = choose_local_server(this_nets)
2575 routes = find_route(this_nets);
2576 if len(routes) == 0:
2577 panic("no route to", tgt_dev_uuid)
2578 for (srv, r) in routes:
# BUG(review): "r[0]. srv.nid_uuid" — the '.' should be a ','
# (compare the working call at line 2552). As written this raises
# AttributeError whenever a routed backup target is configured.
2579 lctl.add_route_host(r[0]. srv.nid_uuid, r[1], r[3])
2581 lctl.add_conn(self.name, srv.nid_uuid);
2584 if is_prepared(self.name):
2585 Module.cleanup(self)
2587 srv = choose_local_server(self.get_servers())
2589 lctl.disconnect(srv)
2591 for (srv, r) in find_route(self.get_servers()):
2592 lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
2593 except CommandError, e:
2594 log(self.module_name, "cleanup failed: ", self.name)
2598 for tgt_dev_uuid in self.backup_targets:
2599 this_net = get_ost_net(toplustreDB, tgt_dev_uuid)
2600 srv = choose_local_server(this_net)
2602 lctl.disconnect(srv)
2604 for (srv, r) in find_route(this_net):
# BUG(review): same '.'-for-',' typo as line 2579 — should be
# del_route_host(r[0], srv.nid_uuid, r[1], r[3]) as at line 2592.
2605 lctl.del_route_host(r[0]. srv.nid_uuid, r[1], r[3])
2607 def correct_level(self, level, op=None):
# deactivate: best-effort; failure is logged, not fatal.
2610 def deactivate(self):
2612 lctl.deactivate(self.name)
2613 except CommandError, e:
2614 log(self.module_name, "deactivate failed: ", self.name)
# GKC: gatekeeper client — a thin Client subclass using the 'gkc' module.
# (class header line elided from this view)
2619 def __init__(self, db, uuid, fs_name):
2620 Client.__init__(self, db, uuid, 'gkc', fs_name)
2622 def permits_inactive(self):
2626 def __init__(self, db, uuid, fs_name):
2627 Client.__init__(self, db, uuid, 'mdc', fs_name)
2629 def permits_inactive(self):
# OSC: object storage client — Client subclass using the 'osc' module.
# (class header line elided from this view)
2633 def __init__(self, db, uuid, fs_name):
2634 Client.__init__(self, db, uuid, 'osc', fs_name)
2636 def permits_inactive(self):
# CMOBD: cache-manager OBD — pairs a "master" device (lov/ost/mds/lmv)
# with a "cache" device (ost/mds) for write-back caching.
2639 class CMOBD(Module):
2640 def __init__(self, db):
2641 Module.__init__(self, 'CMOBD', db)
2642 self.name = self.db.getName();
2643 self.uuid = generate_client_uuid(self.name)
2644 self.master_uuid = self.db.get_first_ref('masterobd')
2645 self.cache_uuid = self.db.get_first_ref('cacheobd')
2647 master_obd = self.db.lookup(self.master_uuid)
2649 panic('master obd not found:', self.master_uuid)
2651 cache_obd = self.db.lookup(self.cache_uuid)
2653 panic('cache obd not found:', self.cache_uuid)
# Build the master client according to its config class.
2658 master_class = master_obd.get_class()
2659 cache_class = cache_obd.get_class()
2661 if master_class == 'lov':
2662 client_uuid = "%s_lov_master_UUID" % (self.name)
2663 self.master = LOV(master_obd, client_uuid, self.name,
2664 "master_%s" % (self.name));
2665 elif master_class == 'ost':
2666 client_uuid = "%s_ost_master_UUID" % (self.name)
2667 self.master = get_osc(master_obd, client_uuid, self.master_uuid)
2668 elif master_class == 'mds':
2669 client_uuid = "%s_mds_master_UUID" % (self.name)
2670 self.master = get_mdc(master_obd, self.master_uuid, client_uuid)
2671 elif master_class == 'lmv':
2672 client_uuid = "%s_lmv_master_UUID" % (self.name)
2673 self.master = LMV(master_obd, client_uuid, self.name,
2674 "master_%s" % (self.name));
2676 panic("unknown master obd class '%s'" %(master_class))
2678 if cache_class == 'ost':
2679 self.cache = get_osc(cache_obd, cache_obd.getUUID(),
2681 elif cache_class == 'mds':
2682 self.cache = get_mdc(cache_obd, self.cache_uuid,
2683 cache_obd.getUUID())
2685 panic("invalid cache obd class '%s'" %(cache_class))
# prepare: bring up the master side, then create the cmobd device.
2688 if not config.record and is_prepared(self.name):
2690 self.info(self.master_uuid, self.cache_uuid)
2691 self.master.prepare()
2692 lctl.newdev("cmobd", self.name, self.uuid,
2693 setup ="%s %s" %(self.master.uuid,
2702 def get_master_name(self):
2703 return self.master.name
2705 def get_cache_name(self):
2706 return self.cache.name
2709 if is_prepared(self.name):
2710 Module.cleanup(self)
2711 self.master.cleanup()
2713 def add_module(self, manager):
2714 manager.add_lustre_module('smfs', 'smfs')
2715 manager.add_lustre_module('cmobd', 'cmobd')
2716 self.master.add_module(manager)
2718 def correct_level(self, level, op=None):
# COBD: caching OBD — like CMOBD but both master and cache sides are full
# client stacks (LOV/MDC/LMV), and cleanup tears down both sides.
# (class header line elided from this view)
2722 def __init__(self, db, uuid, name):
2723 Module.__init__(self, 'COBD', db)
2724 self.name = self.db.getName();
2725 self.uuid = generate_client_uuid(self.name)
2726 self.master_uuid = self.db.get_first_ref('masterobd')
2727 self.cache_uuid = self.db.get_first_ref('cacheobd')
2729 master_obd = self.db.lookup(self.master_uuid)
2731 panic('master obd not found:', self.master_uuid)
2733 cache_obd = self.db.lookup(self.cache_uuid)
2735 panic('cache obd not found:', self.cache_uuid)
2740 master_class = master_obd.get_class()
2741 cache_class = cache_obd.get_class()
# Note: unlike CMOBD, a plain 'ost' master/cache is wrapped in a LOV.
2743 if master_class == 'ost' or master_class == 'lov':
2744 client_uuid = "%s_lov_master_UUID" % (self.name)
2745 self.master = LOV(master_obd, client_uuid, name,
2746 "master_%s" % (self.name));
2747 elif master_class == 'mds':
2748 self.master = get_mdc(db, self.master_uuid, name)
2749 elif master_class == 'lmv':
2750 client_uuid = "%s_lmv_master_UUID" % (self.name)
2751 self.master = LMV(master_obd, client_uuid, self.name,
2752 "master_%s" % (self.name));
2754 panic("unknown master obd class '%s'" %(master_class))
2756 if cache_class == 'ost' or cache_class == 'lov':
2757 client_uuid = "%s_lov_cache_UUID" % (self.name)
2758 self.cache = LOV(cache_obd, client_uuid, name,
2759 "cache_%s" % (self.name));
2760 elif cache_class == 'mds':
2761 self.cache = get_mdc(db, self.cache_uuid, name)
2762 elif cache_class == 'lmv':
2763 client_uuid = "%s_lmv_cache_UUID" % (self.name)
2764 self.cache = LMV(cache_obd, client_uuid, self.name,
2765 "cache_%s" % (self.name));
2767 panic("unknown cache obd class '%s'" %(cache_class))
2775 def get_master_name(self):
2776 return self.master.name
2778 def get_cache_name(self):
2779 return self.cache.name
# prepare: both sides come up before the cobd device is created.
2782 if not config.record and is_prepared(self.name):
2784 self.master.prepare()
2785 self.cache.prepare()
2786 self.info(self.master_uuid, self.cache_uuid)
2787 lctl.newdev("cobd", self.name, self.uuid,
2788 setup ="%s %s" %(self.master.name,
2792 if is_prepared(self.name):
2793 Module.cleanup(self)
2794 self.master.cleanup()
2795 self.cache.cleanup()
2797 def add_module(self, manager):
2798 manager.add_lustre_module('cobd', 'cobd')
2799 self.master.add_module(manager)
2801 # virtual interface for OSC and LOV
2803 def __init__(self, db, client_uuid, name, name_override = None):
2804 Module.__init__(self, 'VOSC', db)
2805 if db.get_class() == 'lov':
2806 self.osc = LOV(db, client_uuid, name, name_override)
2808 elif db.get_class() == 'cobd':
2809 self.osc = COBD(db, client_uuid, name)
2812 self.osc = OSC(db, client_uuid, name)
2816 return self.osc.get_uuid()
2819 return self.osc.get_name()
2827 def add_module(self, manager):
2828 self.osc.add_module(manager)
2830 def correct_level(self, level, op=None):
2831 return self.osc.correct_level(level, op)
2833 # virtual interface for MDC and LMV
2835 def __init__(self, db, client_uuid, name, name_override = None):
2836 Module.__init__(self, 'VMDC', db)
2837 if db.get_class() == 'lmv':
2838 self.mdc = LMV(db, client_uuid, name, name_override)
2839 elif db.get_class() == 'cobd':
2840 self.mdc = COBD(db, client_uuid, name)
2842 self.mdc = MDC(db, client_uuid, name)
2845 return self.mdc.uuid
2848 return self.mdc.name
2856 def add_module(self, manager):
2857 self.mdc.add_module(manager)
2859 def correct_level(self, level, op=None):
2860 return self.mdc.correct_level(level, op)
# ECHO_CLIENT: test/benchmark client that drives an obdecho (or a real
# OSC/LOV stack via VOSC) through an lctl "echo_client" device.
# NOTE(review): gaps in the inline numbering show this listing elides
# lines (the prepare()/cleanup()/correct_level() method headers and
# bodies around 2869-2890) -- verify structure against the full source.
2862 class ECHO_CLIENT(Module):
2863 def __init__(self,db):
2864 Module.__init__(self, 'ECHO_CLIENT', db)
# UUID of the obd this echo client attaches to, resolved from the config db.
2865 self.obd_uuid = self.db.get_first_ref('obd')
2866 obd = self.db.lookup(self.obd_uuid)
2867 self.uuid = generate_client_uuid(self.name)
2868 self.osc = VOSC(obd, self.uuid, self.name)
# (prepare) skip setup when the device already exists and we are not recording
2871 if not config.record and is_prepared(self.name):
2874 self.osc.prepare() # XXX This is so cheating. -p
2875 self.info(self.obd_uuid)
2877 lctl.newdev("echo_client", self.name, self.uuid,
2878 setup = self.osc.get_name())
# (cleanup) tear down only if the device was actually set up
2881 if is_prepared(self.name):
2882 Module.cleanup(self)
2885 def add_module(self, manager):
2886 self.osc.add_module(manager)
2887 manager.add_lustre_module('obdecho', 'obdecho')
2889 def correct_level(self, level, op=None):
def generate_client_uuid(name):
    """Generate a pseudo-random client UUID of at most 36 characters.

    The result has the form '<5 hex>_<name truncated to 19 chars>_<10 hex>',
    keeping distinct mounts of the same client name distinguishable while
    remaining readable in device listings.

    Bug fixed: the visible code supplied only three values to a format
    string with four conversions ('%05x_%.19s_%05x%05x'), which raises
    TypeError; the 'name' argument for the '%.19s' conversion is restored.

    NOTE: random.random() is not cryptographically strong; these UUIDs
    are only meant to be unique within a cluster, not unguessable.
    """
    client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
                                           name,
                                           int(random.random() * 1048576),
                                           int(random.random() * 1048576))
    # int(random()*1048576) < 0x100000, so each %05x field is exactly
    # 5 hex digits; the slice caps the total at 36 (a full-length name
    # yields 5+1+19+1+10 == 36 characters).
    return client_uuid[:36]
2899 class Mountpoint(Module):
2900 def __init__(self, db):
2901 Module.__init__(self, 'MTPT', db)
2902 self.path = self.db.get_val('path')
2903 self.clientoptions = self.db.get_val('clientoptions', '')
2904 self.fs_uuid = self.db.get_first_ref('filesystem')
2905 fs = self.db.lookup(self.fs_uuid)
2906 self.mds_uuid = fs.get_first_ref('lmv')
2907 if not self.mds_uuid:
2908 self.mds_uuid = fs.get_first_ref('mds')
2909 self.obd_uuid = fs.get_first_ref('obd')
2910 self.gks_uuid = fs.get_first_ref('gks')
2911 client_uuid = generate_client_uuid(self.name)
2913 self.oss_sec = self.db.get_val('oss_sec','null')
2914 self.mds_sec = self.db.get_val('mds_sec','null')
2916 self.mds_sec = config.mds_sec
2918 self.oss_sec = config.oss_sec
2920 self.oss_sec = self.db.get_val('oss_sec','null')
2921 self.mds_sec = self.db.get_val('mds_sec','null')
2923 self.mds_sec = config.mds_sec
2925 self.oss_sec = config.oss_sec
2927 ost = self.db.lookup(self.obd_uuid)
2929 panic("no ost: ", self.obd_uuid)
2931 mds = self.db.lookup(self.mds_uuid)
2933 panic("no mds: ", self.mds_uuid)
2935 self.vosc = VOSC(ost, client_uuid, self.name, self.name)
2936 self.vmdc = VMDC(mds, client_uuid, self.name, self.name)
2939 self.gkc = get_gkc(db, client_uuid, self.name, self.gks_uuid)
2942 if not config.record and fs_is_mounted(self.path):
2943 log(self.path, "already mounted.")
2952 self.info(self.path, self.mds_uuid, self.obd_uuid)
2953 if config.record or config.lctl_dump:
2955 lctl.mount_option(local_node_name, self.vosc.get_name(),
2956 self.vmdc.get_name(), self.gkc.get_name())
2958 lctl.mount_option(local_node_name, self.vosc.get_name(),
2959 self.vmdc.get_name(), "")
2962 if config.clientoptions:
2963 if self.clientoptions:
2964 self.clientoptions = self.clientoptions + ',' + config.clientoptions
2966 self.clientoptions = config.clientoptions
2967 if self.clientoptions:
2968 self.clientoptions = ',' + self.clientoptions
2969 # Linux kernel will deal with async and not pass it to ll_fill_super,
2970 # so replace it with Lustre async
2971 self.clientoptions = string.replace(self.clientoptions, "async", "lasync")
2974 gkc_name = self.gkc.get_name();
2977 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s,gkc=%s,mds_sec=%s,oss_sec=%s%s %s %s" % \
2978 (self.vosc.get_name(), self.vmdc.get_name(), gkc_name, self.mds_sec,
2979 self.oss_sec, self.clientoptions, config.config, self.path)
2980 log("mount -t lustre_lite -o osc=%s,mdc=%s,gkc=%s,mds_sec=%s,oss_sec=%s%s %s %s" % \
2981 (self.vosc.get_name(), self.vmdc.get_name(), gkc_name, self.mds_sec,
2982 self.oss_sec, self.clientoptions, config.config, self.path))
2983 run("mkdir", self.path)
2988 panic("mount failed:", self.path, ":", string.join(val))
2991 self.info(self.path, self.mds_uuid,self.obd_uuid)
2993 if config.record or config.lctl_dump:
2994 lctl.del_mount_option(local_node_name)
2996 if fs_is_mounted(self.path):
2998 (rc, out) = run("umount", "-f", self.path)
3000 (rc, out) = run("umount", self.path)
3002 raise CommandError('umount', out, rc)
3004 if fs_is_mounted(self.path):
3005 panic("fs is still mounted:", self.path)
3012 def add_module(self, manager):
3013 self.vosc.add_module(manager)
3014 self.vmdc.add_module(manager)
3015 manager.add_lustre_module('llite', 'llite')
3017 manager.add_lustre_module('sec/gks', 'gkc')
3019 def correct_level(self, level, op=None):
3022 # ============================================================
3023 # misc query functions
# Build the list of Network objects for the node hosting the given OSD.
# 'self' is the config database (this is a module-level helper called as
# get_ost_net(lustreDB, uuid) -- see doRecovery).
# NOTE(review): lines 3026-3028 and the trailing 'return srv_list' are
# elided in this listing (srv_list initialization is not visible);
# verify against the full source.
3025 def get_ost_net(self, osd_uuid):
3029 osd = self.lookup(osd_uuid)
3030 node_uuid = osd.get_first_ref('node')
3031 node = self.lookup(node_uuid)
# reached when the node lookup fails (guard line elided above)
3033 panic("unable to find node for osd_uuid:", osd_uuid,
3034 " node_ref:", node_uuid)
3035 for net_uuid in node.get_networks():
3036 db = node.lookup(net_uuid)
3037 srv_list.append(Network(db))
3040 # the order of iniitailization is based on level.
# Map a service class to its startup level; services are initialized in
# ascending level order (network first, mountpoints last).
# NOTE(review): the 'ret = <level>' assignment lines between the branches
# are elided in this listing, as is the final return; the exact numeric
# levels cannot be confirmed from here -- verify against full source.
3041 def getServiceLevel(self):
3042 type = self.get_class()
3044 if type in ('network',):
3046 elif type in ('routetbl',):
3048 elif type in ('ldlm',):
3050 elif type in ('osd',):
3052 elif type in ('mdsdev',):
3054 elif type in ('lmv', 'cobd',):
3056 elif type in ('gkd',):
3058 elif type in ('cmobd', 'cobd',):
3060 elif type in ('mountpoint', 'echoclient'):
3063 panic("Unknown type: ", type)
# clamp to the user-requested --minlevel/--maxlevel window
3065 if ret < config.minlevel or ret > config.maxlevel:
3070 # return list of services in a profile. list is a list of tuples
3071 # [(level, db_object),]
# Return the services of a profile as (level, db_object) tuples.
# NOTE(review): the list initialization, the filter around the append,
# and the sorted return are elided in this listing.
3072 def getServices(self):
3074 for ref_class, ref_uuid in self.get_all_refs():
3075 servdb = self.lookup(ref_uuid)
3077 level = getServiceLevel(servdb)
3079 list.append((level, servdb))
# reached when the uuid lookup fails (guard line elided above)
3081 panic('service not found: ' + ref_uuid)
3087 ############################################################
3089 # FIXME: clean this mess up!
3091 # OSC is no longer in the xml, so we have to fake it.
3092 # this is getting ugly and begging for another refactoring
# OSC records are no longer stored in the xml, so fabricate one from the
# OST's db entry. NOTE(review): the 'return osc' line is elided here.
3093 def get_osc(db, ost_uuid, fs_name):
3094 osc = OSC(db, ost_uuid, fs_name)
# Fabricate an MDC module object for the given MDT uuid.
# NOTE(review): the failed-lookup guard and the 'return mdc' line are
# elided in this listing.
3097 def get_mdc(db, mdt_uuid, fs_name):
3098 mdt_db = db.lookup(mdt_uuid);
3100 error("no mdt:", mdt_uuid)
3101 mdc = MDC(mdt_db, mdt_uuid, fs_name)
# Fabricate a GKC (GSS key client) module object for the given GKS uuid.
# NOTE(review): the failed-lookup guard and the 'return gkc' line are
# elided in this listing.
3104 def get_gkc(db, uuid, fs_name, gks_uuid):
3105 gks_db = db.lookup(gks_uuid);
3107 error("no gks:", gks_uuid)
3108 gkc = GKC(gks_db, uuid, fs_name)
3111 ############################################################
3112 # routing ("rooting")
3114 # list of (nettype, cluster_id, nid)
# Populate the global local_clusters list with (nettype, cluster_id, nid)
# for every network configured on this node, and register an acceptor
# handler per listening port.
# NOTE(review): the line constructing 'srv' from 'net' (presumably
# around 3121) and the tcp-only guard before the acceptor block are
# elided in this listing.
3117 def find_local_clusters(node_db):
3118 global local_clusters
3119 for netuuid in node_db.get_networks():
3120 net = node_db.lookup(netuuid)
3122 debug("add_local", netuuid)
3123 local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
# one acceptor per port, shared across networks on the same port
3125 if not acceptors.has_key(srv.port):
3126 acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type)
3128 # This node is a gateway.
3130 def node_is_router():
3133 # If there are any routers found in the config, then this will be true
3134 # and all nodes will load kptlrouter.
def node_needs_router():
    """Return true when kptlrouter must be loaded on this node.

    That is the case if any router was found while scanning the config
    (needs_router) or if this node is itself configured as a router
    (is_router).
    """
    result = needs_router or is_router
    return result
3139 # list of (nettype, gw, tgt_cluster_id, lo, hi)
3140 # Currently, these local routes are only added to kptlrouter route
3141 # table if they are needed to connect to a specific server. This
3142 # should be changed so all available routes are loaded, and the
3143 # ptlrouter can make all the decisions.
# Scan every node in the config for routers reachable from one of this
# node's local clusters, and collect their routes into the global
# local_routes list. Sets needs_router when any router exists.
# NOTE(review): this listing elides lines, including the docstring
# continuation/closing quotes (3148, 3150), the loop header iterating
# 'router' over 'list', and the 'needs_router = 1' assignment; verify
# against the full source.
3146 def find_local_routes(lustre):
3147 """ Scan the lustre config looking for routers . Build list of
3149 global local_routes, needs_router
3151 list = lustre.lookup_class('node')
3153 if router.get_val_int('router', 0):
3155 for (local_type, local_cluster_id, local_nid) in local_clusters:
3157 for netuuid in router.get_networks():
3158 db = router.lookup(netuuid)
# a router network is usable as a gateway only when it sits on one of
# our local (nettype, cluster_id) pairs
3159 if (local_type == db.get_val('nettype') and
3160 local_cluster_id == db.get_val('clusterid')):
3161 gw = db.get_val('nid')
3164 debug("find_local_routes: gw is", gw)
3165 for route in router.get_local_routes(local_type, gw):
3166 local_routes.append(route)
3167 debug("find_local_routes:", local_routes)
# Pick the first server in srv_list that lives on one of this node's
# local clusters. NOTE(review): the 'return srv' line (and the implicit
# None fall-through) are elided in this listing.
3170 def choose_local_server(srv_list):
3171 for srv in srv_list:
3172 if local_cluster(srv.net_type, srv.cluster_id):
3172 if local_cluster(srv.net_type, srv.cluster_id):
# True if (net_type, cluster_id) matches one of this node's local
# clusters. NOTE(review): the return lines are elided in this listing.
3175 def local_cluster(net_type, cluster_id):
3176 for cluster in local_clusters:
3177 if net_type == cluster[0] and cluster_id == cluster[1]:
# True if the exact (net_type, cluster_id, nid) triple is one of this
# node's local interfaces. NOTE(review): the return lines are elided in
# this listing.
3181 def local_interface(net_type, cluster_id, nid):
3182 for cluster in local_clusters:
3183 if (net_type == cluster[0] and cluster_id == cluster[1]
3184 and nid == cluster[2]):
# Find (srv, route) pairs for servers reachable through a gateway: a
# route tuple is (nettype, gw, tgt_cluster_id, lo, hi) and matches when
# the server's nid falls within [lo, hi] on the target cluster.
# NOTE(review): this listing elides lines, including the 'result' list
# initialization, the assignment of 'to' (presumably srv.nid), and the
# 'return result'; verify against the full source.
3188 def find_route(srv_list):
3190 frm_type = local_clusters[0][0]
3191 for srv in srv_list:
3192 debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
3193 to_type = srv.net_type
3195 cluster_id = srv.cluster_id
3196 debug ('looking for route to', to_type, to)
3197 for r in local_routes:
3198 debug("find_route: ", r)
# r[3]/r[4] are the lo/hi nid bounds; r[2] the target cluster id
3199 if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
3200 result.append((srv, r))
# Resolve the active device uuid for a failover target: an explicit
# --select override wins, otherwise the 'active' reference in the config.
# NOTE(review): the branch guard between the two assignments and the
# return of tgt_dev_uuid are elided in this listing.
3203 def get_active_target(db):
3204 target_uuid = db.getUUID()
3205 target_name = db.getName()
3206 node_name = get_select(target_name)
3208 tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
3210 tgt_dev_uuid = db.get_first_ref('active')
# Find the Network object whose nid_uuid matches. NOTE(review): the
# construction of 'net' from 'n' and the return lines are elided in this
# listing.
3213 def get_server_by_nid_uuid(db, nid_uuid):
3214 for n in db.lookup_class("network"):
3216 if net.nid_uuid == nid_uuid:
3220 ############################################################
3224 type = db.get_class()
3225 debug('Service:', type, db.getName(), db.getUUID())
3230 n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
3231 elif type == 'network':
3233 elif type == 'routetbl':
3237 elif type == 'cobd':
3238 n = COBD(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
3239 elif type == 'cmobd':
3241 elif type == 'mdsdev':
3243 elif type == 'mountpoint':
3245 elif type == 'echoclient':
3252 panic ("unknown service type:", type)
3256 # Prepare the system to run lustre using a particular profile
3257 # in a the configuration.
3258 # * load & the modules
3259 # * setup networking for the current node
3260 # * make sure partitions are in place and prepared
3261 # * initialize devices with lctl
3262 # Levels is important, and needs to be enforced.
# Apply 'operation' (doSetup, doCleanup, doLoadModules, ...) to the
# services of each profile uuid in prof_list.
# NOTE(review): the not-found guard before the panic and the final call
# to operation(services) are elided in this listing.
3263 def for_each_profile(db, prof_list, operation):
3264 for prof_uuid in prof_list:
3265 prof_db = db.lookup(prof_uuid)
3267 panic("profile:", prof_uuid, "not found.")
3268 services = getServices(prof_db)
# Walk the raw XML to find the filesystem that references 'uuid' via
# 'tag', then the mountpoint that references that filesystem, and return
# its name. Panics on malformed xml at either step.
# NOTE(review): this listing elides lines -- the loop-else / found-flag
# handling around the panics, the second loop header over 'mtpts', and
# the 'return fs_name'; verify against the full source.
3271 def get_fs_name(db, rec, tag, uuid):
3272 # FIXME: better way to find the mountpoint?
3273 filesystems = db.root_node.getElementsByTagName('filesystem')
3275 for fs in filesystems:
3276 ref = fs.getElementsByTagName(tag)
3277 if ref[0].getAttribute('uuidref') == uuid:
3278 fsuuid = fs.getAttribute('uuid')
3282 panic("malformed xml: uuid '" + uuid + "' referenced in '" + \
3283 rec.nodeName + "' record is not used by any filesystems.")
3285 mtpts = db.root_node.getElementsByTagName('mountpoint')
3288 ref = fs.getElementsByTagName('filesystem_ref')
3289 if ref[0].getAttribute('uuidref') == fsuuid:
3290 fs_name = fs.getAttribute('name')
3294 panic("malformed xml: '" + rec.nodeName + \
3295 "' record references uuid '" + uuid + \
3296 "', which references filesystem uuid '" + fsuuid + \
3297 "', which does not reference a mountpoint.")
# Resolve (lov_name, lov_uuid, osc) for an update record, either from a
# live lov object (recording case) or by digging uuids out of the record
# itself. For lov_delete records, look the OST up in the update's own
# <info> subtree snapshot instead of the current db.
# NOTE(review): this listing elides branch/guard lines (the 'if lov:'
# split, the else branches, and the not-found guards before the panics);
# verify against the full source. Also 'obd_uuid' in the final panic
# looks like a stale variable name (elsewhere this is ost_uuid) -- TODO
# confirm.
3301 def magic_get_osc(db, rec, lov):
3303 lov_uuid = lov.get_uuid()
3304 fs_name = lov.osc.fs_name
3305 lov_name = lov.osc.name
3307 lov_uuid = rec.getAttribute('lov_uuidref')
3308 fs_name = get_fs_name(db, rec, 'obd_ref', lov_uuid)
3309 lov_name = "lov_" + fs_name
3311 print "lov_uuid: " + lov_uuid + "; lov_name: " + lov_name
3313 ost_uuid = rec.getAttribute('ost_uuidref')
3315 if rec.nodeName == 'lov_delete':
3317 # Use the update as a subtree in case a new OST is created with the
3318 # same name as the one that we deleted or other info about the OSS
3319 # has changed since the delete.
3320 # XXX - Not sure if this is the way this is supposed to be done.
3322 info = rec.parentNode.getElementsByTagName('info')
3324 print "delete record missing info !"
3325 tgtdb = Lustre.LustreDB_XML(info[0], info[0])
3329 obd = tgtdb.lookup(ost_uuid)
3331 panic("malformed xml: '" + rec.nodeName + \
3332 "' record references ost uuid '" + ost_uuid + \
3333 "' which cannot be found.")
3334 osc = get_osc(obd, lov_uuid, fs_name)
3336 panic('osc not found:', obd_uuid)
3337 return lov_name, lov_uuid, osc
# Resolve (lmv_name, lmv_uuid, mdc) for an update record; mirrors
# magic_get_osc but for the metadata side.
# NOTE(review): the 'if lmv:' split, else branches, and not-found guards
# before the panics are elided in this listing.
3339 def magic_get_mdc(db, rec, lmv):
3341 lmv_uuid = lmv.mdc.uuid
3342 fs_name = lmv.mdc.fs_name
3343 lmv_name = lmv.mdc.name
3345 lmv_uuid = rec.getAttribute('lmv_uuidref')
3346 fs_name = get_fs_name(db, rec, 'mds_ref', lmv_uuid)
3347 lmv_name = "lmv_" + fs_name
3349 mdt_uuid = rec.getAttribute('mdt_uuidref')
3351 mds = db.lookup(mdt_uuid)
3354 panic("MDS not found!")
3356 mdc = MDC(mds, lmv_uuid, fs_name)
3358 panic('mdc not found:', mdt_uuid)
3359 return lmv_name, lmv_uuid, mdc
3361 # write logs for update records. sadly, logs of all types -- and updates in
3362 # particular -- are something of an afterthought. lconf needs rewritten with
3363 # these as core concepts. so this is a pretty big hack.
3364 def process_update_record(db, update, lmv, lov):
3365 for rec in update.childNodes:
3366 if rec.nodeType != rec.ELEMENT_NODE:
3369 if rec.nodeName == 'info':
3372 log("found " + rec.nodeName + " record in update version " +
3373 str(update.getAttribute('version')))
3375 if rec.nodeName == 'lmv_add':
3376 lmv_uuid = rec.getAttribute('lmv_uuidref')
3377 mdt_uuid = rec.getAttribute('mdt_uuidref')
3378 if not lmv_uuid or not mdt_uuid:
3379 panic("malformed xml: '" + rec.nodeName + \
3380 "' record requires lmv_uuid and mdt_uuid.")
3382 lmv_name, lmv_uuid, mdc = magic_get_mdc(db, rec, lmv)
3385 # Only ignore connect failures with --force, which
3386 # isn't implemented here yet.
3387 mdc.prepare(ignore_connect_failure=0)
3388 except CommandError, e:
3389 print "Error preparing MDC %s\n" % osc.uuid
3392 lctl.lmv_add_mdc(lmv_name, mdt_uuid)
3395 if rec.nodeName != 'lov_add' and rec.nodeName != 'lov_delete' and \
3396 rec.nodeName != 'lov_deactivate':
3397 panic("unrecognized update record type '" + rec.nodeName + "'.")
3399 lov_uuid = rec.getAttribute('lov_uuidref')
3400 ost_uuid = rec.getAttribute('ost_uuidref')
3401 index = rec.getAttribute('index')
3402 gen = rec.getAttribute('generation')
3404 if not lov_uuid or not ost_uuid or not index or not gen:
3405 panic("malformed xml: '" + rec.nodeName + "' record requires lov_uuid, ost_uuid, index, and generation.")
3407 lov_name, lov_uuid, osc = magic_get_osc(db, rec, lov)
3409 # ------------------------------------------------------------- add
3410 if rec.nodeName == 'lov_add':
3412 # Only ignore connect failures with --force, which
3413 # isn't implemented here yet.
3414 osc.prepare(ignore_connect_failure=0)
3415 except CommandError, e:
3416 print "Error preparing OSC %s\n" % osc.uuid
3419 lctl.lov_add_osc(lov_name, ost_uuid, index, gen)
3421 # ------------------------------------------------------ deactivate
3422 elif rec.nodeName == 'lov_deactivate':
3425 except CommandError, e:
3426 print "Error deactivating OSC %s\n" % osc.uuid
3429 # ---------------------------------------------------------- delete
3430 elif rec.nodeName == 'lov_delete':
3431 lctl.lov_del_osc(lov_name, ost_uuid, index, gen)
3437 except CommandError, e:
3438 print "Error cleaning up OSC %s\n" % osc.uuid
# Replay each <update> element of the config into its own config log
# "<log_name>-<version>" on the MDS. Only runs under --write_conf or
# --record.
# NOTE(review): the early return, the loop header over 'updates', the
# 'continue' after the empty-record log, and the lctl.end_record() close
# are elided in this listing.
3441 def process_updates(db, log_device, log_name, lmv = None, lov = None):
3442 if not config.write_conf and not config.record:
3447 updates = db.root_node.getElementsByTagName('update')
3449 if not u.childNodes:
3450 log("ignoring empty update record (version " +
3451 str(u.getAttribute('version')) + ")")
3454 version = u.getAttribute('version')
# one log per update version so replays stay independent
3455 real_name = "%s-%s" % (log_name, version)
3456 lctl.clear_log(log_device, real_name)
3457 lctl.record(log_device, real_name)
3459 process_update_record(db, u, lmv, lov)
# --write_conf pass: only mdsdev and osd services participate; build the
# module object and (unless --nosetup) write its configuration.
# NOTE(review): the loop header over 'services' and the call performing
# the actual write_conf are elided in this listing.
3463 def doWriteconf(services):
3465 if s[1].get_class() == 'mdsdev' or s[1].get_class() == 'osd':
3466 n = newService(s[1])
3468 if not config.nosetup:
# Setup pass: instantiate each service, re-sort by corrected level, then
# prepare each in ascending order; handles --record bookkeeping and
# replays MTPT update logs.
# NOTE(review): this listing elides lines -- the --nosetup early return,
# both loop headers, the sorts of slist/nlist, the n[1].prepare() call,
# and the lctl.end_record() close; verify against the full source.
3471 def doSetup(services):
3476 n = newService(s[1])
3478 slist.append((n.level, n))
# correct_level lets a module shift its level based on the operation
3481 nl = n[1].correct_level(n[0])
3482 nlist.append((nl, n[1]))
3486 lctl.clear_log(config.record_device, config.record_log)
3487 lctl.record(config.record_device, config.record_log)
3489 # ugly hack, only need to run lctl commands for --dump
3490 if config.lctl_dump or config.record:
3491 sys_set_timeout(timeout)
3492 sys_set_lustre_upcall(lustre_upcall)
3496 if config.record and n[1].module_name == 'MTPT':
3502 process_updates(n[1].db, config.record_device, config.record_log,
# Collect the kernel modules every service needs into the global module
# manager, then load them all in one pass.
# NOTE(review): the --nomod early return and the loop header over
# 'services' are elided in this listing.
3505 def doLoadModules(services):
3509 # adding all needed modules from all services
3511 n = newService(s[1])
3512 n.add_module(mod_manager)
3514 # loading all registered modules
3515 mod_manager.load_modules()
# Mirror of doLoadModules for teardown: register modules (only for
# services that report it is safe) and unload them all.
# NOTE(review): the --nomod early return and the loop header over
# 'services' are elided in this listing.
3517 def doUnloadModules(services):
3521 # adding all needed modules from all services
3523 n = newService(s[1])
3524 if n.safe_to_clean_modules():
3525 n.add_module(mod_manager)
3527 # unloading all registered modules
3528 mod_manager.cleanup_modules()
# Cleanup pass: instantiate services, re-sort by corrected level, and
# clean them up in descending order (reverse of setup).
# NOTE(review): this listing elides lines -- the --nosetup early return,
# loop headers, the sorts/reverse, the n[1].cleanup() call, and the
# lctl.end_record() close; verify against the full source.
3530 def doCleanup(services):
3536 n = newService(s[1])
3538 slist.append((n.level, n))
3541 nl = n[1].correct_level(n[0])
3542 nlist.append((nl, n[1]))
3547 lctl.clear_log(config.record_device, config.record_log)
3548 lctl.record(config.record_device, config.record_log)
# skip services (e.g. under --failover) that must keep state
3551 if n[1].safe_to_clean():
3559 def doHost(lustreDB, hosts):
3560 global is_router, local_node_name, lustre_upcall, timeout
3563 node_db = lustreDB.lookup_name(h, 'node')
3567 panic('No host entry found.')
3569 local_node_name = node_db.get_val('name', 0)
3570 is_router = node_db.get_val_int('router', 0)
3571 lustre_upcall = node_db.get_val('lustreUpcall', '')
3572 portals_upcall = node_db.get_val('portalsUpcall', '')
3573 timeout = node_db.get_val_int('timeout', 0)
3574 ptldebug = node_db.get_val('ptldebug', '')
3575 subsystem = node_db.get_val('subsystem', '')
3577 find_local_clusters(node_db)
3579 find_local_routes(lustreDB)
3581 # Two step process: (1) load modules, (2) setup lustre
3582 # if not cleaning, load modules first.
3583 prof_list = node_db.get_refs('profile')
3585 if config.write_conf:
3586 for_each_profile(node_db, prof_list, doLoadModules)
3588 for_each_profile(node_db, prof_list, doWriteconf)
3589 for_each_profile(node_db, prof_list, doUnloadModules)
3592 elif config.recover:
3593 if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
3594 raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
3595 "--client_uuid <UUID> --conn_uuid <UUID>")
3596 doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
3598 elif config.cleanup:
3600 # the command line can override this value
3602 # ugly hack, only need to run lctl commands for --dump
3603 if config.lctl_dump or config.record:
3604 for_each_profile(node_db, prof_list, doCleanup)
3607 sys_set_timeout(timeout)
3608 sys_set_ptldebug(ptldebug)
3609 sys_set_subsystem(subsystem)
3610 sys_set_lustre_upcall(lustre_upcall)
3611 sys_set_portals_upcall(portals_upcall)
3613 for_each_profile(node_db, prof_list, doCleanup)
3614 for_each_profile(node_db, prof_list, doUnloadModules)
3618 # ugly hack, only need to run lctl commands for --dump
3619 if config.lctl_dump or config.record:
3620 for_each_profile(node_db, prof_list, doSetup)
3624 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
3625 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
3627 for_each_profile(node_db, prof_list, doLoadModules)
3629 sys_set_debug_path()
3630 sys_set_ptldebug(ptldebug)
3631 sys_set_subsystem(subsystem)
3632 script = config.gdb_script
3633 run(lctl.lctl, ' modules >', script)
3635 log ("The GDB module script is in", script)
3636 # pause, so user has time to break and
3639 sys_set_timeout(timeout)
3640 sys_set_lustre_upcall(lustre_upcall)
3641 sys_set_portals_upcall(portals_upcall)
3643 for_each_profile(node_db, prof_list, doSetup)
# --recover: redirect a client from a failed connection (nid_uuid) to
# the currently-active server for tgt_uuid. Disconnect failures are
# logged and ignored (the old server may simply be dead).
# NOTE(review): this listing elides lines -- the not-found guards before
# each raise, the 'try:' headers around disconnect/connect, and the
# lctl.connect(net) call itself; verify against the full source.
3646 def doRecovery(lustreDB, lctl, tgt_uuid, client_uuid, nid_uuid):
3647 tgt = lustreDB.lookup(tgt_uuid)
3649 raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
3650 new_uuid = get_active_target(tgt)
3652 raise Lustre.LconfError("doRecovery: no active target found for: " +
3654 net = choose_local_server(get_ost_net(lustreDB, new_uuid))
3656 raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
3658 log("Reconnecting", tgt_uuid, " to ", net.nid_uuid);
3660 oldnet = get_server_by_nid_uuid(lustreDB, nid_uuid)
3663 lctl.disconnect(oldnet)
# best-effort: the old connection may already be gone
3664 except CommandError, e:
3665 log("recover: disconnect", nid_uuid, "failed: ")
3670 except CommandError, e:
3671 log("recover: connect failed")
3674 lctl.recover(client_uuid, net.nid_uuid)
# Derive config.lustre/config.portals module search paths from the lconf
# invocation path (development mode) or from explicit --lustre/--portals
# options.
# NOTE(review): the guard testing the command-line portals override
# (before line 3685) is elided in this listing.
3677 def setupModulePath(cmd, portals_dir = PORTALS_DIR):
3678 base = os.path.dirname(cmd)
3679 if development_mode():
3680 if not config.lustre:
3681 debug('using objdir module paths')
3682 config.lustre = (os.path.join(base, ".."))
3683 # normalize the portals dir, using command line arg if set
3685 portals_dir = config.portals
3686 dir = os.path.join(config.lustre, portals_dir)
3687 config.portals = dir
3688 debug('config.portals', config.portals)
3689 elif config.lustre and config.portals:
3691 # if --lustre and --portals, normalize portals
3692 # can ignore PORTALS_DIR here, since it is probably useless here
3693 config.portals = os.path.join(config.lustre, config.portals)
3694 debug('config.portals B', config.portals)
# Write 'val' to /proc/sys/<path>; honors --noexec by only logging.
# NOTE(review): the --noexec early return, the 'try:' header, the write
# and close of fp, and the exception handler are elided in this listing.
3696 def sysctl(path, val):
3697 debug("+ sysctl", path, val)
3701 fp = open(os.path.join('/proc/sys', path), 'w')
def sys_set_debug_path():
    """Point the portals debug-dump path at the configured location."""
    sysctl('portals/debug_path', config.debug_path)
# Set the lustre recovery upcall script via lctl; --lustre_upcall (or
# the generic --upcall) overrides the node-config value.
# NOTE(review): the 'elif config.upcall:' guard before line 3715 and the
# 'if upcall:' guard before the lctl call are elided in this listing.
3710 def sys_set_lustre_upcall(upcall):
3711 # the command overrides the value in the node config
3712 if config.lustre_upcall:
3713 upcall = config.lustre_upcall
3715 upcall = config.upcall
3717 lctl.set_lustre_upcall(upcall)
# Set the portals upcall script via sysctl; --portals_upcall (or the
# generic --upcall) overrides the node-config value.
# NOTE(review): the 'elif config.upcall:' guard before line 3724 and the
# 'if upcall:' guard before the sysctl call are elided in this listing.
3719 def sys_set_portals_upcall(upcall):
3720 # the command overrides the value in the node config
3721 if config.portals_upcall:
3722 upcall = config.portals_upcall
3724 upcall = config.upcall
3726 sysctl('portals/upcall', upcall)
def sys_set_timeout(timeout):
    """Set the lustre recovery timeout via lctl.

    A positive --timeout on the command line overrides the value taken
    from the node configuration; None or non-positive values are ignored.
    """
    # command-line value takes precedence over the node config
    cmdline = config.timeout
    if cmdline and cmdline > 0:
        timeout = cmdline
    if timeout is None or timeout <= 0:
        return
    lctl.set_timeout(timeout)
# On 2.6 kernels, raise vm/min_free_kbytes so skb allocation under heavy
# read traffic does not exhaust free memory; optionally force socknal to
# a single socket.
# NOTE(review): this listing elides lines -- the loop splitting meminfo
# lines into 'a', the assignment of 'memtotal', the else branch fixing
# minfree for large-memory machines, and fp.close(); verify against the
# full source.
3735 def sys_tweak_socknal ():
3736 # reserve at least 8MB, or we run out of RAM in skb_alloc under read
3737 if sys_get_branch() == '2.6':
3738 fp = open('/proc/meminfo')
3739 lines = fp.readlines()
3744 if a[0] == 'MemTotal:':
3746 debug("memtotal" + memtotal)
# below 256MB total, scale the reserve down to 1/16 of RAM
3747 if int(memtotal) < 262144:
3748 minfree = int(memtotal) / 16
3751 debug("+ minfree ", minfree)
3752 sysctl("vm/min_free_kbytes", minfree)
3753 if config.single_socket:
3754 sysctl("socknal/typed", 0)
# Tune Quadrics Elan interrupt punt loops on whichever procfile variant
# this kernel exposes.
# NOTE(review): the 'for p in procfiles:' loop header is elided in this
# listing.
3756 def sys_optimize_elan ():
3757 procfiles = ["/proc/elan/config/eventint_punt_loops",
3758 "/proc/qsnet/elan3/config/eventint_punt_loops",
3759 "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]
3761 if os.access(p, os.W_OK):
3762 run ("echo 1 > " + p)
# Set the portals debug mask: evaluate the symbolic expression (e.g.
# "trace|malloc") against ptldebug_names and write it as hex.
# NOTE(review): the 'if config.ptldebug:' override guard, the 'try:'
# header, and the NameError handler body (panic) are elided in this
# listing.
3764 def sys_set_ptldebug(ptldebug):
3766 ptldebug = config.ptldebug
# eval with ptldebug_names as globals resolves flag names to bit values
3769 val = eval(ptldebug, ptldebug_names)
3770 val = "0x%x" % (val & 0xffffffffL)
3771 sysctl('portals/debug', val)
3772 except NameError, e:
# Set the portals subsystem debug mask, mirroring sys_set_ptldebug but
# against subsystem_names.
# NOTE(review): the 'if subsystem:' guard, the 'try:' header, and the
# NameError handler body (panic) are elided in this listing.
3775 def sys_set_subsystem(subsystem):
3776 if config.subsystem:
3777 subsystem = config.subsystem
3780 val = eval(subsystem, subsystem_names)
3781 val = "0x%x" % (val & 0xffffffffL)
3782 sysctl('portals/subsystem_debug', val)
3783 except NameError, e:
# Raise a /proc/sys/net/core/{r,w}mem_max value to at least 'max',
# leaving it alone if already large enough.
# NOTE(review): the read of the current value, the comparison guard, and
# fp.close() are elided in this listing.
3786 def sys_set_netmem_max(path, max):
3787 debug("setting", path, "to at least", max)
3795 fp = open(path, 'w')
3796 fp.write('%d\n' %(max))
def sys_make_devices():
    """Create the /dev/portals and /dev/obd character device nodes
    (major 10, minors 240 and 241) when they do not already exist."""
    for node, minor in (('/dev/portals', 240), ('/dev/obd', 241)):
        if not os.access(node, os.R_OK):
            run('mknod %s c 10 %d' % (node, minor))
3805 # Add dir to the global PATH, if not already there.
# NOTE(review): the early 'return' after the membership test is elided
# in this listing.
3806 def add_to_path(new_dir):
3807 syspath = string.split(os.environ['PATH'], ':')
3808 if new_dir in syspath:
3810 os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
# Default location for debug dumps; prefers the /r chroot when present.
# NOTE(review): the return lines ('/r' + path vs path) are elided in
# this listing.
3812 def default_debug_path():
3813 path = '/tmp/lustre-log'
3814 if os.path.isdir('/r'):
# Default location for the generated gdb module script; prefers the /r
# chroot when present. NOTE(review): the else-branch return is elided in
# this listing.
3819 def default_gdb_script():
3820 script = '/tmp/ogdb'
3821 if os.path.isdir('/r'):
3822 return '/r' + script
3826 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
3827 # ensure basic elements are in the system path
# Ensure the basic system directories are on PATH.
# NOTE(review): the loop body (presumably add_to_path(dir)) is elided in
# this listing.
3828 def sanitise_path():
3829 for dir in DEFAULT_PATH:
3832 # global hack for the --select handling
# Parse --select arguments of the form service=nodeA,service2=nodeB into
# the global tgt_select mapping.
# NOTE(review): the 'global tgt_select' declaration, the outer loop over
# args, and the inner loop header over 'list' are elided in this
# listing.
3834 def init_select(args):
3835 # args = [service=nodeA,service2=nodeB service3=nodeC]
3838 list = string.split(arg, ',')
3840 srv, node = string.split(entry, '=')
3841 tgt_select[srv] = node
# Return the node selected for a service via --select, if any.
# NOTE(review): the fall-through return (presumably None) is elided in
# this listing.
3843 def get_select(srv):
3844 if tgt_select.has_key(srv):
3845 return tgt_select[srv]
3849 FLAG = Lustre.Options.FLAG
3850 PARAM = Lustre.Options.PARAM
3851 INTPARAM = Lustre.Options.INTPARAM
3852 PARAMLIST = Lustre.Options.PARAMLIST
3854 ('verbose,v', "Print system commands as they are run"),
3855 ('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM),
3856 ('config', "Cluster config name used for LDAP query", PARAM),
3857 ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
3858 ('node', "Load config for <nodename>", PARAM),
3859 ('sec',"security flavor <null|krb5i|krb5p> between this client with mds", PARAM),
3860 ('mds_sec',"security flavor <null|krb5i|krb5p> between this client with mds", PARAM),
3861 ('oss_sec',"security flavor <null|krb5i|krb5p> between this client with ost", PARAM),
3862 ('mds_mds_sec',"security flavor <null|krb5i|krb5p> between this mds with other mds", PARAM),
3863 ('mds_oss_sec',"security flavor <null|krb5i|krb5p> between this mds with ost", PARAM),
3864 ('mds_deny_sec', "security flavor <null|krb5i|krb5p> denied by this mds", PARAM),
3865 ('ost_deny_sec', "security flavor <null|krb5i|krb5p> denied by this ost", PARAM),
3866 ('cleanup,d', "Cleans up config. (Shutdown)"),
3867 ('force,f', "Forced unmounting and/or obd detach during cleanup",
3869 ('single_socket', "socknal option: only use one socket instead of bundle",
3871 ('failover',"""Used to shut down without saving state.
3872 This will allow this node to "give up" a service to a
3873 another node for failover purposes. This will not
3874 be a clean shutdown.""",
3876 ('gdb', """Prints message after creating gdb module script
3877 and sleeps for 5 seconds."""),
3878 ('noexec,n', """Prints the commands and steps that will be run for a
3879 config without executing them. This can used to check if a
3880 config file is doing what it should be doing"""),
3881 ('nomod', "Skip load/unload module step."),
3882 ('nosetup', "Skip device setup/cleanup step."),
3883 ('reformat', "Reformat all devices (without question)"),
3884 ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
3885 ('mountfsoptions', "Additional options for mount fs command line", PARAM),
3886 ('clientoptions', "Additional options for Lustre", PARAM),
3887 ('dump', "Dump the kernel debug log to file before portals is unloaded",
3889 ('write_conf', "Save all the client config information on mds."),
3890 ('record', "Write config information on mds."),
3891 ('record_log', "Name of config record log.", PARAM),
3892 ('record_device', "MDS device name that will record the config commands",
3894 ('root_squash', "MDS squash root to appointed uid",
3896 ('no_root_squash', "Don't squash root for appointed nid",
3898 ('minlevel', "Minimum level of services to configure/cleanup",
3900 ('maxlevel', """Maximum level of services to configure/cleanup
3901 Levels are aproximatly like:
3906 70 - mountpoint, echo_client, osc, mdc, lov""",
3908 ('lustre', """Base directory of lustre sources. This parameter will
3909 cause lconf to load modules from a source tree.""", PARAM),
3910 ('portals', """Portals source directory. If this is a relative path,
3911 then it is assumed to be relative to lustre. """, PARAM),
3912 ('timeout', "Set recovery timeout", INTPARAM),
3913 ('upcall', "Set both portals and lustre upcall script", PARAM),
3914 ('lustre_upcall', "Set lustre upcall script", PARAM),
3915 ('portals_upcall', "Set portals upcall script", PARAM),
3916 ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
3917 ('ptldebug', "Set the portals debug level", PARAM),
3918 ('subsystem', "Set the portals debug subsystem", PARAM),
3919 ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
3920 ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
3921 # Client recovery options
3922 ('recover', "Recover a device"),
3923 ('group', "The group of devices to configure or cleanup", PARAM),
3924 ('tgt_uuid', "The failed target (required for recovery)", PARAM),
3925 ('client_uuid', "The failed client (required for recovery)", PARAM),
3926 ('conn_uuid', "The failed connection (required for recovery)", PARAM),
3928 ('inactive', """The name of an inactive service, to be ignored during
3929 mounting (currently OST-only). Can be repeated.""",
3934 global lctl, config, toplustreDB, CONFIG_FILE, mod_manager
3936 # in the upcall this is set to SIG_IGN
3937 signal.signal(signal.SIGCHLD, signal.SIG_DFL)
3939 cl = Lustre.Options("lconf", "config.xml", lconf_options)
3941 config, args = cl.parse(sys.argv[1:])
3942 except Lustre.OptionError, e:
3946 setupModulePath(sys.argv[0])
3948 host = socket.gethostname()
3950 # the PRNG is normally seeded with time(), which is not so good for starting
3951 # time-synchronized clusters
3952 input = open('/dev/urandom', 'r')
3954 print 'Unable to open /dev/urandom!'
3956 seed = input.read(32)
3962 init_select(config.select)
3965 # allow config to be fetched via HTTP, but only with python2
3966 if sys.version[0] != '1' and args[0].startswith('http://'):
3969 config_file = urllib2.urlopen(args[0])
3970 except (urllib2.URLError, socket.error), err:
3971 if hasattr(err, 'args'):
3973 print "Could not access '%s': %s" %(args[0], err)
3975 elif not os.access(args[0], os.R_OK):
3976 print 'File not found or readable:', args[0]
3980 config_file = open(args[0], 'r')
3982 dom = xml.dom.minidom.parse(config_file)
3984 panic("%s does not appear to be a config file." % (args[0]))
3985 sys.exit(1) # make sure to die here, even in debug mode.
3987 CONFIG_FILE = args[0]
3988 lustreDB = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
3989 if not config.config:
3990 config.config = os.path.basename(args[0])# use full path?
3991 if config.config[-4:] == '.xml':
3992 config.config = config.config[:-4]
3993 elif config.ldapurl:
3994 if not config.config:
3995 panic("--ldapurl requires --config name")
3996 dn = "config=%s,fs=lustre" % (config.config)
3997 lustreDB = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
3998 elif config.ptldebug or config.subsystem:
3999 sys_set_ptldebug(None)
4000 sys_set_subsystem(None)
4003 print 'Missing config file or ldap URL.'
4004 print 'see lconf --help for command summary'
4007 toplustreDB = lustreDB
4009 ver = lustreDB.get_version()
4011 panic("No version found in config data, please recreate.")
4012 if ver != Lustre.CONFIG_VERSION:
4013 panic("Config version", ver, "does not match lconf version",
4014 Lustre.CONFIG_VERSION)
4018 node_list.append(config.node)
4021 node_list.append(host)
4022 node_list.append('localhost')
4024 debug("configuring for host: ", node_list)
4027 config.debug_path = config.debug_path + '-' + host
4028 config.gdb_script = config.gdb_script + '-' + host
4030 lctl = LCTLInterface('lctl')
4032 if config.lctl_dump:
4033 lctl.use_save_file(config.lctl_dump)
4036 if not (config.record_device and config.record_log):
4037 panic("When recording, both --record_log and --record_device must be specified.")
4039 # init module manager
4040 mod_manager = kmod_manager(config.lustre, config.portals)
4042 doHost(lustreDB, node_list)
4046 if __name__ == "__main__":
4049 except Lustre.LconfError, e:
4051 # traceback.print_exc(file=sys.stdout)
4053 except CommandError, e:
4057 if first_cleanup_error:
4058 sys.exit(first_cleanup_error)