3 # Copyright (C) 2002-2003 Cluster File Systems, Inc.
4 # Authors: Robert Read <rread@clusterfs.com>
5 # Mike Shaver <shaver@clusterfs.com>
6 # This file is part of Lustre, http://www.lustre.org.
8 # Lustre is free software; you can redistribute it and/or
9 # modify it under the terms of version 2 of the GNU General Public
10 # License as published by the Free Software Foundation.
12 # Lustre is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Lustre; if not, write to the Free Software
19 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 # lconf - lustre configuration tool
23 # lconf is the main driver script for starting and stopping
24 # lustre filesystem services.
26 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import sys, getopt, types
29 import string, os, stat, popen2, socket, time, random, fcntl, select
30 import re, exceptions, signal, traceback
31 import xml.dom.minidom
33 if sys.version[0] == '1':
34 from FCNTL import F_GETFL, F_SETFL
36 from fcntl import F_GETFL, F_SETFL
38 PYMOD_DIR = "/usr/lib/lustre/python"
40 def development_mode():
41 base = os.path.dirname(sys.argv[0])
42 if os.access(base+"/Makefile", os.R_OK):
46 if development_mode():
47 sys.path.append('../utils')
49 sys.path.append(PYMOD_DIR)
55 DEFAULT_TCPBUF = 8388608
58 # Maximum number of devices to search for.
59 # (the /dev/loop* nodes need to be created beforehand)
60 MAX_LOOP_DEVICES = 256
61 PORTALS_DIR = 'portals'
63 # Needed to call lconf --record
66 # Please keep these in sync with the values in portals/kp30.h
78 "warning" : (1 << 10),
82 "portals" : (1 << 14),
84 "dlmtrace" : (1 << 16),
88 "rpctrace" : (1 << 20),
89 "vfstrace" : (1 << 21),
96 "undefined" : (1 << 0),
106 "portals" : (1 << 10),
108 "pinger" : (1 << 12),
109 "filter" : (1 << 13),
114 "ptlrouter" : (1 << 18),
118 "confobd" : (1 << 22),
first_cleanup_error = 0

def cleanup_error(rc):
    """Latch the first non-zero cleanup return code seen.

    Later failures are ignored so the overall exit status reflects the
    first thing that went wrong during cleanup.
    """
    global first_cleanup_error
    if first_cleanup_error == 0:
        first_cleanup_error = rc
130 # ============================================================
131 # debugging and error funcs
133 def fixme(msg = "this feature"):
134 raise Lustre.LconfError, msg + ' not implemented yet.'
137 msg = string.join(map(str,args))
138 if not config.noexec:
139 raise Lustre.LconfError(msg)
144 msg = string.join(map(str,args))
149 print string.strip(s)
153 msg = string.join(map(str,args))
156 # ack, python's builtin int() does not support '0x123' syntax.
157 # eval can do it, although what a hack!
161 return eval(s, {}, {})
164 except SyntaxError, e:
165 raise ValueError("not a number")
167 raise ValueError("not a number")
169 # ============================================================
170 # locally defined exceptions
171 class CommandError (exceptions.Exception):
172 def __init__(self, cmd_name, cmd_err, rc=None):
173 self.cmd_name = cmd_name
174 self.cmd_err = cmd_err
179 if type(self.cmd_err) == types.StringType:
181 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
183 print "! %s: %s" % (self.cmd_name, self.cmd_err)
184 elif type(self.cmd_err) == types.ListType:
186 print "! %s (error %d):" % (self.cmd_name, self.rc)
188 print "! %s:" % (self.cmd_name)
189 for s in self.cmd_err:
190 print "> %s" %(string.strip(s))
195 # ============================================================
196 # handle daemons, like the acceptor
198 """ Manage starting and stopping a daemon. Assumes daemon manages
199 it's own pid file. """
201 def __init__(self, cmd):
207 log(self.command, "already running.")
209 self.path = find_prog(self.command)
211 panic(self.command, "not found.")
212 ret, out = runcmd(self.path +' '+ self.command_line())
214 raise CommandError(self.path, out, ret)
218 pid = self.read_pidfile()
221 log ("killing process", pid)
224 log("was unable to find pid of " + self.command)
225 #time.sleep(1) # let daemon die
227 log("unable to kill", self.command, e)
229 log("unable to kill", self.command)
232 pid = self.read_pidfile()
238 log("was unable to find pid of " + self.command)
245 def read_pidfile(self):
247 fp = open(self.pidfile(), 'r')
257 def clean_pidfile(self):
258 """ Remove a stale pidfile """
259 log("removing stale pidfile:", self.pidfile())
261 os.unlink(self.pidfile())
263 log(self.pidfile(), e)
265 class AcceptorHandler(DaemonHandler):
266 def __init__(self, port, net_type):
267 DaemonHandler.__init__(self, "acceptor")
272 return "/var/run/%s-%d.pid" % (self.command, self.port)
274 def command_line(self):
275 return string.join(map(str,(self.flags, self.port)))
279 # start the acceptors
281 if config.lctl_dump or config.record:
283 for port in acceptors.keys():
284 daemon = acceptors[port]
285 if not daemon.running():
288 def run_one_acceptor(port):
289 if config.lctl_dump or config.record:
291 if acceptors.has_key(port):
292 daemon = acceptors[port]
293 if not daemon.running():
296 panic("run_one_acceptor: No acceptor defined for port:", port)
298 def stop_acceptor(port):
299 if acceptors.has_key(port):
300 daemon = acceptors[port]
305 # ============================================================
306 # handle lctl interface
309 Manage communication with lctl
312 def __init__(self, cmd):
314 Initialize close by finding the lctl binary.
316 self.lctl = find_prog(cmd)
318 self.record_device = ''
321 debug('! lctl not found')
324 raise CommandError('lctl', "unable to find lctl binary.")
    def use_save_file(self, file):
        # Remember the path that subsequent lctl command batches are
        # dumped to (run() prepends a 'dump <file>' command when set).
        self.save_file = file
329 def record(self, dev_name, logname):
330 log("Recording log", logname, "on", dev_name)
331 self.record_device = dev_name
332 self.record_log = logname
334 def end_record(self):
335 log("End recording log", self.record_log, "on", self.record_device)
336 self.record_device = None
337 self.record_log = None
339 def set_nonblock(self, fd):
340 fl = fcntl.fcntl(fd, F_GETFL)
341 fcntl.fcntl(fd, F_SETFL, fl | os.O_NDELAY)
346 the cmds are written to stdin of lctl
347 lctl doesn't return errors when run in script mode, so
349 should modify command line to accept multiple commands, or
350 create complex command line options
354 cmds = '\n dump ' + self.save_file + '\n' + cmds
355 elif self.record_device:
359 %s""" % (self.record_device, self.record_log, cmds)
361 debug("+", cmd_line, cmds)
362 if config.noexec: return (0, [])
364 child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
365 child.tochild.write(cmds + "\n")
366 child.tochild.close()
367 # print "LCTL:", cmds
369 # From "Python Cookbook" from O'Reilly
370 outfile = child.fromchild
371 outfd = outfile.fileno()
372 self.set_nonblock(outfd)
373 errfile = child.childerr
374 errfd = errfile.fileno()
375 self.set_nonblock(errfd)
377 outdata = errdata = ''
380 ready = select.select([outfd,errfd],[],[]) # Wait for input
381 if outfd in ready[0]:
382 outchunk = outfile.read()
383 if outchunk == '': outeof = 1
384 outdata = outdata + outchunk
385 if errfd in ready[0]:
386 errchunk = errfile.read()
387 if errchunk == '': erreof = 1
388 errdata = errdata + errchunk
389 if outeof and erreof: break
390 # end of "borrowed" code
393 if os.WIFEXITED(ret):
394 rc = os.WEXITSTATUS(ret)
397 if rc or len(errdata):
398 raise CommandError(self.lctl, errdata, rc)
401 def runcmd(self, *args):
403 run lctl using the command line
405 cmd = string.join(map(str,args))
406 debug("+", self.lctl, cmd)
407 rc, out = run(self.lctl, cmd)
409 raise CommandError(self.lctl, out, rc)
412 def clear_log(self, dev, log):
413 """ clear an existing log """
418 quit """ % (dev, log)
421 def root_squash(self, name, uid, nid):
425 quit""" % (name, uid, nid)
428 def network(self, net, nid):
433 quit """ % (net, nid)
437 def add_interface(self, net, ip, netmask = ""):
438 """ add an interface """
442 quit """ % (net, ip, netmask)
445 # delete an interface
446 def del_interface(self, net, ip):
447 """ delete an interface """
454 # create a new connection
455 def add_uuid(self, net_type, uuid, nid):
456 cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type)
459 def add_peer(self, net_type, nid, hostaddr, port):
460 if net_type in ('tcp','ra') and not config.lctl_dump:
465 nid, hostaddr, port )
467 elif net_type in ('openib','iib',) and not config.lctl_dump:
475 def connect(self, srv):
476 self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
477 if srv.net_type in ('tcp','openib','iib','ra') and not config.lctl_dump:
479 hostaddr = string.split(srv.hostaddr[0], '/')[0]
480 self.add_peer(srv.net_type, srv.nid, hostaddr, srv.port)
483 def recover(self, dev_name, new_conn):
486 recover %s""" %(dev_name, new_conn)
489 # add a route to a range
490 def add_route(self, net, gw, lo, hi):
498 except CommandError, e:
502 def del_route(self, net, gw, lo, hi):
507 quit """ % (net, gw, lo, hi)
510 # add a route to a host
511 def add_route_host(self, net, uuid, gw, tgt):
512 self.add_uuid(net, uuid, tgt)
520 except CommandError, e:
524 # add a route to a range
525 def del_route_host(self, net, uuid, gw, tgt):
531 quit """ % (net, gw, tgt)
535 def del_peer(self, net_type, nid, hostaddr):
536 if net_type in ('tcp',) and not config.lctl_dump:
540 del_peer %s %s single_share
544 elif net_type in ('openib','iib','ra') and not config.lctl_dump:
548 del_peer %s single_share
553 # disconnect one connection
554 def disconnect(self, srv):
555 self.del_uuid(srv.nid_uuid)
556 if srv.net_type in ('tcp','openib','iib','ra') and not config.lctl_dump:
558 hostaddr = string.split(srv.hostaddr[0], '/')[0]
559 self.del_peer(srv.net_type, srv.nid, hostaddr)
561 def del_uuid(self, uuid):
569 def disconnectAll(self, net):
577 def attach(self, type, name, uuid):
580 quit""" % (type, name, uuid)
583 def setup(self, name, setup = ""):
587 quit""" % (name, setup)
590 def add_conn(self, name, conn_uuid):
594 quit""" % (name, conn_uuid)
598 # create a new device with lctl
599 def newdev(self, type, name, uuid, setup = ""):
600 self.attach(type, name, uuid);
602 self.setup(name, setup)
603 except CommandError, e:
604 self.cleanup(name, uuid, 0)
609 def cleanup(self, name, uuid, force, failover = 0):
610 if failover: force = 1
616 quit""" % (name, ('', 'force')[force],
617 ('', 'failover')[failover])
621 def lov_setup(self, name, uuid, desc_uuid, stripe_cnt,
622 stripe_sz, stripe_off, pattern, devlist = None):
625 lov_setup %s %d %d %d %s %s
626 quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off,
630 # add an OBD to a LOV
631 def lov_add_obd(self, name, uuid, obd_uuid, index, gen):
633 lov_modify_tgts add %s %s %s %s
634 quit""" % (name, obd_uuid, index, gen)
638 def lmv_setup(self, name, uuid, desc_uuid, devlist):
642 quit""" % (name, uuid, desc_uuid, devlist)
645 # delete an OBD from a LOV
646 def lov_del_obd(self, name, uuid, obd_uuid, index, gen):
648 lov_modify_tgts del %s %s %s %s
649 quit""" % (name, obd_uuid, index, gen)
653 def deactivate(self, name):
661 def dump(self, dump_file):
664 quit""" % (dump_file)
667 # get list of devices
668 def device_list(self):
669 devices = '/proc/fs/lustre/devices'
671 if os.access(devices, os.R_OK):
673 fp = open(devices, 'r')
681 def lustre_version(self):
682 rc, out = self.runcmd('version')
686 def mount_option(self, profile, osc, mdc):
688 mount_option %s %s %s
689 quit""" % (profile, osc, mdc)
692 # delete mount options
693 def del_mount_option(self, profile):
699 def set_timeout(self, timeout):
705 def set_lustre_upcall(self, upcall):
710 # ============================================================
711 # Various system-level functions
712 # (ideally moved to their own module)
714 # Run a command and return the output and status.
715 # stderr is sent to /dev/null, could use popen3 to
716 # save it if necessary
719 if config.noexec: return (0, [])
720 f = os.popen(cmd + ' 2>&1')
730 cmd = string.join(map(str,args))
733 # Run a command in the background.
734 def run_daemon(*args):
735 cmd = string.join(map(str,args))
737 if config.noexec: return 0
738 f = os.popen(cmd + ' 2>&1')
746 # Determine full path to use for an external command
747 # searches dirname(argv[0]) first, then PATH
749 syspath = string.split(os.environ['PATH'], ':')
750 cmdpath = os.path.dirname(sys.argv[0])
751 syspath.insert(0, cmdpath);
753 syspath.insert(0, os.path.join(config.portals, 'utils/'))
755 prog = os.path.join(d,cmd)
756 if os.access(prog, os.X_OK):
760 # Recursively look for file starting at base dir
761 def do_find_file(base, mod):
762 fullname = os.path.join(base, mod)
763 if os.access(fullname, os.R_OK):
765 for d in os.listdir(base):
766 dir = os.path.join(base,d)
767 if os.path.isdir(dir):
768 module = do_find_file(dir, mod)
772 # is the path a block device?
779 return stat.S_ISBLK(s[stat.ST_MODE])
781 # find the journal device from mkfs options
787 while i < len(x) - 1:
788 if x[i] == '-J' and x[i+1].startswith('device='):
794 # build fs according to type
796 def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
802 panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
804 # devsize is in 1k, and fs block count is in 4k
805 block_cnt = devsize/4
807 if fstype in ('ext3', 'extN', 'ldiskfs'):
808 # ext3 journal size is in megabytes
809 # but don't set jsize if mkfsoptions indicates a separate journal device
810 if jsize == 0 and jdev(mkfsoptions) == '':
812 if not is_block(dev):
813 ret, out = runcmd("ls -l %s" %dev)
814 devsize = int(string.split(out[0])[4]) / 1024
816 # sfdisk works for symlink, hardlink, and realdev
817 ret, out = runcmd("sfdisk -s %s" %dev)
819 devsize = int(out[0])
821 # sfdisk -s will fail for too large block device,
822 # then, read the size of partition from /proc/partitions
824 # get the realpath of the device
825 # it may be the real device, such as /dev/hda7
826 # or the hardlink created via mknod for a device
827 if 'realpath' in dir(os.path):
828 real_dev = os.path.realpath(dev)
832 while os.path.islink(real_dev) and (link_count < 20):
833 link_count = link_count + 1
834 dev_link = os.readlink(real_dev)
835 if os.path.isabs(dev_link):
838 real_dev = os.path.join(os.path.dirname(real_dev), dev_link)
840 panic("Entountered too many symbolic links resolving block device:", dev)
842 # get the major and minor number of the realpath via ls
843 # it seems python(os.stat) does not return
844 # the st_rdev member of the stat structure
845 ret, out = runcmd("ls -l %s" %real_dev)
846 major = string.split(string.split(out[0])[4], ",")[0]
847 minor = string.split(out[0])[5]
849 # get the devsize from /proc/partitions with the major and minor number
850 ret, out = runcmd("cat /proc/partitions")
853 if string.split(line)[0] == major and string.split(line)[1] == minor:
854 devsize = int(string.split(line)[2])
857 if devsize > 1024 * 1024:
858 jsize = ((devsize / 102400) * 4)
861 if jsize: jopt = "-J size=%d" %(jsize,)
862 if isize: iopt = "-I %d" %(isize,)
863 mkfs = 'mkfs.ext2 -j -b 4096 '
864 if not isblock or config.force:
866 if jdev(mkfsoptions) != '':
867 jmkfs = 'mkfs.ext2 -b 4096 -O journal_dev '
869 jmkfs = jmkfs + '-F '
870 jmkfs = jmkfs + jdev(mkfsoptions)
871 (ret, out) = run (jmkfs)
873 panic("Unable format journal device:", jdev(mkfsoptions), string.join(out))
874 elif fstype == 'reiserfs':
875 # reiserfs journal size is in blocks
876 if jsize: jopt = "--journal_size %d" %(jsize,)
877 mkfs = 'mkreiserfs -ff'
879 panic('unsupported fs type: ', fstype)
881 if config.mkfsoptions != None:
882 mkfs = mkfs + ' ' + config.mkfsoptions
883 if mkfsoptions != None:
884 mkfs = mkfs + ' ' + mkfsoptions
885 (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt)
887 panic("Unable to build fs:", dev, string.join(out))
888 # enable hash tree indexing on fsswe
889 if fstype in ('ext3', 'extN', 'ldiskfs'):
890 htree = 'echo "feature FEATURE_C5" | debugfs -w'
891 (ret, out) = run (htree, dev)
893 panic("Unable to enable htree:", dev)
895 # some systems use /dev/loopN, some /dev/loop/N
899 if not os.access(loop + str(0), os.R_OK):
901 if not os.access(loop + str(0), os.R_OK):
902 panic ("can't access loop devices")
905 # find loop device assigned to the file
906 def find_assigned_loop(file):
908 for n in xrange(0, MAX_LOOP_DEVICES):
910 if os.access(dev, os.R_OK):
911 (stat, out) = run('losetup', dev)
912 if out and stat == 0:
913 m = re.search(r'\((.*)\)', out[0])
914 if m and file == m.group(1):
920 # create file if necessary and assign the first free loop device
921 def init_loop(file, size, fstype, journal_size, inode_size,
922 mkfsoptions, reformat, autoformat, backfstype, backfile):
925 realfstype = backfstype
926 if is_block(backfile):
927 if reformat or (need_format(realfstype, backfile) and autoformat == 'yes'):
928 mkfs(realfile, size, realfstype, journal_size, inode_size, mkfsoptions, isblock=0)
934 dev = find_assigned_loop(realfile)
936 print 'WARNING: file ', realfile, 'already mapped to', dev
939 if reformat or not os.access(realfile, os.R_OK | os.W_OK):
941 panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (realfile, size))
942 (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, realfile))
944 panic("Unable to create backing store:", realfile)
946 mkfs(realfile, size, realfstype, journal_size, inode_size,
947 mkfsoptions, isblock=0)
950 # find next free loop
951 for n in xrange(0, MAX_LOOP_DEVICES):
953 if os.access(dev, os.R_OK):
954 (stat, out) = run('losetup', dev)
956 print "attach " + realfile + " <-> " + dev
957 run('losetup', dev, realfile)
960 print "out of loop devices"
962 print "out of loop devices"
965 # undo loop assignment
966 def clean_loop(dev, fstype, backfstype, backdev):
971 if not is_block(realfile):
972 dev = find_assigned_loop(realfile)
974 print "detach " + dev + " <-> " + realfile
975 ret, out = run('losetup -d', dev)
977 log('unable to clean loop device:', dev, 'for file:', realfile)
980 # finilizes passed device
def clean_dev(dev, fstype, backfstype, backdev):
    """Finalize a device: tear down its loop mapping when it is either
    smfs-backed or file-backed (i.e. not a real block device)."""
    loop_backed = (fstype == 'smfs') or not is_block(dev)
    if loop_backed:
        clean_loop(dev, fstype, backfstype, backdev)
985 # determine if dev is formatted as a <fstype> filesystem
986 def need_format(fstype, dev):
987 # FIXME don't know how to implement this
990 # initialize a block device if needed
991 def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
992 inode_size, mkfsoptions, backfstype, backdev):
996 if fstype == 'smfs' or not is_block(dev):
997 dev = init_loop(dev, size, fstype, journal_size, inode_size,
998 mkfsoptions, reformat, autoformat, backfstype, backdev)
999 elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
1000 mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
1003 # panic("device:", dev,
1004 # "not prepared, and autoformat is not set.\n",
1005 # "Rerun with --reformat option to format ALL filesystems")
1010 """lookup IP address for an interface"""
1011 rc, out = run("/sbin/ifconfig", iface)
1014 addr = string.split(out[1])[1]
1015 ip = string.split(addr, ':')[1]
def def_mount_options(fstype, target):
    """Return the default mount options for the given fstype and target.

    fstype -- backing filesystem type ('ext3', 'ldiskfs', ...)
    target -- which service the device backs ('mds' or 'ost')

    Unsupported filesystem types get no extra options (empty string),
    rather than falling off the end and returning None.
    """
    if fstype == 'ext3' or fstype == 'ldiskfs':
        mountfsoptions = "errors=remount-ro"
        # 2.4 kernels need asyncdel on OST mounts; later kernels dropped it.
        if target == 'ost' and sys_get_branch() == '2.4':
            mountfsoptions = "%s,asyncdel" % (mountfsoptions)
        return mountfsoptions
    return ""
1027 def sys_get_elan_position_file():
1028 procfiles = ["/proc/elan/device0/position",
1029 "/proc/qsnet/elan4/device0/position",
1030 "/proc/qsnet/elan3/device0/position"]
1032 if os.access(p, os.R_OK):
1036 def sys_get_local_nid(net_type, wildcard, cluster_id):
1037 """Return the local nid."""
1039 if sys_get_elan_position_file():
1040 local = sys_get_local_address('elan', '*', cluster_id)
1042 local = sys_get_local_address(net_type, wildcard, cluster_id)
1045 def sys_get_local_address(net_type, wildcard, cluster_id):
1046 """Return the local address for the network type."""
1048 if net_type in ('tcp','openib','iib','ra'):
1050 iface, star = string.split(wildcard, ':')
1051 local = if2addr(iface)
1053 panic ("unable to determine ip for:", wildcard)
1055 host = socket.gethostname()
1056 local = socket.gethostbyname(host)
1057 elif net_type == 'elan':
1058 # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()'
1059 f = sys_get_elan_position_file()
1061 panic ("unable to determine local Elan ID")
1064 lines = fp.readlines()
1068 if a[0] == 'NodeId':
1072 nid = my_int(cluster_id) + my_int(elan_id)
1073 local = "%d" % (nid)
1074 except ValueError, e:
1078 elif net_type == 'lo':
1079 fixme("automatic local address for loopback")
1080 elif net_type == 'gm':
1081 fixme("automatic local address for GM")
1085 def sys_get_branch():
1086 """Returns kernel release"""
1088 fp = open('/proc/sys/kernel/osrelease')
1089 lines = fp.readlines()
1093 version = string.split(l)
1094 a = string.split(version[0], '.')
1095 return a[0] + '.' + a[1]
1100 # XXX: instead of device_list, ask for $name and see what we get
1101 def is_prepared(name):
1102 """Return true if a device exists for the name"""
1103 if config.lctl_dump:
1105 if (config.noexec or config.record) and config.cleanup:
1108 # expect this format:
1109 # 1 UP ldlm ldlm ldlm_UUID 2
1110 out = lctl.device_list()
1112 if name == string.split(s)[3]:
1114 except CommandError, e:
1118 def net_is_prepared():
1119 """If the any device exists, then assume that all networking
1120 has been configured"""
1121 out = lctl.device_list()
1124 def fs_is_mounted(path):
1125 """Return true if path is a mounted lustre filesystem"""
1127 fp = open('/proc/mounts')
1128 lines = fp.readlines()
1132 if a[1] == path and a[2] == 'lustre_lite':
1138 def kmod_find(src_dir, dev_dir, modname):
1139 modbase = src_dir +'/'+ dev_dir +'/'+ modname
1140 for modext in '.ko', '.o':
1141 module = modbase + modext
1143 if os.access(module, os.R_OK):
1149 def kmod_info(modname):
1150 """Returns reference count for passed module name."""
1152 fp = open('/proc/modules')
1153 lines = fp.readlines()
1156 # please forgive my tired fingers for this one
1157 ret = filter(lambda word, mod = modname: word[0] == mod,
1158 map(lambda line: string.split(line), lines))
1162 except Exception, e:
1166 """Presents kernel module"""
1167 def __init__(self, src_dir, dev_dir, name):
1168 self.src_dir = src_dir
1169 self.dev_dir = dev_dir
1174 log ('loading module:', self.name, 'srcdir',
1175 self.src_dir, 'devdir', self.dev_dir)
1177 module = kmod_find(self.src_dir, self.dev_dir,
1180 panic('module not found:', self.name)
1181 (rc, out) = run('/sbin/insmod', module)
1183 raise CommandError('insmod', out, rc)
1185 (rc, out) = run('/sbin/modprobe', self.name)
1187 raise CommandError('modprobe', out, rc)
1191 log('unloading module:', self.name)
1192 (rc, out) = run('/sbin/rmmod', self.name)
1194 log('unable to unload module:', self.name +
1195 "(" + self.refcount() + ")")
1199 """Returns module info if any."""
1200 return kmod_info(self.name)
1203 """Returns 1 if module is loaded. Otherwise 0 is returned."""
1210 """Returns module refcount."""
1217 """Returns 1 if module is used, otherwise 0 is returned."""
1223 if users and users != '(unused)' and users != '-':
1231 """Returns 1 if module is busy, otherwise 0 is returned."""
1232 if self.loaded() and (self.used() or self.refcount() != '0'):
1238 """Manage kernel modules"""
1239 def __init__(self, lustre_dir, portals_dir):
1240 self.lustre_dir = lustre_dir
1241 self.portals_dir = portals_dir
1242 self.kmodule_list = []
1244 def find_module(self, modname):
1245 """Find module by module name"""
1246 for mod in self.kmodule_list:
1247 if mod.name == modname:
1251 def add_portals_module(self, dev_dir, modname):
1252 """Append a module to list of modules to load."""
1254 mod = self.find_module(modname)
1256 mod = kmod(self.portals_dir, dev_dir, modname)
1257 self.kmodule_list.append(mod)
1259 def add_lustre_module(self, dev_dir, modname):
1260 """Append a module to list of modules to load."""
1262 mod = self.find_module(modname)
1264 mod = kmod(self.lustre_dir, dev_dir, modname)
1265 self.kmodule_list.append(mod)
1267 def load_modules(self):
1268 """Load all the modules in the list in the order they appear."""
1269 for mod in self.kmodule_list:
1270 if mod.loaded() and not config.noexec:
1274 def cleanup_modules(self):
1275 """Unload the modules in the list in reverse order."""
1276 rev = self.kmodule_list
1279 if (not mod.loaded() or mod.busy()) and not config.noexec:
1282 if mod.name == 'portals' and config.dump:
1283 lctl.dump(config.dump)
1286 # ============================================================
1287 # Classes to prepare and cleanup the various objects
1290 """ Base class for the rest of the modules. The default cleanup method is
1291 defined here, as well as some utilitiy funcs.
1293 def __init__(self, module_name, db):
1295 self.module_name = module_name
1296 self.name = self.db.getName()
1297 self.uuid = self.db.getUUID()
    def info(self, *args):
        # Print a one-line status message tagged with this module's
        # type, name and uuid; extra args are joined with spaces.
        msg = string.join(map(str,args))
        print self.module_name + ":", self.name, self.uuid, msg
1306 """ default cleanup, used for most modules """
1309 lctl.cleanup(self.name, self.uuid, config.force)
1310 except CommandError, e:
1311 log(self.module_name, "cleanup failed: ", self.name)
1315 def add_module(self, manager):
1316 """Adds all needed modules in the order they appear."""
1319 def safe_to_clean(self):
    def safe_to_clean_modules(self):
        # Default: modules may be unloaded whenever the device itself is
        # safe to clean; subclasses override safe_to_clean() to refine this.
        return self.safe_to_clean()
1325 class Network(Module):
1326 def __init__(self,db):
1327 Module.__init__(self, 'NETWORK', db)
1328 self.net_type = self.db.get_val('nettype')
1329 self.nid = self.db.get_val('nid', '*')
1330 self.cluster_id = self.db.get_val('clusterid', "0")
1331 self.port = self.db.get_val_int('port', 0)
1334 self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
1336 panic("unable to set nid for", self.net_type, self.nid, cluster_id)
1337 self.generic_nid = 1
1338 debug("nid:", self.nid)
1340 self.generic_nid = 0
1342 self.nid_uuid = self.nid_to_uuid(self.nid)
1343 self.hostaddr = self.db.get_hostaddr()
1344 if len(self.hostaddr) == 0:
1345 self.hostaddr.append(self.nid)
1346 if '*' in self.hostaddr[0]:
1347 self.hostaddr[0] = sys_get_local_address(self.net_type, self.hostaddr[0], self.cluster_id)
1348 if not self.hostaddr[0]:
1349 panic("unable to set hostaddr for", self.net_type, self.hostaddr[0], self.cluster_id)
1350 debug("hostaddr:", self.hostaddr[0])
1352 def add_module(self, manager):
1353 manager.add_portals_module("libcfs", 'libcfs')
1354 manager.add_portals_module("portals", 'portals')
1355 if node_needs_router():
1356 manager.add_portals_module("router", 'kptlrouter')
1357 if self.net_type == 'tcp':
1358 manager.add_portals_module("knals/socknal", 'ksocknal')
1359 if self.net_type == 'elan':
1360 manager.add_portals_module("knals/qswnal", 'kqswnal')
1361 if self.net_type == 'gm':
1362 manager.add_portals_module("knals/gmnal", 'kgmnal')
1363 if self.net_type == 'openib':
1364 manager.add_portals_module("knals/openibnal", 'kopenibnal')
1365 if self.net_type == 'iib':
1366 manager.add_portals_module("knals/iibnal", 'kiibnal')
1367 if self.net_type == 'lo':
1368 manager.add_portals_module("knals/lonal", 'klonal')
1369 if self.net_type == 'ra':
1370 manager.add_portals_module("knals/ranal", 'kranal')
    def nid_to_uuid(self, nid):
        # Canonical uuid under which this nid is registered with lctl
        # (stored as self.nid_uuid and used by add_uuid()/del_uuid()).
        return "NID_%s_UUID" %(nid,)
1376 if not config.record and net_is_prepared():
1378 self.info(self.net_type, self.nid, self.port)
1379 if not (config.record and self.generic_nid):
1380 lctl.network(self.net_type, self.nid)
1381 if self.net_type == 'tcp':
1383 for hostaddr in self.db.get_hostaddr():
1384 ip = string.split(hostaddr, '/')[0]
1385 if len(string.split(hostaddr, '/')) == 2:
1386 netmask = string.split(hostaddr, '/')[1]
1389 lctl.add_interface(self.net_type, ip, netmask)
1390 if self.net_type == 'elan':
1392 if self.port and node_is_router():
1393 run_one_acceptor(self.port)
1394 self.connect_peer_gateways()
1396 def connect_peer_gateways(self):
1397 for router in self.db.lookup_class('node'):
1398 if router.get_val_int('router', 0):
1399 for netuuid in router.get_networks():
1400 net = self.db.lookup(netuuid)
1402 if (gw.cluster_id == self.cluster_id and
1403 gw.net_type == self.net_type):
1404 if gw.nid != self.nid:
1407 def disconnect_peer_gateways(self):
1408 for router in self.db.lookup_class('node'):
1409 if router.get_val_int('router', 0):
1410 for netuuid in router.get_networks():
1411 net = self.db.lookup(netuuid)
1413 if (gw.cluster_id == self.cluster_id and
1414 gw.net_type == self.net_type):
1415 if gw.nid != self.nid:
1418 except CommandError, e:
1419 print "disconnect failed: ", self.name
    def safe_to_clean(self):
        # Only safe to tear the network down once no lustre devices
        # remain configured.
        return not net_is_prepared()
1427 self.info(self.net_type, self.nid, self.port)
1429 stop_acceptor(self.port)
1430 if node_is_router():
1431 self.disconnect_peer_gateways()
1432 if self.net_type == 'tcp':
1433 for hostaddr in self.db.get_hostaddr():
1434 ip = string.split(hostaddr, '/')[0]
1435 lctl.del_interface(self.net_type, ip)
1437 def correct_level(self, level, op=None):
1440 class RouteTable(Module):
    def __init__(self,db):
        # Route-table module; entries come from db.get_route_tbl().
        Module.__init__(self, 'ROUTES', db)
1444 def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
1446 # only setup connections for tcp, openib, and iib NALs
1448 if not net_type in ('tcp','openib','iib','ra'):
1451 # connect to target if route is to single node and this node is the gw
1452 if lo == hi and local_interface(net_type, gw_cluster_id, gw):
1453 if not local_cluster(net_type, tgt_cluster_id):
1454 panic("target", lo, " not on the local cluster")
1455 srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
1456 # connect to gateway if this node is not the gw
1457 elif (local_cluster(net_type, gw_cluster_id)
1458 and not local_interface(net_type, gw_cluster_id, gw)):
1459 srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
1464 panic("no server for nid", lo)
1467 return Network(srvdb)
1470 if not config.record and net_is_prepared():
1473 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1474 lctl.add_route(net_type, gw, lo, hi)
1475 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
    def safe_to_clean(self):
        # Routes may only be removed once no lustre devices remain
        # configured on the network.
        return not net_is_prepared()
1483 if net_is_prepared():
1484 # the network is still being used, don't clean it up
1486 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1487 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1490 lctl.disconnect(srv)
1491 except CommandError, e:
1492 print "disconnect failed: ", self.name
1497 lctl.del_route(net_type, gw, lo, hi)
1498 except CommandError, e:
1499 print "del_route failed: ", self.name
1503 class Management(Module):
    def __init__(self, db):
        # Management (MGMT) service device; its modules are queued
        # in add_module() and the device created in prepare().
        Module.__init__(self, 'MGMT', db)
1507 def add_module(self, manager):
1508 manager.add_lustre_module('lvfs', 'lvfs')
1509 manager.add_lustre_module('obdclass', 'obdclass')
1510 manager.add_lustre_module('ptlrpc', 'ptlrpc')
1511 manager.add_lustre_module('mgmt', 'mgmt_svc')
1514 if not config.record and is_prepared(self.name):
1517 lctl.newdev("mgmt", self.name, self.uuid)
1519 def safe_to_clean(self):
1523 if is_prepared(self.name):
1524 Module.cleanup(self)
1526 def correct_level(self, level, op=None):
1529 # This is only needed to load the modules; the LDLM device
1530 # is now created automatically.
    def __init__(self,db):
        # LDLM module wrapper: only needed to load the lock-manager
        # modules; the LDLM device itself is created automatically.
        Module.__init__(self, 'LDLM', db)
1535 def add_module(self, manager):
1536 manager.add_lustre_module('lvfs', 'lvfs')
1537 manager.add_lustre_module('obdclass', 'obdclass')
1538 manager.add_lustre_module('ptlrpc', 'ptlrpc')
1546 def correct_level(self, level, op=None):
# LOV (logical object volume) module region; class header line elided.
# __init__ reads stripe geometry from the config DB, generates a client uuid,
# and builds an OSC per LOV target; prepare sets up the lov device and adds
# each OSC obd; cleanup/add_module mirror that per target.
# NOTE(review): elided paste — conditionals/else branches are missing.
1550 def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
1551 Module.__init__(self, 'LOV', db)
1552 if name_override != None:
1553 self.name = "lov_%s" % name_override
1554 self.mds_uuid = self.db.get_first_ref('mds')
1555 self.stripe_sz = self.db.get_val_int('stripesize', 1048576)
1556 self.stripe_off = self.db.get_val_int('stripeoffset', 0)
1557 self.pattern = self.db.get_val_int('stripepattern', 0)
1558 self.devlist = self.db.get_lov_tgts('lov_tgt')
1559 self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
1562 self.desc_uuid = self.uuid
1563 self.uuid = generate_client_uuid(self.name)
1564 self.fs_name = fs_name
1566 self.config_only = 1
1568 self.config_only = None
1569 mds = self.db.lookup(self.mds_uuid)
1570 self.mds_name = mds.getName()
1571 for (obd_uuid, index, gen, active) in self.devlist:
1574 self.obdlist.append(obd_uuid)
1575 obd = self.db.lookup(obd_uuid)
1576 osc = get_osc(obd, self.uuid, fs_name)
1578 self.osclist.append((osc, index, gen, active))
1580 panic('osc not found:', obd_uuid)
# prepare (def line elided): lov_setup then lov_add_obd per target OSC.
1586 if not config.record and is_prepared(self.name):
1588 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
1589 self.stripe_off, self.pattern, self.devlist,
1591 lctl.lov_setup(self.name, self.uuid, self.desc_uuid, self.stripe_cnt,
1592 self.stripe_sz, self.stripe_off, self.pattern,
1593 string.join(self.obdlist))
1594 for (osc, index, gen, active) in self.osclist:
1595 target_uuid = osc.target_uuid
1597 # Only ignore connect failures with --force, which
1598 # isn't implemented here yet.
1600 osc.prepare(ignore_connect_failure=0)
1601 except CommandError, e:
1602 print "Error preparing OSC %s\n" % osc.uuid
1604 lctl.lov_add_obd(self.name, self.uuid, target_uuid, index, gen)
# cleanup (def line elided): config_only LOVs refuse cleanup via panic.
1607 for (osc, index, gen, active) in self.osclist:
1608 target_uuid = osc.target_uuid
1610 if is_prepared(self.name):
1611 Module.cleanup(self)
1612 if self.config_only:
1613 panic("Can't clean up config_only LOV ", self.name)
1615 def add_module(self, manager):
1616 if self.config_only:
1617 panic("Can't load modules for config_only LOV ", self.name)
1618 for (osc, index, gen, active) in self.osclist:
1619 osc.add_module(manager)
1621 manager.add_lustre_module('lov', 'lov')
1623 def correct_level(self, level, op=None):
# LMV (logical metadata volume) module region; class header line elided.
# Builds one MDC per MDS target; prepare wires them up then calls lmv_setup.
# NOTE(review): elided paste — try/else lines are missing.
1627 def __init__(self, db, uuid, fs_name, name_override = None):
1628 Module.__init__(self, 'LMV', db)
1629 if name_override != None:
1630 self.name = "lmv_%s" % name_override
1632 self.devlist = self.db.get_lmv_tgts('lmv_tgt')
1633 if self.devlist == None:
1634 self.devlist = self.db.get_refs('mds')
1637 self.desc_uuid = self.uuid
1639 self.fs_name = fs_name
1640 for mds_uuid in self.devlist:
1641 mds = self.db.lookup(mds_uuid)
1643 panic("MDS not found!")
1644 mdc = MDC(mds, self.uuid, fs_name)
1646 self.mdclist.append(mdc)
1648 panic('mdc not found:', mds_uuid)
# prepare (def line elided)
1651 if is_prepared(self.name):
1655 for mdc in self.mdclist:
1657 # Only ignore connect failures with --force, which
1658 # isn't implemented here yet.
1659 mdc.prepare(ignore_connect_failure=0)
1660 except CommandError, e:
1661 print "Error preparing LMV %s\n" % mdc.uuid
1664 lctl.lmv_setup(self.name, self.uuid, self.desc_uuid,
1665 string.join(self.devlist))
# cleanup (def line elided)
1668 for mdc in self.mdclist:
1670 if is_prepared(self.name):
1671 Module.cleanup(self)
1673 def add_module(self, manager):
1674 for mdc in self.mdclist:
1675 mdc.add_module(manager)
1677 manager.add_lustre_module('lmv', 'lmv')
1679 def correct_level(self, level, op=None):
# MDS device module: reads device/fs parameters from the config DB, picks an
# inode size from the LOV stripe count, formats/mounts the backing device,
# creates the "mds" obd, and can record client config logs (write_conf).
# NOTE(review): elided paste — many lines (else branches, try statements,
# returns) are missing; text preserved verbatim, comments only.
1682 class MDSDEV(Module):
1683 def __init__(self,db):
1684 Module.__init__(self, 'MDSDEV', db)
1685 self.devpath = self.db.get_val('devpath','')
1686 self.backdevpath = self.db.get_val('backdevpath','')
1687 self.size = self.db.get_val_int('devsize', 0)
1688 self.journal_size = self.db.get_val_int('journalsize', 0)
1689 self.fstype = self.db.get_val('fstype', '')
1690 self.backfstype = self.db.get_val('backfstype', '')
1691 self.nspath = self.db.get_val('nspath', '')
1692 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1693 self.mountfsoptions = self.db.get_val('mountfsoptions', '')
1694 self.obdtype = self.db.get_val('obdtype', '')
1695 self.root_squash = self.db.get_val('root_squash', '')
1696 self.no_root_squash = self.db.get_val('no_root_squash', '')
1697 # overwrite the orignal MDSDEV name and uuid with the MDS name and uuid
1698 target_uuid = self.db.get_first_ref('target')
1699 self.mds = self.db.lookup(target_uuid)
1700 self.name = self.mds.getName()
1701 self.client_uuids = self.mds.get_refs('client')
# If an LMV is configured, the LMV's client refs take precedence.
1706 lmv_uuid = self.db.get_first_ref('lmv')
1707 if lmv_uuid != None:
1708 self.lmv = self.db.lookup(lmv_uuid)
1709 if self.lmv != None:
1710 self.client_uuids = self.lmv.get_refs('client')
1712 # FIXME: if fstype not set, then determine based on kernel version
1713 self.format = self.db.get_val('autoformat', "no")
1714 if self.mds.get_val('failover', 0):
1715 self.failover_mds = 'f'
1717 self.failover_mds = 'n'
1718 active_uuid = get_active_target(self.mds)
1720 panic("No target device found:", target_uuid)
1721 if active_uuid == self.uuid:
1725 if self.active and config.group and config.group != self.mds.get_val('group'):
1728 # default inode inode for case when neither LOV either
1729 # LMV is accessible.
1730 self.inode_size = 256
1732 inode_size = self.db.get_val_int('inodesize', 0)
1733 if not inode_size == 0:
1734 self.inode_size = inode_size
1736 # find the LOV for this MDS
1737 lovconfig_uuid = self.mds.get_first_ref('lovconfig')
1738 if lovconfig_uuid or self.lmv != None:
1739 if self.lmv != None:
1740 lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
1741 lovconfig = self.lmv.lookup(lovconfig_uuid)
1742 lov_uuid = lovconfig.get_first_ref('lov')
1743 if lov_uuid == None:
1744 panic(self.mds.getName() + ": No LOV found for lovconfig ",
1747 lovconfig = self.mds.lookup(lovconfig_uuid)
1748 lov_uuid = lovconfig.get_first_ref('lov')
1749 if lov_uuid == None:
1750 panic(self.mds.getName() + ": No LOV found for lovconfig ",
1753 if self.lmv != None:
1754 lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
1755 lovconfig = self.lmv.lookup(lovconfig_uuid)
1756 lov_uuid = lovconfig.get_first_ref('lov')
1758 lov = LOV(self.db.lookup(lov_uuid), lov_uuid, self.name,
# Larger default stripe counts need bigger MDS inodes to hold the EA.
1761 # default stripe count controls default inode_size
1762 stripe_count = lov.stripe_cnt
1763 if stripe_count > 77:
1764 self.inode_size = 4096
1765 elif stripe_count > 35:
1766 self.inode_size = 2048
1767 elif stripe_count > 13:
1768 self.inode_size = 1024
1769 elif stripe_count > 3:
1770 self.inode_size = 512
1772 self.inode_size = 256
1774 self.target_dev_uuid = self.uuid
1775 self.uuid = target_uuid
# With an LMV configured, this MDS is mastered by an LMV client instance.
1778 if self.lmv != None:
1779 client_uuid = self.name + "_lmv_UUID"
1780 self.master = LMV(self.lmv, client_uuid,
1781 self.name, self.name)
1783 def add_module(self, manager):
1785 manager.add_lustre_module('mdc', 'mdc')
1786 manager.add_lustre_module('osc', 'osc')
1787 manager.add_lustre_module('ost', 'ost')
1788 manager.add_lustre_module('lov', 'lov')
1789 manager.add_lustre_module('mds', 'mds')
1791 if self.fstype == 'smfs' or self.fstype == 'ldiskfs':
1792 manager.add_lustre_module(self.fstype, self.fstype)
1795 manager.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
1797 # if fstype is smfs, then we should also take care about backing
1799 if self.fstype == 'smfs':
1800 manager.add_lustre_module(self.backfstype, self.backfstype)
1801 manager.add_lustre_module('lvfs', 'fsfilt_%s' % (self.backfstype))
1803 for option in string.split(self.mountfsoptions, ','):
1804 if option == 'snap':
1805 if not self.fstype == 'smfs':
1806 panic("mountoptions has 'snap', but fstype is not smfs.")
1807 manager.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.fstype))
1808 manager.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.backfstype))
1811 if self.master != None:
1812 self.master.add_module(manager)
1814 def get_mount_options(self, blkdev):
1815 options = def_mount_options(self.fstype, 'mds')
# Merge default, command-line, and per-device mount options (elided
# branches decide precedence when defaults are absent).
1817 if config.mountfsoptions:
1819 options = "%s,%s" %(options, config.mountfsoptions)
1821 options = config.mountfsoptions
1822 if self.mountfsoptions:
1823 options = "%s,%s" %(options, self.mountfsoptions)
1825 if self.mountfsoptions:
1827 options = "%s,%s" %(options, self.mountfsoptions)
1829 options = self.mountfsoptions
# smfs needs the backing fs type and device encoded in the options.
1831 if self.fstype == 'smfs':
1833 options = "%s,type=%s,dev=%s" %(options,
1834 self.backfstype, blkdev)
1836 options = "type=%s,dev=%s" %(self.backfstype, blkdev)
# prepare (def line elided)
1840 if not config.record and is_prepared(self.name):
1843 debug(self.uuid, "not active")
1846 # run write_conf automatically, if --reformat used
1851 if self.master != None:
1852 self.master.prepare()
1854 # never reformat here
1855 blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
1856 self.format, self.journal_size, self.inode_size,
1857 self.mkfsoptions, self.backfstype, self.backdevpath)
1859 if not is_prepared('MDT'):
1860 lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
1862 if self.fstype == 'smfs':
1863 realdev = self.fstype
1867 if self.obdtype == None:
1868 self.obdtype = 'dumb'
1870 if self.master == None:
1871 master_name = 'dumb'
1873 master_name = self.master.name
1875 if self.client_uuids == None:
1876 profile_name = 'dumb'
1878 profile_name = self.name
1880 mountfsoptions = self.get_mount_options(blkdev)
1882 self.info("mds", realdev, mountfsoptions, self.fstype, self.size,
1883 self.format, master_name, profile_name, self.obdtype)
1885 lctl.newdev("mds", self.name, self.uuid,
1886 setup = "%s %s %s %s %s %s" %(realdev,
1887 self.fstype, profile_name, mountfsoptions,
1888 master_name, self.obdtype))
1890 if development_mode():
1891 procentry = "/proc/fs/lustre/mds/grp_hash_upcall"
1892 upcall = os.path.abspath(os.path.dirname(sys.argv[0]) + "/l_getgroups")
1893 if not (os.access(procentry, os.R_OK) and os.access(upcall, os.R_OK)):
1894 print "MDS Warning: failed to set group-hash upcall"
1896 run("echo ", upcall, " > ", procentry)
1898 except CommandError, e:
1900 panic("MDS is missing the config log. Need to run " +
1901 "lconf --write_conf.")
# root_squash config: command-line values fall back to the device's.
1905 if config.root_squash == None:
1906 config.root_squash = self.root_squash
1907 if config.no_root_squash == None:
1908 config.no_root_squash = self.no_root_squash
1909 if config.root_squash:
1910 if config.no_root_squash:
1911 nsnid = config.no_root_squash
1914 lctl.root_squash(self.name, config.root_squash, nsnid)
1916 def write_conf(self):
1917 if not self.client_uuids:
1921 if not is_prepared(self.name):
1922 blkdev = block_dev(self.devpath, self.size, self.fstype,
1923 config.reformat, self.format, self.journal_size,
1924 self.inode_size, self.mkfsoptions,
1925 self.backfstype, self.backdevpath)
1927 if self.fstype == 'smfs':
1928 realdev = self.fstype
1932 # Even for writing logs we mount mds with supplied mount options
1933 # because it will not mount smfs (if used) otherwise.
1934 mountfsoptions = self.get_mount_options(blkdev)
1936 if self.obdtype == None:
1937 self.obdtype = 'dumb'
1939 self.info("mds", realdev, mountfsoptions, self.fstype, self.size,
1940 self.format, "dumb", "dumb", self.obdtype)
1942 lctl.newdev("mds", self.name, self.uuid,
1943 setup ="%s %s %s %s %s %s" %(realdev, self.fstype,
1944 'dumb', mountfsoptions,
1945 'dumb', self.obdtype))
1948 # record logs for all MDS clients
1949 for obd_uuid in self.client_uuids:
1950 log("recording client:", obd_uuid)
1952 client_uuid = generate_client_uuid(self.name)
1953 client = VOSC(self.db.lookup(obd_uuid), client_uuid,
1954 self.name, self.name)
1956 lctl.clear_log(self.name, self.name)
1957 lctl.record(self.name, self.name)
1959 lctl.mount_option(self.name, client.get_name(), "")
1961 process_updates(self.db, self.name, self.name, client)
1964 lctl.clear_log(self.name, self.name + '-clean')
1965 lctl.record(self.name, self.name + '-clean')
1967 lctl.del_mount_option(self.name)
1969 process_updates(self.db, self.name, self.name + '-clean', client)
1973 # record logs for each client
1979 config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
1981 config_options = CONFIG_FILE
1983 for node_db in self.db.lookup_class('node'):
1984 client_name = node_db.getName()
1985 for prof_uuid in node_db.get_refs('profile'):
1986 prof_db = node_db.lookup(prof_uuid)
1987 # refactor this into a funtion to test "clientness"
1989 for ref_class, ref_uuid in prof_db.get_all_refs():
1990 if ref_class in ('mountpoint','echoclient'):
1991 debug("recording", client_name)
1992 old_noexec = config.noexec
# Re-invoke lconf itself with --record to capture each client's
# setup and cleanup logs on this MDS device.
1994 ret, out = run (sys.argv[0], noexec_opt,
1995 " -v --record --nomod",
1996 "--record_log", client_name,
1997 "--record_device", self.name,
1998 "--node", client_name,
2001 for s in out: log("record> ", string.strip(s))
2002 ret, out = run (sys.argv[0], noexec_opt,
2003 "--cleanup -v --record --nomod",
2004 "--record_log", client_name + "-clean",
2005 "--record_device", self.name,
2006 "--node", client_name,
2009 for s in out: log("record> ", string.strip(s))
2010 config.noexec = old_noexec
2013 lctl.cleanup(self.name, self.uuid, 0, 0)
2014 except CommandError, e:
2015 log(self.module_name, "cleanup failed: ", self.name)
2018 Module.cleanup(self)
2020 clean_dev(self.devpath, self.fstype, self.backfstype,
2023 def msd_remaining(self):
2024 out = lctl.device_list()
2026 if string.split(s)[2] in ('mds',):
2029 def safe_to_clean(self):
2032 def safe_to_clean_modules(self):
2033 return not self.msd_remaining()
# cleanup (def line elided): tear down this MDS, its master, and the
# shared MDT device once no mds devices remain.
2037 debug(self.uuid, "not active")
2040 if is_prepared(self.name):
2042 lctl.cleanup(self.name, self.uuid, config.force,
2044 except CommandError, e:
2045 log(self.module_name, "cleanup failed: ", self.name)
2048 Module.cleanup(self)
2050 if self.master != None:
2051 self.master.cleanup()
2052 if not self.msd_remaining() and is_prepared('MDT'):
2054 lctl.cleanup("MDT", "MDT_UUID", config.force,
2056 except CommandError, e:
2057 print "cleanup failed: ", self.name
2061 clean_dev(self.devpath, self.fstype, self.backfstype,
2064 def correct_level(self, level, op=None):
2065 #if self.master != None:
# OSD (object storage device) module region; class header line elided.
# Mirrors MDSDEV: reads device parameters, loads fs/osd modules, formats and
# mounts the backing device, creates the osd obd and the shared OSS device.
# NOTE(review): elided paste — text preserved verbatim, comments only.
2070 def __init__(self, db):
2071 Module.__init__(self, 'OSD', db)
2072 self.osdtype = self.db.get_val('osdtype')
2073 self.devpath = self.db.get_val('devpath', '')
2074 self.backdevpath = self.db.get_val('backdevpath', '')
2075 self.size = self.db.get_val_int('devsize', 0)
2076 self.journal_size = self.db.get_val_int('journalsize', 0)
2077 self.inode_size = self.db.get_val_int('inodesize', 0)
2078 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
2079 self.mountfsoptions = self.db.get_val('mountfsoptions', '')
2080 self.fstype = self.db.get_val('fstype', '')
2081 self.backfstype = self.db.get_val('backfstype', '')
2082 self.nspath = self.db.get_val('nspath', '')
2083 target_uuid = self.db.get_first_ref('target')
2084 ost = self.db.lookup(target_uuid)
2085 self.name = ost.getName()
2086 self.format = self.db.get_val('autoformat', 'yes')
2087 if ost.get_val('failover', 0):
2088 self.failover_ost = 'f'
2090 self.failover_ost = 'n'
2092 active_uuid = get_active_target(ost)
2094 panic("No target device found:", target_uuid)
2095 if active_uuid == self.uuid:
2099 if self.active and config.group and config.group != ost.get_val('group'):
2102 self.target_dev_uuid = self.uuid
2103 self.uuid = target_uuid
2105 def add_module(self, manager):
2107 manager.add_lustre_module('ost', 'ost')
2109 if self.fstype == 'smfs' or self.fstype == 'ldiskfs':
2110 manager.add_lustre_module(self.fstype, self.fstype)
2113 manager.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
2115 if self.fstype == 'smfs':
2116 manager.add_lustre_module(self.backfstype, self.backfstype)
2117 manager.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.backfstype))
# NOTE(review): this iterates the option string character-by-character
# (MDSDEV uses string.split(..., ',')), so 'snap' can never match a
# single character — looks like a latent bug in the original; confirm
# against upstream before relying on snap support here.
2119 for option in self.mountfsoptions:
2120 if option == 'snap':
2121 if not self.fstype == 'smfs':
2122 panic("mountoptions with snap, but fstype is not smfs\n")
2123 manager.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.fstype))
2124 manager.add_lustre_module('lvfs', 'fsfilt_snap_%s' % (self.backfstype))
2126 manager.add_lustre_module(self.osdtype, self.osdtype)
2128 def get_mount_options(self, blkdev):
2129 options = def_mount_options(self.fstype, 'ost')
2131 if config.mountfsoptions:
2133 options = "%s,%s" %(options, config.mountfsoptions)
2135 options = config.mountfsoptions
2136 if self.mountfsoptions:
2137 options = "%s,%s" %(options, self.mountfsoptions)
2139 if self.mountfsoptions:
2141 options = "%s,%s" %(options, self.mountfsoptions)
2143 options = self.mountfsoptions
2145 if self.fstype == 'smfs':
2147 options = "%s,type=%s,dev=%s" %(options,
2148 self.backfstype, blkdev)
2150 options = "type=%s,dev=%s" %(self.backfstype,
# prepare (def line elided)
2154 # need to check /proc/mounts and /etc/mtab before
2155 # formatting anything.
2156 # FIXME: check if device is already formatted.
2158 if is_prepared(self.name):
2161 debug(self.uuid, "not active")
2164 if self.osdtype == 'obdecho':
2167 blkdev = block_dev(self.devpath, self.size, self.fstype,
2168 config.reformat, self.format, self.journal_size,
2169 self.inode_size, self.mkfsoptions, self.backfstype,
2172 if self.fstype == 'smfs':
2173 realdev = self.fstype
2177 mountfsoptions = self.get_mount_options(blkdev)
2179 self.info(self.osdtype, realdev, mountfsoptions, self.fstype,
2180 self.size, self.format, self.journal_size, self.inode_size)
2182 lctl.newdev(self.osdtype, self.name, self.uuid,
2183 setup ="%s %s %s %s" %(realdev, self.fstype,
2186 if not is_prepared('OSS'):
2187 lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
2189 def osd_remaining(self):
2190 out = lctl.device_list()
2192 if string.split(s)[2] in ('obdfilter', 'obdecho'):
2195 def safe_to_clean(self):
2198 def safe_to_clean_modules(self):
2199 return not self.osd_remaining()
# cleanup (def line elided): tear down this OSD and the shared OSS device
# once no obdfilter/obdecho devices remain.
2203 debug(self.uuid, "not active")
2205 if is_prepared(self.name):
2208 lctl.cleanup(self.name, self.uuid, config.force,
2210 except CommandError, e:
2211 log(self.module_name, "cleanup failed: ", self.name)
2214 if not self.osd_remaining() and is_prepared('OSS'):
2216 lctl.cleanup("OSS", "OSS_UUID", config.force,
2218 except CommandError, e:
2219 print "cleanup failed: ", self.name
2222 if not self.osdtype == 'obdecho':
2223 clean_dev(self.devpath, self.fstype, self.backfstype,
2226 def correct_level(self, level, op=None):
# Resolve the management-service uuid for a mountpoint name by following
# mountpoint -> filesystem -> mgmt references in the top-level config DB.
# NOTE(review): elided paste — the None-guard lines between lookups are
# missing from this view.
2229 def mgmt_uuid_for_fs(mtpt_name):
2232 mtpt_db = toplustreDB.lookup_name(mtpt_name)
2233 fs_uuid = mtpt_db.get_first_ref('filesystem')
2234 fs = toplustreDB.lookup(fs_uuid)
2237 return fs.get_first_ref('mgmt')
# Generic client-side module shared by OSC and MDC: resolves the active
# target device, locates its server networks, and creates the client obd,
# adding portals routes when no local server is reachable.
# NOTE(review): elided paste — text preserved verbatim, comments only.
2239 # Generic client module, used by OSC and MDC
2240 class Client(Module):
2241 def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
2243 self.target_name = tgtdb.getName()
2244 self.target_uuid = tgtdb.getUUID()
2245 self.module_dir = module_dir
2246 self.module = module
2250 self.tgt_dev_uuid = get_active_target(tgtdb)
2251 if not self.tgt_dev_uuid:
2252 panic("No target device found for target(1):", self.target_name)
2257 self.module = module
2258 self.module_name = string.upper(module)
# Default device name encodes module, host, target, and fs for uniqueness.
2260 self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
2261 self.target_name, fs_name)
2263 self.name = self_name
2265 self.lookup_server(self.tgt_dev_uuid)
2266 mgmt_uuid = mgmt_uuid_for_fs(fs_name)
2268 self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
2271 self.fs_name = fs_name
2272 if not self.module_dir:
2273 self.module_dir = module
2275 def add_module(self, manager):
2276 manager.add_lustre_module(self.module_dir, self.module)
2278 def lookup_server(self, srv_uuid):
2279 """ Lookup a server's network information """
2280 self._server_nets = get_ost_net(self.db, srv_uuid)
2281 if len(self._server_nets) == 0:
2282 panic ("Unable to find a server for:", srv_uuid)
2287 def get_servers(self):
2288 return self._server_nets
2290 def prepare(self, ignore_connect_failure = 0):
2291 self.info(self.target_uuid)
2292 if not config.record and is_prepared(self.name):
2295 srv = choose_local_server(self.get_servers())
# No local server: fall back to routed connectivity.
2299 routes = find_route(self.get_servers())
2300 if len(routes) == 0:
2301 panic ("no route to", self.target_uuid)
2302 for (srv, r) in routes:
2303 lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
2304 except CommandError, e:
2305 if not ignore_connect_failure:
2308 if self.permits_inactive() and (self.target_uuid in config.inactive or self.active == 0):
2309 debug("%s inactive" % self.target_uuid)
2310 inactive_p = "inactive"
2312 debug("%s active" % self.target_uuid)
2314 lctl.newdev(self.module, self.name, self.uuid,
2315 setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
2316 inactive_p, self.mgmt_name))
# cleanup (def line elided): disconnect server / remove routes, best-effort.
2319 if is_prepared(self.name):
2320 Module.cleanup(self)
2322 srv = choose_local_server(self.get_servers())
2324 lctl.disconnect(srv)
2326 for (srv, r) in find_route(self.get_servers()):
2327 lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
2328 except CommandError, e:
2329 log(self.module_name, "cleanup failed: ", self.name)
2333 def correct_level(self, level, op=None):
2336 def deactivate(self):
2338 lctl.deactivate(self.name)
2339 except CommandError, e:
2340 log(self.module_name, "deactivate failed: ", self.name)
# MDC and OSC: thin Client subclasses (class header lines elided) that pin
# the module name; permits_inactive bodies are missing from this view.
2345 def __init__(self, db, uuid, fs_name):
2346 Client.__init__(self, db, uuid, 'mdc', fs_name)
2348 def permits_inactive(self):
2352 def __init__(self, db, uuid, fs_name):
2353 Client.__init__(self, db, uuid, 'osc', fs_name)
2355 def permits_inactive(self):
def mgmtcli_name_for_uuid(uuid):
    """Derive the canonical management-client device name for *uuid*."""
    name = 'MGMTCLI_%s' % uuid
    return name
class ManagementClient(Client):
    """Client for the management service: a Client fixed to the mgmt_cli
    module, named after the management service's uuid."""
    def __init__(self, db, uuid):
        self_name = mgmtcli_name_for_uuid(db.getUUID())
        Client.__init__(self, db, uuid, 'mgmt_cli', '',
                        self_name = self_name,
                        module_dir = 'mgmt')
# Cache-manager OBD: pairs a master and a cache obd (each an LOV, MDC, or
# LMV depending on class) and creates a "cmobd" device over them.
# NOTE(review): elided paste — text preserved verbatim, comments only.
2367 class CMOBD(Module):
2368 def __init__(self, db):
2369 Module.__init__(self, 'CMOBD', db)
2370 self.name = self.db.getName();
2371 self.uuid = generate_client_uuid(self.name)
2372 self.master_uuid = self.db.get_first_ref('masterobd')
2373 self.cache_uuid = self.db.get_first_ref('cacheobd')
2375 master_obd = self.db.lookup(self.master_uuid)
2377 panic('master obd not found:', self.master_uuid)
2379 cache_obd = self.db.lookup(self.cache_uuid)
2381 panic('cache obd not found:', self.cache_uuid)
2386 master_class = master_obd.get_class()
2387 cache_class = cache_obd.get_class()
# Instantiate the right client wrapper for each side by obd class.
2389 if master_class == 'ost' or master_class == 'lov':
2390 client_uuid = "%s_lov_master_UUID" % (self.name)
2391 self.master = LOV(master_obd, client_uuid, self.name);
2392 elif master_class == 'mds':
2393 self.master = get_mdc(db, self.name, self.master_uuid)
2394 elif master_class == 'lmv':
2395 client_uuid = "%s_lmv_master_UUID" % (self.name)
2396 self.master = LMV(master_obd, client_uuid, self.name);
2398 panic("unknown master obd class '%s'" %(master_class))
2400 if cache_class == 'ost' or cache_class == 'lov':
2401 client_uuid = "%s_lov_cache_UUID" % (self.name)
2402 self.cache = LOV(cache_obd, client_uuid, self.name);
2403 elif cache_class == 'mds':
2404 self.cache = get_mdc(db, self.name, self.cache_uuid)
2405 elif cache_class == 'lmv':
2406 client_uuid = "%s_lmv_cache_UUID" % (self.name)
2407 self.cache = LMV(cache_obd, client_uuid, self.name);
2409 panic("unknown cache obd class '%s'" %(cache_class))
# prepare (def line elided)
2412 self.master.prepare()
2413 if not config.record and is_prepared(self.name):
2415 self.info(self.master_uuid, self.cache_uuid)
2416 lctl.newdev("cmobd", self.name, self.uuid,
2417 setup ="%s %s" %(self.master.uuid,
2426 def get_master_name(self):
2427 return self.master.name
2429 def get_cache_name(self):
2430 return self.cache.name
# cleanup (def line elided)
2433 if is_prepared(self.name):
2434 Module.cleanup(self)
2436 self.master.cleanup()
2438 def add_module(self, manager):
2439 manager.add_lustre_module('cmobd', 'cmobd')
2440 self.master.add_module(manager)
2442 def correct_level(self, level, op=None):
# Collaborative-cache OBD region (class header line elided): like CMOBD but
# prepares both master and cache and creates a "cobd" device over them.
# NOTE(review): elided paste — text preserved verbatim, comments only.
2446 def __init__(self, db, uuid, name):
2447 Module.__init__(self, 'COBD', db)
2448 self.name = self.db.getName();
2449 self.uuid = generate_client_uuid(self.name)
2450 self.master_uuid = self.db.get_first_ref('masterobd')
2451 self.cache_uuid = self.db.get_first_ref('cacheobd')
2453 master_obd = self.db.lookup(self.master_uuid)
2455 panic('master obd not found:', self.master_uuid)
2457 cache_obd = self.db.lookup(self.cache_uuid)
2459 panic('cache obd not found:', self.cache_uuid)
2464 master_class = master_obd.get_class()
2465 cache_class = cache_obd.get_class()
2467 if master_class == 'ost' or master_class == 'lov':
2468 client_uuid = "%s_lov_master_UUID" % (self.name)
2469 self.master = LOV(master_obd, client_uuid, name);
2470 elif master_class == 'mds':
2471 self.master = get_mdc(db, name, self.master_uuid)
2472 elif master_class == 'lmv':
2473 client_uuid = "%s_lmv_master_UUID" % (self.name)
2474 self.master = LMV(master_obd, client_uuid, self.name);
2476 panic("unknown master obd class '%s'" %(master_class))
2478 if cache_class == 'ost' or cache_class == 'lov':
2479 client_uuid = "%s_lov_cache_UUID" % (self.name)
2480 self.cache = LOV(cache_obd, client_uuid, name);
2481 elif cache_class == 'mds':
2482 self.cache = get_mdc(db, name, self.cache_uuid)
2483 elif cache_class == 'lmv':
2484 client_uuid = "%s_lmv_cache_UUID" % (self.name)
2485 self.cache = LMV(cache_obd, client_uuid, self.name);
2487 panic("unknown cache obd class '%s'" %(cache_class))
2495 def get_master_name(self):
2496 return self.master.name
2498 def get_cache_name(self):
2499 return self.cache.name
# prepare (def line elided): note COBD sets up with names, CMOBD with uuids.
2502 self.master.prepare()
2503 self.cache.prepare()
2504 if not config.record and is_prepared(self.name):
2506 self.info(self.master_uuid, self.cache_uuid)
2507 lctl.newdev("cobd", self.name, self.uuid,
2508 setup ="%s %s" %(self.master.name,
# cleanup (def line elided)
2512 if is_prepared(self.name):
2513 Module.cleanup(self)
2514 self.master.cleanup()
2515 self.cache.cleanup()
2517 def add_module(self, manager):
2518 manager.add_lustre_module('cobd', 'cobd')
2519 self.master.add_module(manager)
# VOSC region (class header line elided): virtual dispatcher that wraps an
# LOV, COBD, or plain OSC depending on the config class, and forwards the
# Module interface to it.
2521 # virtual interface for OSC and LOV
2523 def __init__(self, db, client_uuid, name, name_override = None):
2524 Module.__init__(self, 'VOSC', db)
2525 if db.get_class() == 'lov':
2526 self.osc = LOV(db, client_uuid, name, name_override)
2528 elif db.get_class() == 'cobd':
2529 self.osc = COBD(db, client_uuid, name)
2532 self.osc = OSC(db, client_uuid, name)
# accessor def lines elided
2536 return self.osc.get_uuid()
2539 return self.osc.get_name()
2547 def add_module(self, manager):
2548 self.osc.add_module(manager)
2550 def correct_level(self, level, op=None):
2551 return self.osc.correct_level(level, op)
# VMDC region (class header line elided): metadata-side twin of VOSC —
# wraps an LMV, COBD, or plain MDC and forwards the Module interface.
2553 # virtual interface for MDC and LMV
2555 def __init__(self, db, client_uuid, name, name_override = None):
2556 Module.__init__(self, 'VMDC', db)
2557 if db.get_class() == 'lmv':
2558 self.mdc = LMV(db, client_uuid, name, name_override)
2559 elif db.get_class() == 'cobd':
2560 self.mdc = COBD(db, client_uuid, name)
2562 self.mdc = MDC(db, client_uuid, name)
# accessor def lines elided; note these return attributes directly where
# VOSC goes through get_uuid()/get_name().
2565 return self.mdc.uuid
2568 return self.mdc.name
2576 def add_module(self, manager):
2577 self.mdc.add_module(manager)
2579 def correct_level(self, level, op=None):
2580 return self.mdc.correct_level(level, op)
# Echo client module: wraps its obd in a VOSC and creates an "echo_client"
# device set up against the VOSC's name. Used for testing.
2582 class ECHO_CLIENT(Module):
2583 def __init__(self,db):
2584 Module.__init__(self, 'ECHO_CLIENT', db)
2585 self.obd_uuid = self.db.get_first_ref('obd')
2586 obd = self.db.lookup(self.obd_uuid)
2587 self.uuid = generate_client_uuid(self.name)
2588 self.osc = VOSC(obd, self.uuid, self.name)
# prepare (def line elided)
2591 if not config.record and is_prepared(self.name):
2594 self.osc.prepare() # XXX This is so cheating. -p
2595 self.info(self.obd_uuid)
2597 lctl.newdev("echo_client", self.name, self.uuid,
2598 setup = self.osc.get_name())
# cleanup (def line elided)
2601 if is_prepared(self.name):
2602 Module.cleanup(self)
2605 def add_module(self, manager):
2606 self.osc.add_module(manager)
2607 manager.add_lustre_module('obdecho', 'obdecho')
2609 def correct_level(self, level, op=None):
# Build a pseudo-unique client uuid from the name plus random hex fields,
# truncated to the 36-character uuid length. NOTE(review): one format
# argument line (original 2614) is elided from this view, and random (not
# secrets) means this is not collision-proof — presumably acceptable here.
2612 def generate_client_uuid(name):
2613 client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
2615 int(random.random() * 1048576),
2616 int(random.random() * 1048576))
2617 return client_uuid[:36]
# Client mountpoint module: resolves the filesystem's mds/obd/mgmt refs,
# builds VOSC/VMDC wrappers, and mounts lustre_lite at self.path (or records
# the mount option when --record/--lctl_dump is in effect).
# NOTE(review): elided paste — text preserved verbatim, comments only.
2619 class Mountpoint(Module):
2620 def __init__(self,db):
2621 Module.__init__(self, 'MTPT', db)
2622 self.path = self.db.get_val('path')
2623 self.clientoptions = self.db.get_val('clientoptions', '')
2624 self.fs_uuid = self.db.get_first_ref('filesystem')
2625 fs = self.db.lookup(self.fs_uuid)
2626 self.mds_uuid = fs.get_first_ref('lmv')
2627 if not self.mds_uuid:
2628 self.mds_uuid = fs.get_first_ref('mds')
2629 self.obd_uuid = fs.get_first_ref('obd')
2630 self.mgmt_uuid = fs.get_first_ref('mgmt')
2631 client_uuid = generate_client_uuid(self.name)
2633 ost = self.db.lookup(self.obd_uuid)
2635 panic("no ost: ", self.obd_uuid)
2637 mds = self.db.lookup(self.mds_uuid)
2639 panic("no mds: ", self.mds_uuid)
2641 self.vosc = VOSC(ost, client_uuid, self.name, self.name)
2642 self.vmdc = VMDC(mds, client_uuid, self.name, self.name)
2645 self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
# prepare (def line elided)
2651 if not config.record and fs_is_mounted(self.path):
2652 log(self.path, "already mounted.")
2656 self.mgmtcli.prepare()
2659 vmdc_name = self.vmdc.get_name()
2661 self.info(self.path, self.mds_uuid, self.obd_uuid)
2662 if config.record or config.lctl_dump:
2663 lctl.mount_option(local_node_name, self.vosc.get_name(), vmdc_name)
# Merge command-line client options into the device's, comma-separated.
2666 if config.clientoptions:
2667 if self.clientoptions:
2668 self.clientoptions = self.clientoptions + ',' + \
2669 config.clientoptions
2671 self.clientoptions = config.clientoptions
2672 if self.clientoptions:
2673 self.clientoptions = ',' + self.clientoptions
2674 # Linux kernel will deal with async and not pass it to ll_fill_super,
2675 # so replace it with Lustre async
2676 self.clientoptions = string.replace(self.clientoptions, "async",
2679 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s%s %s %s" % \
2680 (self.vosc.get_name(), vmdc_name, self.clientoptions,
2681 config.config, self.path)
2682 run("mkdir", self.path)
2687 panic("mount failed:", self.path, ":", string.join(val))
# cleanup (def line elided): umount (forced under --force), then tear down
# the client stacks.
2690 self.info(self.path, self.mds_uuid,self.obd_uuid)
2692 if config.record or config.lctl_dump:
2693 lctl.del_mount_option(local_node_name)
2695 if fs_is_mounted(self.path):
2697 (rc, out) = run("umount", "-f", self.path)
2699 (rc, out) = run("umount", self.path)
2701 raise CommandError('umount', out, rc)
2703 if fs_is_mounted(self.path):
2704 panic("fs is still mounted:", self.path)
2709 self.mgmtcli.cleanup()
2711 def add_module(self, manager):
2712 manager.add_lustre_module('mdc', 'mdc')
2715 self.mgmtcli.add_module(manager)
2717 self.vosc.add_module(manager)
2718 self.vmdc.add_module(manager)
2720 manager.add_lustre_module('llite', 'llite')
2722 def correct_level(self, level, op=None):
# Misc query helpers (these appear to be functions later bound as methods
# onto the DB class — they take `self` but are written at top level).
# NOTE(review): elided paste — return statements and level constants are
# missing from this view; text preserved verbatim, comments only.
2725 # ============================================================
2726 # misc query functions
2728 def get_ost_net(self, osd_uuid):
2732 osd = self.lookup(osd_uuid)
2733 node_uuid = osd.get_first_ref('node')
2734 node = self.lookup(node_uuid)
2736 panic("unable to find node for osd_uuid:", osd_uuid,
2737 " node_ref:", node_uuid_)
2738 for net_uuid in node.get_networks():
2739 db = node.lookup(net_uuid)
2740 srv_list.append(Network(db))
# Startup ordering is driven by per-type service levels.
2744 # the order of iniitailization is based on level.
2745 def getServiceLevel(self):
2746 type = self.get_class()
2748 if type in ('network',):
2750 elif type in ('routetbl',):
2752 elif type in ('ldlm',):
2754 elif type in ('osd', 'cobd'):
2756 elif type in ('mdsdev',):
2758 elif type in ('lmv',):
2760 elif type in ('cmobd',):
2762 elif type in ('mountpoint', 'echoclient'):
2765 panic("Unknown type: ", type)
2767 if ret < config.minlevel or ret > config.maxlevel:
2772 # return list of services in a profile. list is a list of tuples
2773 # [(level, db_object),]
2774 def getServices(self):
2776 for ref_class, ref_uuid in self.get_all_refs():
2777 servdb = self.lookup(ref_uuid)
2779 level = getServiceLevel(servdb)
2781 list.append((level, servdb))
2783 panic('service not found: ' + ref_uuid)
# OSC/MDC factory helpers — OSC is no longer described in the XML config, so
# these fabricate client objects from the target's DB entry.
# NOTE(review): return statements are elided from this view.
2789 ############################################################
2791 # FIXME: clean this mess up!
2793 # OSC is no longer in the xml, so we have to fake it.
2794 # this is getting ugly and begging for another refactoring
2795 def get_osc(ost_db, uuid, fs_name):
2796 osc = OSC(ost_db, uuid, fs_name)
2799 def get_mdc(db, fs_name, mds_uuid):
2800 mds_db = db.lookup(mds_uuid);
2802 error("no mds:", mds_uuid)
2803 mdc = MDC(mds_db, mds_uuid, fs_name)
2806 ############################################################
2807 # routing ("rooting")
2809 # list of (nettype, cluster_id, nid)
# Populate the global 'local_clusters' list with (nettype, cluster_id, nid)
# tuples for every network on this node, and register an AcceptorHandler
# for each listening port (duplicate ports are a fatal config error).
2812 def find_local_clusters(node_db):
2813 global local_clusters
2814 for netuuid in node_db.get_networks():
2815 net = node_db.lookup(netuuid)
2817 debug("add_local", netuuid)
2818 local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
2820 if acceptors.has_key(srv.port):
2821 panic("duplicate port:", srv.port)
2822 acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type)
2824 # This node is a gateway.
# True when the current node is configured as a portals router.
# NOTE(review): body elided in this extraction -- presumably returns the
# global 'is_router'; confirm against full source.
2826 def node_is_router():
2829 # If there are any routers found in the config, then this will be true
2830 # and all nodes will load kptlrouter.
# A node must load kptlrouter either because it routes itself (is_router)
# or because some route in the config requires it (needs_router).
2832 def node_needs_router():
2833 return needs_router or is_router
2835 # list of (nettype, gw, tgt_cluster_id, lo, hi)
2836 # Currently, these local routes are only added to kptlrouter route
2837 # table if they are needed to connect to a specific server. This
2838 # should be changed so all available routes are loaded, and the
2839 # ptlrouter can make all the decisions.
# Scan every 'node' marked as a router and record, per local cluster, the
# routes reachable through a gateway nid on a matching nettype/cluster.
# Results accumulate in the global 'local_routes'.
2842 def find_local_routes(lustre):
2843 """ Scan the lustre config looking for routers . Build list of
2845 global local_routes, needs_router
2847 list = lustre.lookup_class('node')
2849 if router.get_val_int('router', 0):
2851 for (local_type, local_cluster_id, local_nid) in local_clusters:
2853 for netuuid in router.get_networks():
2854 db = router.lookup(netuuid)
2855 if (local_type == db.get_val('nettype') and
2856 local_cluster_id == db.get_val('clusterid')):
2857 gw = db.get_val('nid')
2860 debug("find_local_routes: gw is", gw)
2861 for route in router.get_local_routes(local_type, gw):
2862 local_routes.append(route)
2863 debug("find_local_routes:", local_routes)
# Return the first server in srv_list that lives on one of this node's
# local clusters (directly reachable without routing).
2866 def choose_local_server(srv_list):
2867 for srv in srv_list:
2868 if local_cluster(srv.net_type, srv.cluster_id):
# Is (net_type, cluster_id) one of this node's local clusters?
2871 def local_cluster(net_type, cluster_id):
2872 for cluster in local_clusters:
2873 if net_type == cluster[0] and cluster_id == cluster[1]:
# Like local_cluster(), but also requires an exact nid match -- i.e. the
# address belongs to one of this node's own interfaces.
2877 def local_interface(net_type, cluster_id, nid):
2878 for cluster in local_clusters:
2879 if (net_type == cluster[0] and cluster_id == cluster[1]
2880 and nid == cluster[2]):
# For each candidate server, find local routes whose nid range [r[3], r[4]]
# covers the server's nid and whose target cluster matches; returns the
# accumulated (srv, route) pairs.
2884 def find_route(srv_list):
2886 frm_type = local_clusters[0][0]
2887 for srv in srv_list:
2888 debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
2889 to_type = srv.net_type
2891 cluster_id = srv.cluster_id
2892 debug ('looking for route to', to_type, to)
2893 for r in local_routes:
2894 debug("find_route: ", r)
2895 if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
2896 result.append((srv, r))
# Resolve which device UUID is currently active for a failover target:
# an explicit --select mapping wins, otherwise the 'active' reference.
2899 def get_active_target(db):
2900 target_uuid = db.getUUID()
2901 target_name = db.getName()
2902 node_name = get_select(target_name)
2904 tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
2906 tgt_dev_uuid = db.get_first_ref('active')
# Find the Network whose nid_uuid matches, scanning all 'network' entries.
2909 def get_server_by_nid_uuid(db, nid_uuid):
2910 for n in db.lookup_class("network"):
2912 if net.nid_uuid == nid_uuid:
2916 ############################################################
# Factory: map a config-class name to the matching service object
# (LOV, Network, COBD, MDSDEV, Mountpoint, ...); panics on unknown types.
# NOTE(review): the 'def newService(db):' line and several constructor
# branches are elided in this extraction.
2920 type = db.get_class()
2921 debug('Service:', type, db.getName(), db.getUUID())
2926 n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
2927 elif type == 'network':
2929 elif type == 'routetbl':
2933 elif type == 'cobd':
2934 n = COBD(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
2935 elif type == 'cmobd':
2937 elif type == 'mdsdev':
2939 elif type == 'mountpoint':
2941 elif type == 'echoclient':
2946 panic ("unknown service type:", type)
2950 # Prepare the system to run lustre using a particular profile
2951 # in the configuration.
2952 # * load the modules
2953 # * setup networking for the current node
2954 # * make sure partitions are in place and prepared
2955 # * initialize devices with lctl
2956 # Levels is important, and needs to be enforced.
# Apply 'operation' (doSetup/doCleanup/doLoadModules/...) to the services
# of each profile UUID in prof_list; panics on a dangling profile ref.
2957 def for_each_profile(db, prof_list, operation):
2958 for prof_uuid in prof_list:
2959 prof_db = db.lookup(prof_uuid)
2961 panic("profile:", prof_uuid, "not found.")
2962 services = getServices(prof_db)
2965 def magic_get_osc(db, rec, lov):
2967 lov_uuid = lov.get_uuid()
2968 lov_name = lov.osc.fs_name
2970 lov_uuid = rec.getAttribute('lov_uuidref')
2971 # FIXME: better way to find the mountpoint?
2972 filesystems = db.root_node.getElementsByTagName('filesystem')
2974 for fs in filesystems:
2975 ref = fs.getElementsByTagName('obd_ref')
2976 if ref[0].getAttribute('uuidref') == lov_uuid:
2977 fsuuid = fs.getAttribute('uuid')
2981 panic("malformed xml: lov uuid '" + lov_uuid + "' referenced in 'add' record is not used by any filesystems.")
2983 mtpts = db.root_node.getElementsByTagName('mountpoint')
2986 ref = fs.getElementsByTagName('filesystem_ref')
2987 if ref[0].getAttribute('uuidref') == fsuuid:
2988 lov_name = fs.getAttribute('name')
2992 panic("malformed xml: 'add' record references lov uuid '" + lov_uuid + "', which references filesystem uuid '" + fsuuid + "', which does not reference a mountpoint.")
2994 print "lov_uuid: " + lov_uuid + "; lov_name: " + lov_name
2996 ost_uuid = rec.getAttribute('ost_uuidref')
2997 obd = db.lookup(ost_uuid)
3000 panic("malformed xml: 'add' record references ost uuid '" + ost_uuid + "' which cannot be found.")
3002 osc = get_osc(obd, lov_uuid, lov_name)
3004 panic('osc not found:', obd_uuid)
3007 # write logs for update records. sadly, logs of all types -- and updates in
3008 # particular -- are something of an afterthought. lconf needs to be rewritten
3009 # these as core concepts. so this is a pretty big hack.
# Replay one <update> element from the config: for each child record
# (add / deactivate / delete) resolve the OSC it names and drive the
# matching lctl lov_{add,del}_obd / prepare / cleanup sequence.
# Acknowledged hack -- see the comment above about logs being an
# afterthought in lconf's design.
3010 def process_update_record(db, update, lov):
3011 for rec in update.childNodes:
3012 if rec.nodeType != rec.ELEMENT_NODE:
3015 log("found "+rec.nodeName+" record in update version " +
3016 str(update.getAttribute('version')))
# All four attributes are mandatory on every record type.
3018 lov_uuid = rec.getAttribute('lov_uuidref')
3019 ost_uuid = rec.getAttribute('ost_uuidref')
3020 index = rec.getAttribute('index')
3021 gen = rec.getAttribute('generation')
3023 if not lov_uuid or not ost_uuid or not index or not gen:
3024 panic("malformed xml: 'update' record requires lov_uuid, ost_uuid, index, and generation.")
3027 tmplov = db.lookup(lov_uuid)
3029 panic("malformed xml: 'delete' record contains lov UUID '" + lov_uuid + "', which cannot be located.")
3030 lov_name = tmplov.getName()
3032 lov_name = lov.osc.name
3034 # ------------------------------------------------------------- add
3035 if rec.nodeName == 'add':
3037 lctl.lov_del_obd(lov_name, lov_uuid, ost_uuid, index, gen)
3040 osc = magic_get_osc(db, rec, lov)
3043 # Only ignore connect failures with --force, which
3044 # isn't implemented here yet.
3045 osc.prepare(ignore_connect_failure=0)
3046 except CommandError, e:
3047 print "Error preparing OSC %s\n" % osc.uuid
3050 lctl.lov_add_obd(lov_name, lov_uuid, ost_uuid, index, gen)
3052 # ------------------------------------------------------ deactivate
3053 elif rec.nodeName == 'deactivate':
3057 osc = magic_get_osc(db, rec, lov)
3061 except CommandError, e:
3062 print "Error deactivating OSC %s\n" % osc.uuid
3065 # ---------------------------------------------------------- delete
3066 elif rec.nodeName == 'delete':
3070 osc = magic_get_osc(db, rec, lov)
3076 except CommandError, e:
3077 print "Error cleaning up OSC %s\n" % osc.uuid
3080 lctl.lov_del_obd(lov_name, lov_uuid, ost_uuid, index, gen)
# Walk every <update> element in the config and record each non-empty one
# into its own versioned log ("<log_name>-<version>") on log_device,
# replaying the records via process_update_record().
3082 def process_updates(db, log_device, log_name, lov = None):
3083 updates = db.root_node.getElementsByTagName('update')
3085 if not u.childNodes:
3086 log("ignoring empty update record (version " +
3087 str(u.getAttribute('version')) + ")")
3090 version = u.getAttribute('version')
3091 real_name = "%s-%s" % (log_name, version)
3092 lctl.clear_log(log_device, real_name)
3093 lctl.record(log_device, real_name)
3095 process_update_record(db, u, lov)
# --write_conf pass: only mdsdev services participate in writing the
# client config log.
3099 def doWriteconf(services):
3103 if s[1].get_class() == 'mdsdev':
3104 n = newService(s[1])
# Instantiate each service, re-sort by its corrected level, then bring the
# services up in ascending level order.
3107 def doSetup(services):
3112 n = newService(s[1])
3114 slist.append((n.level, n))
3117 nl = n[1].correct_level(n[0])
3118 nlist.append((nl, n[1]))
# Register every module each service needs with the global module
# manager, then load them all in one pass.
3123 def doLoadModules(services):
3127 # adding all needed modules from all services
3129 n = newService(s[1])
3130 n.add_module(mod_manager)
3132 # loading all registered modules
3133 mod_manager.load_modules()
# Mirror of doLoadModules for teardown: collect modules from services that
# report it is safe to clean them, then unload the lot.
3135 def doUnloadModules(services):
3139 # adding all needed modules from all services
3141 n = newService(s[1])
3142 if n.safe_to_clean_modules():
3143 n.add_module(mod_manager)
3145 # unloading all registered modules
3146 mod_manager.cleanup_modules()
# Tear services down: build the (level, service) list, re-sort by corrected
# level, and clean up each service that says it is safe to do so
# (descending level order -- the reverse of doSetup).
3148 def doCleanup(services):
3154 n = newService(s[1])
3156 slist.append((n.level, n))
3159 nl = n[1].correct_level(n[0])
3160 nlist.append((nl, n[1]))
3165 if n[1].safe_to_clean():
# Top-level driver for one host: locate this host's node entry, pull its
# tunables (upcalls, timeout, debug masks), discover local clusters and
# routes, then dispatch on the requested mode (--write_conf, --recover,
# --cleanup, or normal setup) across the node's profiles.
3170 def doHost(lustreDB, hosts):
3171 global is_router, local_node_name
3174 node_db = lustreDB.lookup_name(h, 'node')
3178 panic('No host entry found.')
# Node-level settings; each can later be overridden from the command line
# by the corresponding sys_set_* helper.
3180 local_node_name = node_db.get_val('name', 0)
3181 is_router = node_db.get_val_int('router', 0)
3182 lustre_upcall = node_db.get_val('lustreUpcall', '')
3183 portals_upcall = node_db.get_val('portalsUpcall', '')
3184 timeout = node_db.get_val_int('timeout', 0)
3185 ptldebug = node_db.get_val('ptldebug', '')
3186 subsystem = node_db.get_val('subsystem', '')
3188 find_local_clusters(node_db)
3190 find_local_routes(lustreDB)
3192 # Two step process: (1) load modules, (2) setup lustre
3193 # if not cleaning, load modules first.
3194 prof_list = node_db.get_refs('profile')
3196 if config.write_conf:
3197 for_each_profile(node_db, prof_list, doLoadModules)
3199 for_each_profile(node_db, prof_list, doWriteconf)
3200 for_each_profile(node_db, prof_list, doUnloadModules)
3203 elif config.recover:
3204 if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
3205 raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
3206 "--client_uuid <UUID> --conn_uuid <UUID>")
3207 doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
3209 elif config.cleanup:
3211 # the command line can override this value
3213 # ugly hack, only need to run lctl commands for --dump
3214 if config.lctl_dump or config.record:
3215 for_each_profile(node_db, prof_list, doCleanup)
3218 sys_set_timeout(timeout)
3219 sys_set_ptldebug(ptldebug)
3220 sys_set_subsystem(subsystem)
3221 sys_set_lustre_upcall(lustre_upcall)
3222 sys_set_portals_upcall(portals_upcall)
3224 for_each_profile(node_db, prof_list, doCleanup)
3225 for_each_profile(node_db, prof_list, doUnloadModules)
3229 # ugly hack, only need to run lctl commands for --dump
3230 if config.lctl_dump or config.record:
3231 sys_set_timeout(timeout)
3232 sys_set_lustre_upcall(lustre_upcall)
3233 for_each_profile(node_db, prof_list, doSetup)
# Bump kernel socket buffer ceilings before loading modules.
3237 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
3238 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
3240 for_each_profile(node_db, prof_list, doLoadModules)
3242 sys_set_debug_path()
3243 sys_set_ptldebug(ptldebug)
3244 sys_set_subsystem(subsystem)
3245 script = config.gdb_script
3246 run(lctl.lctl, ' modules >', script)
3248 log ("The GDB module script is in", script)
3249 # pause, so user has time to break and
3252 sys_set_timeout(timeout)
3253 sys_set_lustre_upcall(lustre_upcall)
3254 sys_set_portals_upcall(portals_upcall)
3256 for_each_profile(node_db, prof_list, doSetup)
# Client recovery: find the active device behind a failed target, pick a
# locally-reachable server for it, disconnect the dead connection
# (best effort -- failures are only logged), and reconnect the client.
3259 def doRecovery(lustreDB, lctl, tgt_uuid, client_uuid, nid_uuid):
3260 tgt = lustreDB.lookup(tgt_uuid)
3262 raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
3263 new_uuid = get_active_target(tgt)
3265 raise Lustre.LconfError("doRecovery: no active target found for: " +
3267 net = choose_local_server(get_ost_net(lustreDB, new_uuid))
3269 raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
3271 log("Reconnecting", tgt_uuid, " to ", net.nid_uuid);
3273 oldnet = get_server_by_nid_uuid(lustreDB, nid_uuid)
3276 lctl.disconnect(oldnet)
3277 except CommandError, e:
3278 log("recover: disconnect", nid_uuid, "failed: ")
3283 except CommandError, e:
3284 log("recover: connect failed")
3287 lctl.recover(client_uuid, net.nid_uuid)
# Derive config.lustre / config.portals module search paths.  In
# development mode they default to the source tree next to the lconf
# binary; a relative --portals is always normalized against --lustre.
3290 def setupModulePath(cmd, portals_dir = PORTALS_DIR):
3291 base = os.path.dirname(cmd)
3292 if development_mode():
3293 if not config.lustre:
3294 debug('using objdir module paths')
3295 config.lustre = (os.path.join(base, ".."))
3296 # normalize the portals dir, using command line arg if set
3298 portals_dir = config.portals
3299 dir = os.path.join(config.lustre, portals_dir)
3300 config.portals = dir
3301 debug('config.portals', config.portals)
3302 elif config.lustre and config.portals:
3304 # if --lustre and --portals, normalize portals
3305 # can ignore PORTALS_DIR here, since it is probably useless here
3306 config.portals = os.path.join(config.lustre, config.portals)
3307 debug('config.portals B', config.portals)
# Write 'val' to /proc/sys/<path> (the moral equivalent of sysctl -w).
3309 def sysctl(path, val):
3310 debug("+ sysctl", path, val)
3314 fp = open(os.path.join('/proc/sys', path), 'w')
# Point the portals debug dump at the path chosen via --debug_path.
3321 def sys_set_debug_path():
3322 sysctl('portals/debug_path', config.debug_path)
# Install the lustre upcall script; command-line --lustre_upcall (or the
# combined --upcall) overrides the node-config value passed in.
3324 def sys_set_lustre_upcall(upcall):
3325 # the command overrides the value in the node config
3326 if config.lustre_upcall:
3327 upcall = config.lustre_upcall
3329 upcall = config.upcall
3331 lctl.set_lustre_upcall(upcall)
# Same override logic as sys_set_lustre_upcall, but the portals upcall is
# set directly through /proc/sys/portals/upcall.
3333 def sys_set_portals_upcall(upcall):
3334 # the command overrides the value in the node config
3335 if config.portals_upcall:
3336 upcall = config.portals_upcall
3338 upcall = config.upcall
3340 sysctl('portals/upcall', upcall)
# Set the lustre recovery timeout; a positive --timeout wins over the
# node-config value, and non-positive values are ignored entirely.
3342 def sys_set_timeout(timeout):
3343 # the command overrides the value in the node config
3344 if config.timeout and config.timeout > 0:
3345 timeout = config.timeout
3346 if timeout != None and timeout > 0:
3347 lctl.set_timeout(timeout)
# socknal tuning: on 2.6 kernels reserve min_free_kbytes (MemTotal/16 on
# boxes under 256MB) so skb allocation under read load does not exhaust
# RAM; honor --single_socket by disabling typed sockets.
3349 def sys_tweak_socknal ():
3350 # reserve at least 8MB, or we run out of RAM in skb_alloc under read
3351 if sys_get_branch() == '2.6':
3352 fp = open('/proc/meminfo')
3353 lines = fp.readlines()
3358 if a[0] == 'MemTotal:':
3360 debug("memtotal" + memtotal)
3361 if int(memtotal) < 262144:
3362 minfree = int(memtotal) / 16
3365 debug("+ minfree ", minfree)
3366 sysctl("vm/min_free_kbytes", minfree)
3367 if config.single_socket:
3368 sysctl("socknal/typed", 0)
# Quadrics Elan tuning: write 1 to whichever punt-loops proc files exist
# and are writable (covers elan, elan3 and elan4 layouts).
3370 def sys_optimize_elan ():
3371 procfiles = ["/proc/elan/config/eventint_punt_loops",
3372 "/proc/qsnet/elan3/config/eventint_punt_loops",
3373 "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]
3375 if os.access(p, os.W_OK):
3376 run ("echo 1 > " + p)
# Set the portals debug mask.  The symbolic expression (e.g. "trace|net")
# is evaluated against ptldebug_names; eval is acceptable here because the
# input comes from the trusted local config / command line, not the network.
3378 def sys_set_ptldebug(ptldebug):
3380 ptldebug = config.ptldebug
3383 val = eval(ptldebug, ptldebug_names)
3384 val = "0x%x" % (val)
3385 sysctl('portals/debug', val)
3386 except NameError, e:
# Set the portals subsystem debug mask; same eval-against-name-table
# scheme as sys_set_ptldebug.
3389 def sys_set_subsystem(subsystem):
3390 if config.subsystem:
3391 subsystem = config.subsystem
3394 val = eval(subsystem, subsystem_names)
3395 val = "0x%x" % (val)
3396 sysctl('portals/subsystem_debug', val)
3397 except NameError, e:
# Raise a /proc/sys/net/core/* buffer limit to at least 'max'.
3400 def sys_set_netmem_max(path, max):
3401 debug("setting", path, "to at least", max)
3409 fp = open(path, 'w')
3410 fp.write('%d\n' %(max))
# Create the /dev/portals and /dev/obd character devices (misc major 10)
# if they are not already present.
3414 def sys_make_devices():
3415 if not os.access('/dev/portals', os.R_OK):
3416 run('mknod /dev/portals c 10 240')
3417 if not os.access('/dev/obd', os.R_OK):
3418 run('mknod /dev/obd c 10 241')
3421 # Add dir to the global PATH, if not already there.
# Append new_dir to $PATH unless it is already listed.
3422 def add_to_path(new_dir):
3423 syspath = string.split(os.environ['PATH'], ':')
3424 if new_dir in syspath:
3426 os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
# Default debug-dump location; prefers the /r chroot when it exists.
3428 def default_debug_path():
3429 path = '/tmp/lustre-log'
3430 if os.path.isdir('/r'):
# Default gdb module-script location; mirrors default_debug_path's
# preference for the /r chroot.
3435 def default_gdb_script():
3436 script = '/tmp/ogdb'
3437 if os.path.isdir('/r'):
3438 return '/r' + script
3443 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
3444 # ensure basic elements are in the system path
# Make sure the standard sbin/bin directories are all on $PATH.
3445 def sanitise_path():
3446 for dir in DEFAULT_PATH:
3449 # global hack for the --select handling
# Parse the --select arguments ("service=node" pairs, comma separated)
# into the global tgt_select mapping used by get_select().
3451 def init_select(args):
3452 # args = [service=nodeA,service2=nodeB service3=nodeC]
3455 list = string.split(arg, ',')
3457 srv, node = string.split(entry, '=')
3458 tgt_select[srv] = node
# Return the node explicitly selected for 'srv' via --select, if any.
3460 def get_select(srv):
3461 if tgt_select.has_key(srv):
3462 return tgt_select[srv]
3466 FLAG = Lustre.Options.FLAG
3467 PARAM = Lustre.Options.PARAM
3468 INTPARAM = Lustre.Options.INTPARAM
3469 PARAMLIST = Lustre.Options.PARAMLIST
3471 ('verbose,v', "Print system commands as they are run"),
3472 ('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM),
3473 ('config', "Cluster config name used for LDAP query", PARAM),
3474 ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
3475 ('node', "Load config for <nodename>", PARAM),
3476 ('cleanup,d', "Cleans up config. (Shutdown)"),
3477 ('force,f', "Forced unmounting and/or obd detach during cleanup",
3479 ('single_socket', "socknal option: only use one socket instead of bundle",
3481 ('failover',"""Used to shut down without saving state.
3482 This will allow this node to "give up" a service to a
3483 another node for failover purposes. This will not
3484 be a clean shutdown.""",
3486 ('gdb', """Prints message after creating gdb module script
3487 and sleeps for 5 seconds."""),
3488 ('noexec,n', """Prints the commands and steps that will be run for a
3489 config without executing them. This can used to check if a
3490 config file is doing what it should be doing"""),
3491 ('nomod', "Skip load/unload module step."),
3492 ('nosetup', "Skip device setup/cleanup step."),
3493 ('reformat', "Reformat all devices (without question)"),
3494 ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
3495 ('mountfsoptions', "Additional options for mount fs command line", PARAM),
3496 ('clientoptions', "Additional options for Lustre", PARAM),
3497 ('dump', "Dump the kernel debug log to file before portals is unloaded",
3499 ('write_conf', "Save all the client config information on mds."),
3500 ('record', "Write config information on mds."),
3501 ('record_log', "Name of config record log.", PARAM),
3502 ('record_device', "MDS device name that will record the config commands",
3504 ('root_squash', "MDS squash root to appointed uid",
3506 ('no_root_squash', "Don't squash root for appointed nid",
3508 ('minlevel', "Minimum level of services to configure/cleanup",
3510 ('maxlevel', """Maximum level of services to configure/cleanup
3511 Levels are aproximatly like:
3516 70 - mountpoint, echo_client, osc, mdc, lov""",
3518 ('lustre', """Base directory of lustre sources. This parameter will
3519 cause lconf to load modules from a source tree.""", PARAM),
3520 ('portals', """Portals source directory. If this is a relative path,
3521 then it is assumed to be relative to lustre. """, PARAM),
3522 ('timeout', "Set recovery timeout", INTPARAM),
3523 ('upcall', "Set both portals and lustre upcall script", PARAM),
3524 ('lustre_upcall', "Set lustre upcall script", PARAM),
3525 ('portals_upcall', "Set portals upcall script", PARAM),
3526 ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
3527 ('ptldebug', "Set the portals debug level", PARAM),
3528 ('subsystem', "Set the portals debug subsystem", PARAM),
3529 ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
3530 ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
3531 # Client recovery options
3532 ('recover', "Recover a device"),
3533 ('group', "The group of devices to configure or cleanup", PARAM),
3534 ('tgt_uuid', "The failed target (required for recovery)", PARAM),
3535 ('client_uuid', "The failed client (required for recovery)", PARAM),
3536 ('conn_uuid', "The failed connection (required for recovery)", PARAM),
3538 ('inactive', """The name of an inactive service, to be ignored during
3539 mounting (currently OST-only). Can be repeated.""",
# main(): parse options, seed the PRNG from /dev/urandom (time() seeding is
# bad for time-synchronized clusters), load the config from a file / URL /
# LDAP, validate its version, build the candidate node-name list, set up
# the lctl interface (optionally in dump or record mode), and hand off to
# doHost().  NOTE(review): the 'def main():' line is elided in this
# extraction.
3544 global lctl, config, toplustreDB, CONFIG_FILE, mod_manager
3546 # in the upcall this is set to SIG_IGN
3547 signal.signal(signal.SIGCHLD, signal.SIG_DFL)
3549 cl = Lustre.Options("lconf", "config.xml", lconf_options)
3551 config, args = cl.parse(sys.argv[1:])
3552 except Lustre.OptionError, e:
3556 setupModulePath(sys.argv[0])
3558 host = socket.gethostname()
3560 # the PRNG is normally seeded with time(), which is not so good for starting
3561 # time-synchronized clusters
3562 input = open('/dev/urandom', 'r')
3564 print 'Unable to open /dev/urandom!'
3566 seed = input.read(32)
3572 init_select(config.select)
3575 # allow config to be fetched via HTTP, but only with python2
3576 if sys.version[0] != '1' and args[0].startswith('http://'):
3579 config_file = urllib2.urlopen(args[0])
3580 except (urllib2.URLError, socket.error), err:
3581 if hasattr(err, 'args'):
3583 print "Could not access '%s': %s" %(args[0], err)
3585 elif not os.access(args[0], os.R_OK):
3586 print 'File not found or readable:', args[0]
3590 config_file = open(args[0], 'r')
3592 dom = xml.dom.minidom.parse(config_file)
3594 panic("%s does not appear to be a config file." % (args[0]))
3595 sys.exit(1) # make sure to die here, even in debug mode.
3597 CONFIG_FILE = args[0]
3598 lustreDB = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
# --config defaults to the XML filename with its .xml suffix stripped.
3599 if not config.config:
3600 config.config = os.path.basename(args[0])# use full path?
3601 if config.config[-4:] == '.xml':
3602 config.config = config.config[:-4]
3603 elif config.ldapurl:
3604 if not config.config:
3605 panic("--ldapurl requires --config name")
3606 dn = "config=%s,fs=lustre" % (config.config)
3607 lustreDB = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
3608 elif config.ptldebug or config.subsystem:
3609 sys_set_ptldebug(None)
3610 sys_set_subsystem(None)
3613 print 'Missing config file or ldap URL.'
3614 print 'see lconf --help for command summary'
3617 toplustreDB = lustreDB
3619 ver = lustreDB.get_version()
3621 panic("No version found in config data, please recreate.")
3622 if ver != Lustre.CONFIG_VERSION:
3623 panic("Config version", ver, "does not match lconf version",
3624 Lustre.CONFIG_VERSION)
# Candidate names for this host: explicit --node, else hostname and
# 'localhost' fallbacks.
3628 node_list.append(config.node)
3631 node_list.append(host)
3632 node_list.append('localhost')
3634 debug("configuring for host: ", node_list)
3637 config.debug_path = config.debug_path + '-' + host
3638 config.gdb_script = config.gdb_script + '-' + host
3640 lctl = LCTLInterface('lctl')
3642 if config.lctl_dump:
3643 lctl.use_save_file(config.lctl_dump)
3646 if not (config.record_device and config.record_log):
3647 panic("When recording, both --record_log and --record_device must be specified.")
3648 lctl.clear_log(config.record_device, config.record_log)
3649 lctl.record(config.record_device, config.record_log)
3651 # init module manager
3652 mod_manager = kmod_manager(config.lustre, config.portals)
3654 doHost(lustreDB, node_list)
3656 if not config.record:
3661 process_updates(lustreDB, config.record_device, config.record_log)
# Script entry point: run main(), translating the tool's exception types
# into exit status; cleanup errors are deferred and reported at the end.
3663 if __name__ == "__main__":
3666 except Lustre.LconfError, e:
3668 # traceback.print_exc(file=sys.stdout)
3670 except CommandError, e:
3674 if first_cleanup_error:
3675 sys.exit(first_cleanup_error)