# Copyright (C) 2002-2003 Cluster File Systems, Inc.
# Authors: Robert Read <rread@clusterfs.com>
#          Mike Shaver <shaver@clusterfs.com>
# This file is part of Lustre, http://www.lustre.org.
#
# Lustre is free software; you can redistribute it and/or
# modify it under the terms of version 2 of the GNU General Public
# License as published by the Free Software Foundation.
#
# Lustre is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Lustre; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# lconf - lustre configuration tool
#
# lconf is the main driver script for starting and stopping
# lustre filesystem services.
#
# Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import sys, getopt, types
29 import string, os, stat, popen2, socket, time, random, fcntl, select
30 import re, exceptions, signal, traceback
31 import xml.dom.minidom
33 if sys.version[0] == '1':
34 from FCNTL import F_GETFL, F_SETFL
36 from fcntl import F_GETFL, F_SETFL
38 PYMOD_DIR = "/usr/lib/lustre/python"
40 def development_mode():
41 base = os.path.dirname(sys.argv[0])
42 if os.access(base+"/Makefile", os.R_OK):
46 if development_mode():
47 sys.path.append('../utils')
49 sys.path.append(PYMOD_DIR)
55 DEFAULT_TCPBUF = 8388608
58 # Maximum number of devices to search for.
59 # (the /dev/loop* nodes need to be created beforehand)
60 MAX_LOOP_DEVICES = 256
61 PORTALS_DIR = 'portals'
63 # Needed to call lconf --record
66 # Please keep these in sync with the values in portals/kp30.h
78 "warning" : (1 << 10),
82 "portals" : (1 << 14),
84 "dlmtrace" : (1 << 16),
88 "rpctrace" : (1 << 20),
89 "vfstrace" : (1 << 21),
94 "undefined" : (1 << 0),
104 "portals" : (1 << 10),
105 "socknal" : (1 << 11),
106 "qswnal" : (1 << 12),
107 "pinger" : (1 << 13),
108 "filter" : (1 << 14),
114 "ptlrouter" : (1 << 20),
120 first_cleanup_error = 0
121 def cleanup_error(rc):
122 global first_cleanup_error
123 if not first_cleanup_error:
124 first_cleanup_error = rc
126 # ============================================================
127 # debugging and error funcs
129 def fixme(msg = "this feature"):
130 raise Lustre.LconfError, msg + ' not implmemented yet.'
133 msg = string.join(map(str,args))
134 if not config.noexec:
135 raise Lustre.LconfError(msg)
140 msg = string.join(map(str,args))
145 print string.strip(s)
149 msg = string.join(map(str,args))
152 # ack, python's builtin int() does not support '0x123' syntax.
153 # eval can do it, although what a hack!
157 return eval(s, {}, {})
160 except SyntaxError, e:
161 raise ValueError("not a number")
163 raise ValueError("not a number")
165 # ============================================================
166 # locally defined exceptions
167 class CommandError (exceptions.Exception):
168 def __init__(self, cmd_name, cmd_err, rc=None):
169 self.cmd_name = cmd_name
170 self.cmd_err = cmd_err
175 if type(self.cmd_err) == types.StringType:
177 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
179 print "! %s: %s" % (self.cmd_name, self.cmd_err)
180 elif type(self.cmd_err) == types.ListType:
182 print "! %s (error %d):" % (self.cmd_name, self.rc)
184 print "! %s:" % (self.cmd_name)
185 for s in self.cmd_err:
186 print "> %s" %(string.strip(s))
191 # ============================================================
192 # handle daemons, like the acceptor
194 """ Manage starting and stopping a daemon. Assumes daemon manages
195 it's own pid file. """
197 def __init__(self, cmd):
203 log(self.command, "already running.")
205 self.path = find_prog(self.command)
207 panic(self.command, "not found.")
208 ret, out = runcmd(self.path +' '+ self.command_line())
210 raise CommandError(self.path, out, ret)
214 pid = self.read_pidfile()
216 log ("killing process", pid)
218 #time.sleep(1) # let daemon die
220 log("unable to kill", self.command, e)
222 log("unable to kill", self.command)
225 pid = self.read_pidfile()
235 def read_pidfile(self):
237 fp = open(self.pidfile(), 'r')
244 def clean_pidfile(self):
245 """ Remove a stale pidfile """
246 log("removing stale pidfile:", self.pidfile())
248 os.unlink(self.pidfile())
250 log(self.pidfile(), e)
252 class AcceptorHandler(DaemonHandler):
253 def __init__(self, port, net_type, send_mem, recv_mem, irq_aff):
254 DaemonHandler.__init__(self, "acceptor")
257 self.send_mem = send_mem
258 self.recv_mem = recv_mem
261 self.flags = self.flags + ' -i'
264 return "/var/run/%s-%d.pid" % (self.command, self.port)
266 def command_line(self):
267 return string.join(map(str,('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port)))
271 # start the acceptors
273 if config.lctl_dump or config.record:
275 for port in acceptors.keys():
276 daemon = acceptors[port]
277 if not daemon.running():
280 def run_one_acceptor(port):
281 if config.lctl_dump or config.record:
283 if acceptors.has_key(port):
284 daemon = acceptors[port]
285 if not daemon.running():
288 panic("run_one_acceptor: No acceptor defined for port:", port)
290 def stop_acceptor(port):
291 if acceptors.has_key(port):
292 daemon = acceptors[port]
297 # ============================================================
298 # handle lctl interface
301 Manage communication with lctl
304 def __init__(self, cmd):
306 Initialize close by finding the lctl binary.
308 self.lctl = find_prog(cmd)
310 self.record_device = ''
313 debug('! lctl not found')
316 raise CommandError('lctl', "unable to find lctl binary.")
318 def use_save_file(self, file):
319 self.save_file = file
321 def record(self, dev_name, logname):
322 log("Recording log", logname, "on", dev_name)
323 self.record_device = dev_name
324 self.record_log = logname
326 def end_record(self):
327 log("End recording log", self.record_log, "on", self.record_device)
328 self.record_device = None
329 self.record_log = None
331 def set_nonblock(self, fd):
332 fl = fcntl.fcntl(fd, F_GETFL)
333 fcntl.fcntl(fd, F_SETFL, fl | os.O_NDELAY)
338 the cmds are written to stdin of lctl
339 lctl doesn't return errors when run in script mode, so
341 should modify command line to accept multiple commands, or
342 create complex command line options
346 cmds = '\n dump ' + self.save_file + '\n' + cmds
347 elif self.record_device:
351 %s""" % (self.record_device, self.record_log, cmds)
353 debug("+", cmd_line, cmds)
354 if config.noexec: return (0, [])
356 child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
357 child.tochild.write(cmds + "\n")
358 child.tochild.close()
359 # print "LCTL:", cmds
361 # From "Python Cookbook" from O'Reilly
362 outfile = child.fromchild
363 outfd = outfile.fileno()
364 self.set_nonblock(outfd)
365 errfile = child.childerr
366 errfd = errfile.fileno()
367 self.set_nonblock(errfd)
369 outdata = errdata = ''
372 ready = select.select([outfd,errfd],[],[]) # Wait for input
373 if outfd in ready[0]:
374 outchunk = outfile.read()
375 if outchunk == '': outeof = 1
376 outdata = outdata + outchunk
377 if errfd in ready[0]:
378 errchunk = errfile.read()
379 if errchunk == '': erreof = 1
380 errdata = errdata + errchunk
381 if outeof and erreof: break
382 # end of "borrowed" code
385 if os.WIFEXITED(ret):
386 rc = os.WEXITSTATUS(ret)
389 if rc or len(errdata):
390 raise CommandError(self.lctl, errdata, rc)
393 def runcmd(self, *args):
395 run lctl using the command line
397 cmd = string.join(map(str,args))
398 debug("+", self.lctl, cmd)
399 rc, out = run(self.lctl, cmd)
401 raise CommandError(self.lctl, out, rc)
405 def clear_log(self, dev, log):
406 """ clear an existing log """
411 quit """ % (dev, log)
414 def network(self, net, nid):
419 quit """ % (net, nid)
422 # create a new connection
423 def add_uuid(self, net_type, uuid, nid):
424 cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type)
427 def add_autoconn(self, net_type, send_mem, recv_mem, nid, hostaddr,
429 if net_type in ('tcp',) and not config.lctl_dump:
434 add_autoconn %s %s %d %s
438 nid, hostaddr, port, flags )
441 def connect(self, srv):
442 self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
443 if srv.net_type in ('tcp',) and not config.lctl_dump:
447 self.add_autoconn(srv.net_type, srv.send_mem, srv.recv_mem,
448 srv.nid, srv.hostaddr, srv.port, flags)
451 def recover(self, dev_name, new_conn):
454 recover %s""" %(dev_name, new_conn)
457 # add a route to a range
458 def add_route(self, net, gw, lo, hi):
466 except CommandError, e:
470 def del_route(self, net, gw, lo, hi):
475 quit """ % (net, gw, lo, hi)
478 # add a route to a host
479 def add_route_host(self, net, uuid, gw, tgt):
480 self.add_uuid(net, uuid, tgt)
488 except CommandError, e:
492 # add a route to a range
493 def del_route_host(self, net, uuid, gw, tgt):
499 quit """ % (net, gw, tgt)
503 def del_autoconn(self, net_type, nid, hostaddr):
504 if net_type in ('tcp',) and not config.lctl_dump:
513 # disconnect one connection
514 def disconnect(self, srv):
515 self.del_uuid(srv.nid_uuid)
516 if srv.net_type in ('tcp',) and not config.lctl_dump:
517 self.del_autoconn(srv.net_type, srv.nid, srv.hostaddr)
519 def del_uuid(self, uuid):
527 def disconnectAll(self, net):
535 def attach(self, type, name, uuid):
538 quit""" % (type, name, uuid)
541 def setup(self, name, setup = ""):
545 quit""" % (name, setup)
549 # create a new device with lctl
550 def newdev(self, type, name, uuid, setup = ""):
551 self.attach(type, name, uuid);
553 self.setup(name, setup)
554 except CommandError, e:
555 self.cleanup(name, uuid, 0)
560 def cleanup(self, name, uuid, force, failover = 0):
561 if failover: force = 1
567 quit""" % (name, ('', 'force')[force],
568 ('', 'failover')[failover])
572 def lov_setup(self, name, uuid, desc_uuid, mdsuuid, stripe_cnt,
573 stripe_sz, stripe_off,
577 lov_setup %s %d %d %d %s %s
578 quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off,
582 def lmv_setup(self, name, uuid, desc_uuid, devlist):
586 quit""" % (name, uuid, desc_uuid, devlist)
589 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off,
593 lov_setconfig %s %d %d %d %s %s
594 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
598 def dump(self, dump_file):
601 quit""" % (dump_file)
604 # get list of devices
605 def device_list(self):
606 devices = '/proc/fs/lustre/devices'
608 if os.access(devices, os.R_OK):
610 fp = open(devices, 'r')
618 def lustre_version(self):
619 rc, out = self.runcmd('version')
623 def mount_option(self, profile, osc, mdc):
625 mount_option %s %s %s
626 quit""" % (profile, osc, mdc)
629 # delete mount options
630 def del_mount_option(self, profile):
636 def set_timeout(self, timeout):
642 # delete mount options
643 def set_lustre_upcall(self, upcall):
648 # ============================================================
649 # Various system-level functions
650 # (ideally moved to their own module)
652 # Run a command and return the output and status.
653 # stderr is sent to /dev/null, could use popen3 to
654 # save it if necessary
657 if config.noexec: return (0, [])
658 f = os.popen(cmd + ' 2>&1')
668 cmd = string.join(map(str,args))
671 # Run a command in the background.
672 def run_daemon(*args):
673 cmd = string.join(map(str,args))
675 if config.noexec: return 0
676 f = os.popen(cmd + ' 2>&1')
684 # Determine full path to use for an external command
685 # searches dirname(argv[0]) first, then PATH
687 syspath = string.split(os.environ['PATH'], ':')
688 cmdpath = os.path.dirname(sys.argv[0])
689 syspath.insert(0, cmdpath);
691 syspath.insert(0, os.path.join(config.portals, 'utils/'))
693 prog = os.path.join(d,cmd)
694 if os.access(prog, os.X_OK):
698 # Recursively look for file starting at base dir
699 def do_find_file(base, mod):
700 fullname = os.path.join(base, mod)
701 if os.access(fullname, os.R_OK):
703 for d in os.listdir(base):
704 dir = os.path.join(base,d)
705 if os.path.isdir(dir):
706 module = do_find_file(dir, mod)
710 def find_module(src_dir, dev_dir, modname):
711 modbase = src_dir +'/'+ dev_dir +'/'+ modname
712 for modext in '.ko', '.o':
713 module = modbase + modext
715 if os.access(module, os.R_OK):
721 # is the path a block device?
728 return stat.S_ISBLK(s[stat.ST_MODE])
730 # build fs according to type
732 def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
738 panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
740 # devsize is in 1k, and fs block count is in 4k
741 block_cnt = devsize/4
743 if fstype in ('ext3', 'extN', 'ldiskfs'):
744 # ext3 journal size is in megabytes
747 if not is_block(dev):
748 ret, out = runcmd("ls -l %s" %dev)
749 devsize = int(string.split(out[0])[4]) / 1024
751 ret, out = runcmd("sfdisk -s %s" %dev)
752 devsize = int(out[0])
753 if devsize > 1024 * 1024:
754 jsize = ((devsize / 102400) * 4)
757 if jsize: jopt = "-J size=%d" %(jsize,)
758 if isize: iopt = "-I %d" %(isize,)
759 mkfs = 'mkfs.ext2 -j -b 4096 '
760 if not isblock or config.force:
762 elif fstype == 'reiserfs':
763 # reiserfs journal size is in blocks
764 if jsize: jopt = "--journal_size %d" %(jsize,)
765 mkfs = 'mkreiserfs -ff'
767 panic('unsupported fs type: ', fstype)
769 if config.mkfsoptions != None:
770 mkfs = mkfs + ' ' + config.mkfsoptions
771 if mkfsoptions != None:
772 mkfs = mkfs + ' ' + mkfsoptions
773 (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt)
775 panic("Unable to build fs:", dev, string.join(out))
776 # enable hash tree indexing on fsswe
777 if fstype in ('ext3', 'extN', 'ldiskfs'):
778 htree = 'echo "feature FEATURE_C5" | debugfs -w'
779 (ret, out) = run (htree, dev)
781 panic("Unable to enable htree:", dev)
783 # some systems use /dev/loopN, some /dev/loop/N
787 if not os.access(loop + str(0), os.R_OK):
789 if not os.access(loop + str(0), os.R_OK):
790 panic ("can't access loop devices")
793 # find loop device assigned to the file
794 def find_assigned_loop(file):
796 for n in xrange(0, MAX_LOOP_DEVICES):
798 if os.access(dev, os.R_OK):
799 (stat, out) = run('losetup', dev)
800 if out and stat == 0:
801 m = re.search(r'\((.*)\)', out[0])
802 if m and file == m.group(1):
808 # create file if necessary and assign the first free loop device
809 def init_loop(file, size, fstype, journal_size, inode_size,
810 mkfsoptions, reformat, autoformat, backfstype, backfile):
813 realfstype = backfstype
814 if is_block(backfile):
815 if reformat or (need_format(realfstype, backfile) and autoformat == 'yes'):
816 mkfs(realfile, size, realfstype, journal_size, inode_size, mkfsoptions, isblock=0)
822 dev = find_assigned_loop(realfile)
824 print 'WARNING file:', realfile, 'already mapped to', dev
827 if reformat or not os.access(realfile, os.R_OK | os.W_OK):
829 panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (realfile, size))
830 (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, realfile))
832 panic("Unable to create backing store:", realfile)
834 mkfs(realfile, size, realfstype, journal_size, inode_size,
835 mkfsoptions, isblock=0)
838 # find next free loop
839 for n in xrange(0, MAX_LOOP_DEVICES):
841 if os.access(dev, os.R_OK):
842 (stat, out) = run('losetup', dev)
844 run('losetup', dev, realfile)
847 print "out of loop devices"
849 print "out of loop devices"
852 # undo loop assignment
853 def clean_loop(file):
854 dev = find_assigned_loop(file)
856 ret, out = run('losetup -d', dev)
858 log('unable to clean loop device:', dev, 'for file:', file)
861 # determine if dev is formatted as a <fstype> filesystem
862 def need_format(fstype, dev):
863 # FIXME don't know how to implement this
866 # initialize a block device if needed
867 def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
868 inode_size, mkfsoptions, backfstype, backdev):
872 if fstype == 'smfs' or not is_block(dev):
873 dev = init_loop(dev, size, fstype, journal_size, inode_size,
874 mkfsoptions, reformat, autoformat, backfstype, backdev)
875 elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
876 mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
879 # panic("device:", dev,
880 # "not prepared, and autoformat is not set.\n",
881 # "Rerun with --reformat option to format ALL filesystems")
886 """lookup IP address for an interface"""
887 rc, out = run("/sbin/ifconfig", iface)
890 addr = string.split(out[1])[1]
891 ip = string.split(addr, ':')[1]
894 def def_mount_options(fstype, target):
895 """returns deafult mount options for passed fstype and target (mds, ost)"""
896 if fstype == 'ext3' or fstype == 'ldiskfs':
897 mountfsoptions = "errors=remount-ro"
898 if target == 'ost' and sys_get_branch() == '2.4':
899 mountfsoptions = "%s,asyncdel" % (mountfsoptions)
900 return mountfsoptions
903 def sys_get_elan_position_file():
904 procfiles = ["/proc/elan/device0/position",
905 "/proc/qsnet/elan4/device0/position",
906 "/proc/qsnet/elan3/device0/position"]
908 if os.access(p, os.R_OK):
912 def sys_get_local_nid(net_type, wildcard, cluster_id):
913 """Return the local nid."""
915 if sys_get_elan_position_file():
916 local = sys_get_local_address('elan', '*', cluster_id)
918 local = sys_get_local_address(net_type, wildcard, cluster_id)
921 def sys_get_local_address(net_type, wildcard, cluster_id):
922 """Return the local address for the network type."""
924 if net_type in ('tcp',):
926 iface, star = string.split(wildcard, ':')
927 local = if2addr(iface)
929 panic ("unable to determine ip for:", wildcard)
931 host = socket.gethostname()
932 local = socket.gethostbyname(host)
933 elif net_type == 'elan':
934 # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()'
935 f = sys_get_elan_position_file()
937 panic ("unable to determine local Elan ID")
940 lines = fp.readlines()
948 nid = my_int(cluster_id) + my_int(elan_id)
950 except ValueError, e:
954 elif net_type == 'gm':
955 fixme("automatic local address for GM")
959 def sys_get_branch():
960 """Returns kernel release"""
962 fp = open('/proc/sys/kernel/osrelease')
963 lines = fp.readlines()
967 version = string.split(l)
968 a = string.split(version[0], '.')
969 return a[0] + '.' + a[1]
975 def mod_loaded(modname):
976 """Check if a module is already loaded. Look in /proc/modules for it."""
978 fp = open('/proc/modules')
979 lines = fp.readlines()
981 # please forgive my tired fingers for this one
982 ret = filter(lambda word, mod=modname: word == mod,
983 map(lambda line: string.split(line)[0], lines))
988 # XXX: instead of device_list, ask for $name and see what we get
989 def is_prepared(name):
990 """Return true if a device exists for the name"""
993 if (config.noexec or config.record) and config.cleanup:
996 # expect this format:
997 # 1 UP ldlm ldlm ldlm_UUID 2
998 out = lctl.device_list()
1000 if name == string.split(s)[3]:
1002 except CommandError, e:
1006 def is_network_prepared():
1007 """If the any device exists, then assume that all networking
1008 has been configured"""
1009 out = lctl.device_list()
1012 def fs_is_mounted(path):
1013 """Return true if path is a mounted lustre filesystem"""
1015 fp = open('/proc/mounts')
1016 lines = fp.readlines()
1020 if a[1] == path and a[2] == 'lustre_lite':
1028 """Manage kernel modules"""
1029 def __init__(self, lustre_dir, portals_dir):
1030 self.lustre_dir = lustre_dir
1031 self.portals_dir = portals_dir
1032 self.kmodule_list = []
1034 def add_portals_module(self, dev_dir, modname):
1035 """Append a module to list of modules to load."""
1036 self.kmodule_list.append((self.portals_dir, dev_dir, modname))
1038 def add_lustre_module(self, dev_dir, modname):
1039 """Append a module to list of modules to load."""
1040 self.kmodule_list.append((self.lustre_dir, dev_dir, modname))
1042 def load_module(self):
1043 """Load all the modules in the list in the order they appear."""
1044 for src_dir, dev_dir, mod in self.kmodule_list:
1045 if mod_loaded(mod) and not config.noexec:
1047 log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
1049 module = find_module(src_dir, dev_dir, mod)
1051 panic('module not found:', mod)
1052 (rc, out) = run('/sbin/insmod', module)
1054 raise CommandError('insmod', out, rc)
1056 (rc, out) = run('/sbin/modprobe', mod)
1058 raise CommandError('modprobe', out, rc)
1060 def cleanup_module(self):
1061 """Unload the modules in the list in reverse order."""
1062 rev = self.kmodule_list
1064 for src_dir, dev_dir, mod in rev:
1065 if not mod_loaded(mod) and not config.noexec:
1068 if mod == 'portals' and config.dump:
1069 lctl.dump(config.dump)
1070 log('unloading module:', mod)
1071 (rc, out) = run('/sbin/rmmod', mod)
1073 log('! unable to unload module:', mod)
1076 # ============================================================
1077 # Classes to prepare and cleanup the various objects
1080 """ Base class for the rest of the modules. The default cleanup method is
1081 defined here, as well as some utilitiy funcs.
1083 def __init__(self, module_name, db):
1085 self.module_name = module_name
1086 self.name = self.db.getName()
1087 self.uuid = self.db.getUUID()
1090 self.kmod = kmod(config.lustre, config.portals)
1092 def info(self, *args):
1093 msg = string.join(map(str,args))
1094 print self.module_name + ":", self.name, self.uuid, msg
1097 """ default cleanup, used for most modules """
1100 lctl.cleanup(self.name, self.uuid, config.force)
1101 except CommandError, e:
1102 log(self.module_name, "cleanup failed: ", self.name)
1106 def add_portals_module(self, dev_dir, modname):
1107 """Append a module to list of modules to load."""
1108 self.kmod.add_portals_module(dev_dir, modname)
1110 def add_lustre_module(self, dev_dir, modname):
1111 """Append a module to list of modules to load."""
1112 self.kmod.add_lustre_module(dev_dir, modname)
1114 def load_module(self):
1115 """Load all the modules in the list in the order they appear."""
1116 self.kmod.load_module()
1118 def cleanup_module(self):
1119 """Unload the modules in the list in reverse order."""
1120 if self.safe_to_clean():
1121 self.kmod.cleanup_module()
1123 def safe_to_clean(self):
1126 def safe_to_clean_modules(self):
1127 return self.safe_to_clean()
1129 class Network(Module):
1130 def __init__(self,db):
1131 Module.__init__(self, 'NETWORK', db)
1132 self.net_type = self.db.get_val('nettype')
1133 self.nid = self.db.get_val('nid', '*')
1134 self.cluster_id = self.db.get_val('clusterid', "0")
1135 self.port = self.db.get_val_int('port', 0)
1136 self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
1137 self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
1138 self.irq_affinity = self.db.get_val_int('irqaffinity', 0)
1141 self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
1143 panic("unable to set nid for", self.net_type, self.nid, cluster_id)
1144 self.generic_nid = 1
1145 debug("nid:", self.nid)
1147 self.generic_nid = 0
1149 self.nid_uuid = self.nid_to_uuid(self.nid)
1151 self.hostaddr = self.db.get_val('hostaddr', self.nid)
1152 if '*' in self.hostaddr:
1153 self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id)
1154 if not self.hostaddr:
1155 panic("unable to set hostaddr for", self.net_type, self.hostaddr, self.cluster_id)
1156 debug("hostaddr:", self.hostaddr)
1158 self.add_portals_module("libcfs", 'libcfs')
1159 self.add_portals_module("portals", 'portals')
1160 if node_needs_router():
1161 self.add_portals_module("router", 'kptlrouter')
1162 if self.net_type == 'tcp':
1163 self.add_portals_module("knals/socknal", 'ksocknal')
1164 if self.net_type == 'elan':
1165 self.add_portals_module("knals/qswnal", 'kqswnal')
1166 if self.net_type == 'gm':
1167 self.add_portals_module("knals/gmnal", 'kgmnal')
1169 def nid_to_uuid(self, nid):
1170 return "NID_%s_UUID" %(nid,)
1173 if is_network_prepared():
1175 self.info(self.net_type, self.nid, self.port)
1176 if not (config.record and self.generic_nid):
1177 lctl.network(self.net_type, self.nid)
1178 if self.net_type == 'tcp':
1180 if self.net_type == 'elan':
1182 if self.port and node_is_router():
1183 run_one_acceptor(self.port)
1184 self.connect_peer_gateways()
1186 def connect_peer_gateways(self):
1187 for router in self.db.lookup_class('node'):
1188 if router.get_val_int('router', 0):
1189 for netuuid in router.get_networks():
1190 net = self.db.lookup(netuuid)
1192 if (gw.cluster_id == self.cluster_id and
1193 gw.net_type == self.net_type):
1194 if gw.nid != self.nid:
1197 def disconnect_peer_gateways(self):
1198 for router in self.db.lookup_class('node'):
1199 if router.get_val_int('router', 0):
1200 for netuuid in router.get_networks():
1201 net = self.db.lookup(netuuid)
1203 if (gw.cluster_id == self.cluster_id and
1204 gw.net_type == self.net_type):
1205 if gw.nid != self.nid:
1208 except CommandError, e:
1209 print "disconnect failed: ", self.name
1213 def safe_to_clean(self):
1214 return not is_network_prepared()
1217 self.info(self.net_type, self.nid, self.port)
1219 stop_acceptor(self.port)
1220 if node_is_router():
1221 self.disconnect_peer_gateways()
1223 def correct_level(self, level, op=None):
1226 class RouteTable(Module):
1227 def __init__(self,db):
1228 Module.__init__(self, 'ROUTES', db)
1230 def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
1232 # only setup connections for tcp NALs
1234 if not net_type in ('tcp',):
1237 # connect to target if route is to single node and this node is the gw
1238 if lo == hi and local_interface(net_type, gw_cluster_id, gw):
1239 if not local_cluster(net_type, tgt_cluster_id):
1240 panic("target", lo, " not on the local cluster")
1241 srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
1242 # connect to gateway if this node is not the gw
1243 elif (local_cluster(net_type, gw_cluster_id)
1244 and not local_interface(net_type, gw_cluster_id, gw)):
1245 srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
1250 panic("no server for nid", lo)
1253 return Network(srvdb)
1256 if is_network_prepared():
1259 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1260 lctl.add_route(net_type, gw, lo, hi)
1261 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1265 def safe_to_clean(self):
1266 return not is_network_prepared()
1269 if is_network_prepared():
1270 # the network is still being used, don't clean it up
1272 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1273 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1276 lctl.disconnect(srv)
1277 except CommandError, e:
1278 print "disconnect failed: ", self.name
1283 lctl.del_route(net_type, gw, lo, hi)
1284 except CommandError, e:
1285 print "del_route failed: ", self.name
1289 class Management(Module):
1290 def __init__(self, db):
1291 Module.__init__(self, 'MGMT', db)
1292 self.add_lustre_module('lvfs', 'lvfs')
1293 self.add_lustre_module('obdclass', 'obdclass')
1294 self.add_lustre_module('ptlrpc', 'ptlrpc')
1295 self.add_lustre_module('mgmt', 'mgmt_svc')
1298 if is_prepared(self.name):
1301 lctl.newdev("mgmt", self.name, self.uuid)
1303 def safe_to_clean(self):
1307 if is_prepared(self.name):
1308 Module.cleanup(self)
1310 def correct_level(self, level, op=None):
1313 # This is only needed to load the modules; the LDLM device
1314 # is now created automatically.
1316 def __init__(self,db):
1317 Module.__init__(self, 'LDLM', db)
1318 self.add_lustre_module('lvfs', 'lvfs')
1319 self.add_lustre_module('obdclass', 'obdclass')
1320 self.add_lustre_module('ptlrpc', 'ptlrpc')
1328 def correct_level(self, level, op=None):
1333 def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
1334 Module.__init__(self, 'LOV', db)
1335 if name_override != None:
1336 self.name = "lov_%s" % name_override
1337 self.add_lustre_module('lov', 'lov')
1338 self.mds_uuid = self.db.get_first_ref('mds')
1339 self.stripe_sz = self.db.get_val_int('stripesize', 1048576)
1340 self.stripe_off = self.db.get_val_int('stripeoffset', 0)
1341 self.pattern = self.db.get_val_int('stripepattern', 0)
1342 self.devlist = self.db.get_refs('obd')
1343 self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
1345 self.desc_uuid = self.uuid
1346 self.uuid = generate_client_uuid(self.name)
1347 self.fs_name = fs_name
1349 self.config_only = 1
1351 self.config_only = None
1352 mds= self.db.lookup(self.mds_uuid)
1353 self.mds_name = mds.getName()
1354 for obd_uuid in self.devlist:
1355 obd = self.db.lookup(obd_uuid)
1356 osc = get_osc(obd, self.uuid, fs_name)
1358 self.osclist.append(osc)
1360 panic('osc not found:', obd_uuid)
1366 if is_prepared(self.name):
1368 if self.config_only:
1369 panic("Can't prepare config_only LOV ", self.name)
1371 for osc in self.osclist:
1373 # Only ignore connect failures with --force, which
1374 # isn't implemented here yet.
1375 osc.prepare(ignore_connect_failure=0)
1376 except CommandError, e:
1377 print "Error preparing OSC %s\n" % osc.uuid
1379 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
1380 self.stripe_off, self.pattern, self.devlist, self.mds_name)
1381 lctl.lov_setup(self.name, self.uuid,
1382 self.desc_uuid, self.mds_name, self.stripe_cnt,
1383 self.stripe_sz, self.stripe_off, self.pattern,
1384 string.join(self.devlist))
1387 if is_prepared(self.name):
1388 Module.cleanup(self)
1389 if self.config_only:
1390 panic("Can't clean up config_only LOV ", self.name)
1391 for osc in self.osclist:
1394 def load_module(self):
1395 if self.config_only:
1396 panic("Can't load modules for config_only LOV ", self.name)
1397 for osc in self.osclist:
1400 Module.load_module(self)
1402 def cleanup_module(self):
1403 if self.config_only:
1404 panic("Can't cleanup modules for config_only LOV ", self.name)
1405 Module.cleanup_module(self)
1406 for osc in self.osclist:
1407 osc.cleanup_module()
1410 def correct_level(self, level, op=None):
1414 def __init__(self, db, uuid, fs_name, name_override = None):
1415 Module.__init__(self, 'LMV', db)
1416 if name_override != None:
1417 self.name = "lmv_%s" % name_override
1418 self.add_lustre_module('lmv', 'lmv')
1419 self.devlist = self.db.get_refs('mds')
1421 self.desc_uuid = self.uuid
1423 self.fs_name = fs_name
1424 for mds_uuid in self.devlist:
1425 mds = self.db.lookup(mds_uuid)
1427 panic("MDS not found!")
1428 mdc = MDC(mds, self.uuid, fs_name)
1430 self.mdclist.append(mdc)
1432 panic('mdc not found:', mds_uuid)
1435 if is_prepared(self.name):
1437 for mdc in self.mdclist:
1439 # Only ignore connect failures with --force, which
1440 # isn't implemented here yet.
1441 mdc.prepare(ignore_connect_failure=0)
1442 except CommandError, e:
1443 print "Error preparing LMV %s\n" % mdc.uuid
1445 lctl.lmv_setup(self.name, self.uuid, self.desc_uuid,
1446 string.join(self.devlist))
1449 for mdc in self.mdclist:
1451 if is_prepared(self.name):
1452 Module.cleanup(self)
1454 def load_module(self):
1455 for mdc in self.mdclist:
1458 Module.load_module(self)
1460 def cleanup_module(self):
1461 Module.cleanup_module(self)
1462 for mds in self.mdclist:
1463 mdc.cleanup_module()
1466 def correct_level(self, level, op=None):
# MDSDEV: metadata server device driver.  Handles formatting/mounting the MDS
# backing filesystem, registering the "mds"/"mdt" obd devices with lctl,
# recording client config logs (write_conf), and teardown.
# NOTE(review): numbered listing with gaps — else:/try: lines and some method
# headers are missing from this view; do not treat the visible text as runnable.
1469 class MDSDEV(Module):
1470 def __init__(self,db):
1471 Module.__init__(self, 'MDSDEV', db)
1472 self.devpath = self.db.get_val('devpath','')
1473 self.backdevpath = self.db.get_val('backdevpath','')
1474 self.size = self.db.get_val_int('devsize', 0)
1475 self.journal_size = self.db.get_val_int('journalsize', 0)
1476 self.fstype = self.db.get_val('fstype', '')
1477 self.backfstype = self.db.get_val('backfstype', '')
1478 self.nspath = self.db.get_val('nspath', '')
1479 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1480 self.mountfsoptions = self.db.get_val('mountfsoptions', '')
1481 self.cachetype = self.db.get_val('cachetype', '')
1482 # overwrite the orignal MDSDEV name and uuid with the MDS name and uuid
1483 target_uuid = self.db.get_first_ref('target')
1484 mds = self.db.lookup(target_uuid)
1485 self.name = mds.getName()
1486 self.filesystem_uuids = mds.get_refs('filesystem')
1489 self.master_mds = ""
# If the MDS has no filesystem refs of its own, fall back to the LMV's refs.
1490 if not self.filesystem_uuids:
1491 self.lmv_uuid = self.db.get_first_ref('lmv')
1492 if not self.lmv_uuid:
# NOTE(review): message says "lvm" but the missing reference is the LMV uuid —
# probable typo in the original ("lmv").
1493 panic("ALERT: can't find lvm uuid")
1495 self.lmv = self.db.lookup(self.lmv_uuid)
1497 self.filesystem_uuids = self.lmv.get_refs('filesystem')
1498 self.master_mds = self.lmv_uuid
1499 # FIXME: if fstype not set, then determine based on kernel version
1500 self.format = self.db.get_val('autoformat', "no")
1501 if mds.get_val('failover', 0):
1502 self.failover_mds = 'f'
1504 self.failover_mds = 'n'
1505 active_uuid = get_active_target(mds)
1507 panic("No target device found:", target_uuid)
1508 if active_uuid == self.uuid:
1512 if self.active and config.group and config.group != mds.get_val('group'):
# inode_size: if not set explicitly, derive it from the LOV default stripe
# count (wider stripes need larger inodes to hold the EA/striping data).
1515 self.inode_size = self.db.get_val_int('inodesize', 0)
1516 if self.inode_size == 0:
1517 # find the LOV for this MDS
1518 lovconfig_uuid = mds.get_first_ref('lovconfig')
1519 if not lovconfig_uuid:
1520 if not self.lmv_uuid:
1521 panic("No LOV found for lovconfig ", lovconfig.name)
1524 panic("No LMV initialized and not lovconfig_uuid found")
1526 lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
1527 lovconfig = self.lmv.lookup(lovconfig_uuid)
1528 lov_uuid = lovconfig.get_first_ref('lov')
1530 panic("No LOV found for lovconfig ", lovconfig.name)
1532 lovconfig = mds.lookup(lovconfig_uuid)
1533 lov_uuid = lovconfig.get_first_ref('lov')
1535 panic("No LOV found for lovconfig ", lovconfig.name)
1538 lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
1539 lovconfig = self.lmv.lookup(lovconfig_uuid)
1540 lov_uuid = lovconfig.get_first_ref('lov')
# config_only=1: build the LOV object for inspection, do not set it up.
1542 lov = LOV(self.db.lookup(lov_uuid), lov_uuid, 'FS_name', config_only = 1)
1544 # default stripe count controls default inode_size
1545 stripe_count = lov.stripe_cnt
1546 if stripe_count > 77:
1547 self.inode_size = 4096
1548 elif stripe_count > 35:
1549 self.inode_size = 2048
1550 elif stripe_count > 13:
1551 self.inode_size = 1024
1552 elif stripe_count > 3:
1553 self.inode_size = 512
1555 self.inode_size = 256
1557 self.target_dev_uuid = self.uuid
1558 self.uuid = target_uuid
# NOTE(review): the generated client uuid on the next line is immediately
# overwritten by the fixed "<name>_lmv_UUID" string — the first assignment is
# dead; presumably intentional for a stable LMV client uuid, but verify.
1561 client_uuid = generate_client_uuid(self.name)
1562 client_uuid = self.name + "_lmv_" + "UUID"
1563 self.master = LMV(self.db.lookup(self.lmv_uuid), client_uuid, self.name, self.name)
1564 self.master_mds = self.master.name
# Kernel modules needed by an MDS node, loaded in dependency order.
1567 self.add_lustre_module('mdc', 'mdc')
1568 self.add_lustre_module('osc', 'osc')
1569 self.add_lustre_module('lov', 'lov')
1570 self.add_lustre_module('lmv', 'lmv')
1571 self.add_lustre_module('ost', 'ost')
1572 self.add_lustre_module('mds', 'mds')
1574 if self.fstype == 'smfs':
1575 self.add_lustre_module('smfs', 'smfs')
1577 if self.fstype == 'ldiskfs':
1578 self.add_lustre_module('ldiskfs', 'ldiskfs')
1581 self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
1583 # if fstype is smfs, then we should also take care about backing
1585 if self.fstype == 'smfs':
1586 self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.backfstype))
1588 def load_module(self):
1590 Module.load_module(self)
1593 if is_prepared(self.name):
1596 debug(self.uuid, "not active")
1599 # run write_conf automatically, if --reformat used
1601 self.info(self.devpath, self.fstype, self.size, self.format)
1605 self.master.prepare()
1606 # never reformat here
1607 blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
1608 self.format, self.journal_size, self.inode_size,
1609 self.mkfsoptions, self.backfstype, self.backdevpath)
# The generic MDT device is shared by all MDSes on the node; create it once.
1611 if not is_prepared('MDT'):
1612 lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
# Build mount options: defaults for the fstype, then --mountfsoptions from the
# command line, then per-device options from the config, comma-joined.
1614 mountfsoptions = def_mount_options(self.fstype, 'mds')
1616 if config.mountfsoptions:
1618 mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
1620 mountfsoptions = config.mountfsoptions
1621 if self.mountfsoptions:
1622 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1624 if self.mountfsoptions:
1626 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1628 mountfsoptions = self.mountfsoptions
# smfs mounts by type name and passes the real backing fs/dev via options.
1630 if self.fstype == 'smfs':
1631 realdev = self.fstype
1634 mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
1638 mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
1643 print 'MDS mount options: ' + mountfsoptions
# 'dumb' is a placeholder recognized and skipped by the MDS kernel code.
1645 if not self.master_mds:
1646 self.master_mds = 'dumb'
1647 if not self.cachetype:
1648 self.cachetype = 'dumb'
1649 lctl.newdev("mds", self.name, self.uuid,
1650 setup ="%s %s %s %s %s %s" %(realdev, self.fstype,
1651 self.name, mountfsoptions,
1652 self.master_mds, self.cachetype))
1653 except CommandError, e:
1655 panic("MDS is missing the config log. Need to run " +
1656 "lconf --write_conf.")
# write_conf: format/mount the MDS and record the client configuration logs
# (per-filesystem mount logs plus per-client-node setup/cleanup logs) on it.
1660 def write_conf(self):
1661 if is_prepared(self.name):
1663 self.info(self.devpath, self.fstype, self.format)
1665 blkdev = block_dev(self.devpath, self.size, self.fstype,
1666 config.reformat, self.format, self.journal_size,
1667 self.inode_size, self.mkfsoptions, self.backfstype,
1670 # Even for writing logs we mount mds with supplied mount options
1671 # because it will not mount smfs (if used) otherwise.
1673 mountfsoptions = def_mount_options(self.fstype, 'mds')
1675 if config.mountfsoptions:
1677 mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
1679 mountfsoptions = config.mountfsoptions
1680 if self.mountfsoptions:
1681 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1683 if self.mountfsoptions:
1685 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1687 mountfsoptions = self.mountfsoptions
1689 if self.fstype == 'smfs':
1690 realdev = self.fstype
1693 mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
1697 mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
1702 print 'MDS mount options: ' + mountfsoptions
1704 # As mount options are passed by 4th param to config tool, we need
1705 # to pass something in 3rd param. But we do not want this 3rd param
1706 # be counted as a profile name for reading log on MDS setup, thus,
1707 # we pass there some predefined sign like 'dumb', which will be
1708 # checked in MDS code and skipped. Probably there is more nice way
1709 # like pass empty string and check it in config tool and pass null
1711 lctl.newdev("mds", self.name, self.uuid,
1712 setup ="%s %s %s %s" %(realdev, self.fstype,
1713 'dumb', mountfsoptions))
1714 # record logs for the MDS lov
1715 for uuid in self.filesystem_uuids:
1716 log("recording clients for filesystem:", uuid)
1717 fs = self.db.lookup(uuid)
1719 # this is ugly, should be organized nice later.
1720 target_uuid = self.db.get_first_ref('target')
1721 mds = self.db.lookup(target_uuid)
1723 lovconfig_uuid = mds.get_first_ref('lovconfig')
1725 lovconfig = mds.lookup(lovconfig_uuid)
1726 obd_uuid = lovconfig.get_first_ref('lov')
1728 obd_uuid = fs.get_first_ref('obd')
1730 client_uuid = generate_client_uuid(self.name)
1731 client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
# Record the mount log (setup) and its '-clean' counterpart (teardown).
1734 lctl.clear_log(self.name, self.name)
1735 lctl.record(self.name, self.name)
1737 lctl.mount_option(self.name, client.get_name(), "")
1741 lctl.clear_log(self.name, self.name + '-clean')
1742 lctl.record(self.name, self.name + '-clean')
1744 lctl.del_mount_option(self.name)
1749 # record logs for each client
1751 config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
1753 config_options = CONFIG_FILE
1755 for node_db in self.db.lookup_class('node'):
1756 client_name = node_db.getName()
1757 for prof_uuid in node_db.get_refs('profile'):
1758 prof_db = node_db.lookup(prof_uuid)
1759 # refactor this into a function to test "clientness"
1761 for ref_class, ref_uuid in prof_db.get_all_refs():
1762 if ref_class in ('mountpoint','echoclient'):
1763 debug("recording", client_name)
# Temporarily force execution (even under --noexec) so lconf can re-invoke
# itself in --record mode; noexec_opt maps the saved flag back to '-n'.
1764 old_noexec = config.noexec
1766 noexec_opt = ('', '-n')
1767 ret, out = run (sys.argv[0],
1768 noexec_opt[old_noexec == 1],
1769 " -v --record --nomod",
1770 "--record_log", client_name,
1771 "--record_device", self.name,
1772 "--node", client_name,
1775 for s in out: log("record> ", string.strip(s))
1776 ret, out = run (sys.argv[0],
1777 noexec_opt[old_noexec == 1],
1778 "--cleanup -v --record --nomod",
1779 "--record_log", client_name + "-clean",
1780 "--record_device", self.name,
1781 "--node", client_name,
1784 for s in out: log("record> ", string.strip(s))
1785 config.noexec = old_noexec
1787 lctl.cleanup(self.name, self.uuid, 0, 0)
1788 except CommandError, e:
1789 log(self.module_name, "cleanup failed: ", self.name)
1792 Module.cleanup(self)
1794 if self.fstype == 'smfs':
1795 clean_loop(self.backdevpath)
1797 clean_loop(self.devpath)
# True while any 'mds' device is still registered with the kernel.
1799 def msd_remaining(self):
1800 out = lctl.device_list()
1802 if string.split(s)[2] in ('mds',):
1805 def safe_to_clean(self):
1808 def safe_to_clean_modules(self):
1809 return not self.msd_remaining()
1813 debug(self.uuid, "not active")
1816 if is_prepared(self.name):
1818 lctl.cleanup(self.name, self.uuid, config.force,
1820 except CommandError, e:
1821 log(self.module_name, "cleanup failed: ", self.name)
1824 Module.cleanup(self)
1827 self.master.cleanup()
# Tear down the shared MDT device only when no MDS devices remain.
1828 if not self.msd_remaining() and is_prepared('MDT'):
1830 lctl.cleanup("MDT", "MDT_UUID", config.force,
1832 except CommandError, e:
1833 print "cleanup failed: ", self.name
1837 if self.fstype == 'smfs':
1838 clean_loop(self.backdevpath)
1840 clean_loop(self.devpath)
1842 def correct_level(self, level, op=None):
1843 #if self.master_mds:
# --- OSD (object storage device) driver, interior of class OSD ---
# NOTE(review): numbered listing with gaps; else:/try: lines and some method
# headers are missing from this view.
# Formats/mounts the OST backing store and registers the osdtype device
# (obdfilter or obdecho) plus the shared OSS device with lctl.
1848 def __init__(self, db):
1849 Module.__init__(self, 'OSD', db)
1850 self.osdtype = self.db.get_val('osdtype')
1851 self.devpath = self.db.get_val('devpath', '')
1852 self.backdevpath = self.db.get_val('backdevpath', '')
1853 self.size = self.db.get_val_int('devsize', 0)
1854 self.journal_size = self.db.get_val_int('journalsize', 0)
1855 self.inode_size = self.db.get_val_int('inodesize', 0)
1856 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1857 self.mountfsoptions = self.db.get_val('mountfsoptions', '')
1858 self.fstype = self.db.get_val('fstype', '')
1859 self.backfstype = self.db.get_val('backfstype', '')
1860 self.nspath = self.db.get_val('nspath', '')
# As in MDSDEV: take the OST target's name/uuid instead of the OSD's own.
1861 target_uuid = self.db.get_first_ref('target')
1862 ost = self.db.lookup(target_uuid)
1863 self.name = ost.getName()
1864 self.format = self.db.get_val('autoformat', 'yes')
1865 if ost.get_val('failover', 0):
1866 self.failover_ost = 'f'
1868 self.failover_ost = 'n'
1870 active_uuid = get_active_target(ost)
1872 panic("No target device found:", target_uuid)
1873 if active_uuid == self.uuid:
1877 if self.active and config.group and config.group != ost.get_val('group'):
1880 self.target_dev_uuid = self.uuid
1881 self.uuid = target_uuid
1883 self.add_lustre_module('ost', 'ost')
1884 if self.fstype == 'smfs':
1885 self.add_lustre_module('smfs', 'smfs')
1886 # FIXME: should we default to ext3 here?
1887 if self.fstype == 'ldiskfs':
1888 self.add_lustre_module('ldiskfs', 'ldiskfs')
1890 self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
1891 if self.fstype == 'smfs':
1892 self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.backfstype))
1894 self.add_lustre_module(self.osdtype, self.osdtype)
1896 def load_module(self):
1898 Module.load_module(self)
1900 # need to check /proc/mounts and /etc/mtab before
1901 # formatting anything.
1902 # FIXME: check if device is already formatted.
1904 if is_prepared(self.name):
1907 debug(self.uuid, "not active")
1909 self.info(self.osdtype, self.devpath, self.size, self.fstype,
1910 self.format, self.journal_size, self.inode_size)
# obdecho needs no backing block device; real OSDs get one prepared here.
1912 if self.osdtype == 'obdecho':
1915 blkdev = block_dev(self.devpath, self.size, self.fstype,
1916 config.reformat, self.format, self.journal_size,
1917 self.inode_size, self.mkfsoptions, self.backfstype,
# Mount-option assembly mirrors MDSDEV.prepare(): fstype defaults, then
# command-line --mountfsoptions, then per-device options.
1920 mountfsoptions = def_mount_options(self.fstype, 'ost')
1922 if config.mountfsoptions:
1924 mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
1926 mountfsoptions = config.mountfsoptions
1927 if self.mountfsoptions:
1928 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1930 if self.mountfsoptions:
1932 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1934 mountfsoptions = self.mountfsoptions
1936 if self.fstype == 'smfs':
1937 realdev = self.fstype
1940 mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
1944 mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
1949 print 'OSD mount options: ' + mountfsoptions
1951 lctl.newdev(self.osdtype, self.name, self.uuid,
1952 setup ="%s %s %s %s" %(realdev, self.fstype,
# Shared OSS device is created once per node, like MDT for the MDS side.
1955 if not is_prepared('OSS'):
1956 lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
# True while any obdfilter/obdecho device is still registered in the kernel.
1958 def osd_remaining(self):
1959 out = lctl.device_list()
1961 if string.split(s)[2] in ('obdfilter', 'obdecho'):
1964 def safe_to_clean(self):
1967 def safe_to_clean_modules(self):
1968 return not self.osd_remaining()
1972 debug(self.uuid, "not active")
1974 if is_prepared(self.name):
1977 lctl.cleanup(self.name, self.uuid, config.force,
1979 except CommandError, e:
1980 log(self.module_name, "cleanup failed: ", self.name)
1983 if not self.osd_remaining() and is_prepared('OSS'):
1985 lctl.cleanup("OSS", "OSS_UUID", config.force,
1987 except CommandError, e:
1988 print "cleanup failed: ", self.name
1991 if not self.osdtype == 'obdecho':
1992 if self.fstype == 'smfs':
1993 clean_loop(self.backdevpath)
1995 clean_loop(self.devpath)
1997 def correct_level(self, level, op=None):
# Resolve the management-service uuid for the filesystem behind a mountpoint
# name; returns the fs's 'mgmt' ref (lines are missing in this listing — the
# not-found/early-return branches are not visible).
2000 def mgmt_uuid_for_fs(mtpt_name):
2003 mtpt_db = toplustreDB.lookup_name(mtpt_name)
2004 fs_uuid = mtpt_db.get_first_ref('filesystem')
2005 fs = toplustreDB.lookup(fs_uuid)
2008 return fs.get_first_ref('mgmt')
2010 # Generic client module, used by OSC and MDC
# Client: generic client-side device (base for OSC and MDC).  Resolves the
# active target device, finds a local server or a route to one, and creates
# the client device via lctl.
# NOTE(review): numbered listing with gaps; else:/try: lines are missing.
2011 class Client(Module):
2012 def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
2014 self.target_name = tgtdb.getName()
2015 self.target_uuid = tgtdb.getUUID()
2018 self.tgt_dev_uuid = get_active_target(tgtdb)
2019 if not self.tgt_dev_uuid:
2020 panic("No target device found for target(1):", self.target_name)
2022 self.kmod = kmod(config.lustre, config.portals)
2026 self.module = module
2027 self.module_name = string.upper(module)
# Default device name encodes module, local hostname, target and fs so it is
# unique per client instance; self_name (if given) overrides it.
2029 self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
2030 self.target_name, fs_name)
2032 self.name = self_name
2034 self.lookup_server(self.tgt_dev_uuid)
2035 mgmt_uuid = mgmt_uuid_for_fs(fs_name)
2037 self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
2040 self.fs_name = fs_name
2043 self.add_lustre_module(module_dir, module)
2045 def lookup_server(self, srv_uuid):
2046 """ Lookup a server's network information """
2047 self._server_nets = get_ost_net(self.db, srv_uuid)
2048 if len(self._server_nets) == 0:
2049 panic ("Unable to find a server for:", srv_uuid)
2052 def get_servers(self):
2053 return self._server_nets
2055 def prepare(self, ignore_connect_failure = 0):
2056 self.info(self.target_uuid)
2057 if is_prepared(self.name):
# Prefer a server on a local network; otherwise add routes to reach one.
2060 srv = choose_local_server(self.get_servers())
2064 routes = find_route(self.get_servers())
2065 if len(routes) == 0:
2066 panic ("no route to", self.target_uuid)
2067 for (srv, r) in routes:
2068 lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
2069 except CommandError, e:
2070 if not ignore_connect_failure:
# Targets listed in --inactive start as 'inactive' if the subclass allows it.
2073 if self.target_uuid in config.inactive and self.permits_inactive():
2074 debug("%s inactive" % self.target_uuid)
2075 inactive_p = "inactive"
2077 debug("%s active" % self.target_uuid)
2079 lctl.newdev(self.module, self.name, self.uuid,
2080 setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
2081 inactive_p, self.mgmt_name))
2084 if is_prepared(self.name):
2085 Module.cleanup(self)
2087 srv = choose_local_server(self.get_servers())
2089 lctl.disconnect(srv)
2091 for (srv, r) in find_route(self.get_servers()):
2092 lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
2093 except CommandError, e:
2094 log(self.module_name, "cleanup failed: ", self.name)
2098 def correct_level(self, level, op=None):
# MDC / OSC: thin Client subclasses that fix the module name; their
# permits_inactive() return values (not visible here) decide whether
# --inactive may apply to them.  ManagementClient wires the mgmt_cli module
# with a deterministic per-uuid device name.
2103 def __init__(self, db, uuid, fs_name):
2104 Client.__init__(self, db, uuid, 'mdc', fs_name)
2106 def permits_inactive(self):
2110 def __init__(self, db, uuid, fs_name):
2111 Client.__init__(self, db, uuid, 'osc', fs_name)
2113 def permits_inactive(self):
# Deterministic management-client device name for a given mgmt uuid.
2116 def mgmtcli_name_for_uuid(uuid):
2117 return 'MGMTCLI_%s' % uuid
2119 class ManagementClient(Client):
2120 def __init__(self, db, uuid):
2121 Client.__init__(self, db, uuid, 'mgmt_cli', '',
2122 self_name = mgmtcli_name_for_uuid(db.getUUID()),
2123 module_dir = 'mgmt')
# --- COBD (caching OBD), interior of class COBD ---
# Pairs a 'real' and a 'cache' OBD; both are LOVs when type indicates OSTs,
# otherwise MDCs (see the get_mdc fallback below).  Setup registers the pair
# as one cobd device.  NOTE(review): listing has gaps; branch lines missing.
2126 def __init__(self, db, uuid, name, type, name_override = None):
2127 Module.__init__(self, 'COBD', db)
2128 self.name = self.db.getName();
2129 self.uuid = generate_client_uuid(self.name)
2130 self.real_uuid = self.db.get_first_ref('realobd')
2131 self.cache_uuid = self.db.get_first_ref('cacheobd')
2132 self.add_lustre_module('cobd', 'cobd')
2133 real_obd = self.db.lookup(self.real_uuid)
2135 panic('real obd not found:', self.real_uuid)
2136 cache_obd = self.db.lookup(self.cache_uuid)
2138 panic('cache obd not found:', self.cache_uuid)
2140 self.real = LOV(real_obd, self.real_uuid, name,
2141 "%s_real" % (self.name));
2142 self.cache = LOV(cache_obd, self.cache_uuid, name,
2143 "%s_cache" % (self.name));
# Metadata variant: wrap the real/cache refs as MDCs instead of LOVs.
2145 self.real = get_mdc(db, uuid, name, self.real_uuid)
2146 self.cache = get_mdc(db, uuid, name, self.cache_uuid)
2147 # need to check /proc/mounts and /etc/mtab before
2148 # formatting anything.
2149 # FIXME: check if device is already formatted.
2154 def get_real_name(self):
2155 return self.real.name
2156 def get_cache_name(self):
2157 return self.cache.name
2160 self.cache.prepare()
2161 if is_prepared(self.name):
2163 self.info(self.real_uuid, self.cache_uuid)
2164 lctl.newdev("cobd", self.name, self.uuid,
2165 setup ="%s %s" %(self.real.name,
2169 if is_prepared(self.name):
2170 Module.cleanup(self)
2172 self.cache.cleanup()
2174 def load_module(self):
2175 self.real.load_module()
2176 Module.load_module(self)
2178 def cleanup_module(self):
2179 Module.cleanup_module(self)
2180 self.real.cleanup_module()
2182 # virtual interface for OSC and LOV
# VOSC: dispatch wrapper that instantiates LOV, COBD or plain OSC based on
# the db class, then forwards all Module operations to it.
2184 def __init__(self, db, client_uuid, name, name_override = None):
2185 Module.__init__(self, 'VOSC', db)
2186 if db.get_class() == 'lov':
2187 self.osc = LOV(db, client_uuid, name, name_override)
2189 elif db.get_class() == 'cobd':
2190 self.osc = COBD(db, client_uuid, name, 'obd')
2193 self.osc = OSC(db, client_uuid, name)
2196 return self.osc.get_uuid()
2198 return self.osc.get_name()
2203 def load_module(self):
2204 self.osc.load_module()
2205 def cleanup_module(self):
2206 self.osc.cleanup_module()
2207 def correct_level(self, level, op=None):
2208 return self.osc.correct_level(level, op)
2210 # virtual interface for MDC and LMV
# VMDC: metadata-side counterpart of VOSC — dispatches to LMV, COBD or MDC
# based on the db class and forwards Module operations.
2212 def __init__(self, db, client_uuid, name, name_override = None):
2213 Module.__init__(self, 'VMDC', db)
2214 if db.get_class() == 'lmv':
2215 self.mdc = LMV(db, client_uuid, name)
2216 elif db.get_class() == 'cobd':
2217 self.mdc = COBD(db, client_uuid, name, 'mds')
2219 self.mdc = MDC(db, client_uuid, name)
# NOTE(review): VOSC's accessors call get_uuid()/get_name(); here the raw
# attributes are returned directly — inconsistent but apparently deliberate.
2221 return self.mdc.uuid
2223 return self.mdc.name
2228 def load_module(self):
2229 self.mdc.load_module()
2230 def cleanup_module(self):
2231 self.mdc.cleanup_module()
2232 def correct_level(self, level, op=None):
2233 return self.mdc.correct_level(level, op)
# ECHO_CLIENT: test client that attaches an echo_client device on top of a
# VOSC-wrapped obd for load/echo testing.
2235 class ECHO_CLIENT(Module):
2236 def __init__(self,db):
2237 Module.__init__(self, 'ECHO_CLIENT', db)
2238 self.add_lustre_module('obdecho', 'obdecho')
2239 self.obd_uuid = self.db.get_first_ref('obd')
2240 obd = self.db.lookup(self.obd_uuid)
2241 self.uuid = generate_client_uuid(self.name)
2242 self.osc = VOSC(obd, self.uuid, self.name)
2245 if is_prepared(self.name):
2248 self.osc.prepare() # XXX This is so cheating. -p
2249 self.info(self.obd_uuid)
2251 lctl.newdev("echo_client", self.name, self.uuid,
2252 setup = self.osc.get_name())
2255 if is_prepared(self.name):
2256 Module.cleanup(self)
2259 def load_module(self):
2260 self.osc.load_module()
2261 Module.load_module(self)
2263 def cleanup_module(self):
2264 Module.cleanup_module(self)
2265 self.osc.cleanup_module()
2267 def correct_level(self, level, op=None):
# Build a pseudo-random client uuid from three random 20-bit hex values plus
# (per line 2272, missing from this listing — presumably) the name, truncated
# to the 36-character uuid length.  Uses random(), so NOT suitable for
# security-sensitive identifiers.
2270 def generate_client_uuid(name):
2271 client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
2273 int(random.random() * 1048576),
2274 int(random.random() * 1048576))
2275 return client_uuid[:36]
# Mountpoint: client-side llite mount.  Wires up a VOSC (data) and VMDC
# (metadata) pair, optionally a management client, then mounts lustre_lite.
# NOTE(review): numbered listing with gaps; conditional/else lines missing.
2277 class Mountpoint(Module):
2278 def __init__(self,db):
2279 Module.__init__(self, 'MTPT', db)
2280 self.path = self.db.get_val('path')
2281 self.fs_uuid = self.db.get_first_ref('filesystem')
2282 fs = self.db.lookup(self.fs_uuid)
# Prefer LMV over a single MDS if the filesystem has one.
2283 self.mds_uuid = fs.get_first_ref('lmv')
2284 if not self.mds_uuid:
2285 self.mds_uuid = fs.get_first_ref('mds')
2286 self.obd_uuid = fs.get_first_ref('obd')
2287 self.mgmt_uuid = fs.get_first_ref('mgmt')
2288 client_uuid = generate_client_uuid(self.name)
2290 ost = self.db.lookup(self.obd_uuid)
2292 panic("no ost: ", self.obd_uuid)
2294 mds = self.db.lookup(self.mds_uuid)
2296 panic("no mds: ", self.mds_uuid)
2298 self.add_lustre_module('mdc', 'mdc')
2299 self.add_lustre_module('lmv', 'lmv')
2300 self.add_lustre_module('llite', 'llite')
2302 self.vosc = VOSC(ost, client_uuid, self.name)
2303 self.vmdc = VMDC(mds, client_uuid, self.name)
2306 self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
2312 if fs_is_mounted(self.path):
2313 log(self.path, "already mounted.")
2317 self.mgmtcli.prepare()
2320 vmdc_name = self.vmdc.get_name()
2322 self.info(self.path, self.mds_uuid, self.obd_uuid)
# When recording/dumping, emit the mount option instead of really mounting.
2323 if config.record or config.lctl_dump:
2324 lctl.mount_option(local_node_name, self.vosc.get_name(), vmdc_name)
2326 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s %s %s" % \
2327 (self.vosc.get_name(), vmdc_name, config.config, self.path)
2328 run("mkdir", self.path)
2333 panic("mount failed:", self.path, ":", string.join(val))
2336 self.info(self.path, self.mds_uuid,self.obd_uuid)
2338 if config.record or config.lctl_dump:
2339 lctl.del_mount_option(local_node_name)
2341 if fs_is_mounted(self.path):
# Forced umount first (under --force, per the missing condition line), then a
# plain umount; raise if it still fails.
2343 (rc, out) = run("umount", "-f", self.path)
2345 (rc, out) = run("umount", self.path)
2347 raise CommandError('umount', out, rc)
2349 if fs_is_mounted(self.path):
2350 panic("fs is still mounted:", self.path)
2355 self.mgmtcli.cleanup()
2357 def load_module(self):
2359 self.mgmtcli.load_module()
2360 self.vosc.load_module()
2361 Module.load_module(self)
2363 def cleanup_module(self):
2364 Module.cleanup_module(self)
2365 self.vosc.cleanup_module()
2367 self.mgmtcli.cleanup_module()
2369 def correct_level(self, level, op=None):
2372 # ============================================================
2373 # misc query functions
# Misc query helpers.  These take the db as explicit first arg ('self') —
# they are called function-style on db objects elsewhere in lconf.
# get_ost_net: return the list of Network objects for the node hosting the
# given OSD uuid.
2375 def get_ost_net(self, osd_uuid):
2379 osd = self.lookup(osd_uuid)
2380 node_uuid = osd.get_first_ref('node')
2381 node = self.lookup(node_uuid)
# NOTE(review): the panic references 'node_uuid_' (trailing underscore) —
# likely a typo for 'node_uuid'; would NameError if this branch ran.
2383 panic("unable to find node for osd_uuid:", osd_uuid,
2384 " node_ref:", node_uuid_)
2385 for net_uuid in node.get_networks():
2386 db = node.lookup(net_uuid)
2387 srv_list.append(Network(db))
2391 # the order of initialization is based on level.
2392 def getServiceLevel(self):
2393 type = self.get_class()
2395 if type in ('network',):
2397 elif type in ('routetbl',):
2399 elif type in ('ldlm',):
2401 elif type in ('mgmt',):
2403 elif type in ('osd', 'cobd'):
2405 elif type in ('mdsdev',):
2407 elif type in ('lmv',):
2409 elif type in ('mountpoint', 'echoclient'):
2412 panic("Unknown type: ", type)
# Services outside the configured [minlevel, maxlevel] window are filtered out.
2414 if ret < config.minlevel or ret > config.maxlevel:
2419 # return list of services in a profile. list is a list of tuples
2420 # [(level, db_object),]
2421 def getServices(self):
2423 for ref_class, ref_uuid in self.get_all_refs():
2424 servdb = self.lookup(ref_uuid)
2426 level = getServiceLevel(servdb)
2428 list.append((level, servdb))
2430 panic('service not found: ' + ref_uuid)
2436 ############################################################
2438 # FIXME: clean this mess up!
2440 # OSC is no longer in the xml, so we have to fake it.
2441 # this is getting ugly and begging for another refactoring
# OSC is no longer represented in the XML config, so these factories fake it:
# construct an OSC/MDC client object directly from the db records.
2442 def get_osc(ost_db, uuid, fs_name):
2443 osc = OSC(ost_db, uuid, fs_name)
2446 def get_mdc(db, uuid, fs_name, mds_uuid):
2447 mds_db = db.lookup(mds_uuid);
2449 error("no mds:", mds_uuid)
2450 mdc = MDC(mds_db, mds_uuid, fs_name)
2453 ############################################################
2454 # routing ("rooting")
2455 # list of (nettype, cluster_id, nid)
# Routing discovery.  find_local_clusters fills the module-global
# 'local_clusters' list of (nettype, cluster_id, nid) tuples for this node and
# registers an AcceptorHandler per listening port (panicking on duplicates).
2458 def find_local_clusters(node_db):
2459 global local_clusters
2460 for netuuid in node_db.get_networks():
2461 net = node_db.lookup(netuuid)
2463 debug("add_local", netuuid)
2464 local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
2466 if acceptors.has_key(srv.port):
2467 panic("duplicate port:", srv.port)
2468 acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
2469 srv.send_mem, srv.recv_mem,
2472 # This node is a gateway.
2474 def node_is_router():
2477 # If there are any routers found in the config, then this will be true
2478 # and all nodes will load kptlrouter.
2480 def node_needs_router():
2481 return needs_router or is_router
2483 # list of (nettype, gw, tgt_cluster_id, lo, hi)
2484 # Currently, these local routes are only added to kptlrouter route
2485 # table if they are needed to connect to a specific server. This
2486 # should be changed so all available routes are loaded, and the
2487 # ptlrouter can make all the decisions.
2490 def find_local_routes(lustre):
2491 """ Scan the lustre config looking for routers . Build list of
2493 global local_routes, needs_router
2495 list = lustre.lookup_class('node')
2497 if router.get_val_int('router', 0):
# For each local cluster, find this router's matching network and collect the
# routes reachable via that gateway nid.
2499 for (local_type, local_cluster_id, local_nid) in local_clusters:
2501 for netuuid in router.get_networks():
2502 db = router.lookup(netuuid)
2503 if (local_type == db.get_val('nettype') and
2504 local_cluster_id == db.get_val('clusterid')):
2505 gw = db.get_val('nid')
2508 debug("find_local_routes: gw is", gw)
2509 for route in router.get_local_routes(local_type, gw):
2510 local_routes.append(route)
2511 debug("find_local_routes:", local_routes)
# Server/route selection helpers.
# choose_local_server: first server whose (net_type, cluster_id) matches a
# local cluster; returns None implicitly if none match (return line missing
# from this listing).
2514 def choose_local_server(srv_list):
2515 for srv in srv_list:
2516 if local_cluster(srv.net_type, srv.cluster_id):
2519 def local_cluster(net_type, cluster_id):
2520 for cluster in local_clusters:
2521 if net_type == cluster[0] and cluster_id == cluster[1]:
2525 def local_interface(net_type, cluster_id, nid):
2526 for cluster in local_clusters:
2527 if (net_type == cluster[0] and cluster_id == cluster[1]
2528 and nid == cluster[2]):
# find_route: for each server, collect local routes (r = (type, gw, cluster,
# lo, hi)) whose nid range and cluster id cover the server's nid.
2532 def find_route(srv_list):
2534 frm_type = local_clusters[0][0]
2535 for srv in srv_list:
2536 debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
2537 to_type = srv.net_type
2539 cluster_id = srv.cluster_id
2540 debug ('looking for route to', to_type, to)
2541 for r in local_routes:
2542 debug("find_route: ", r)
2543 if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
2544 result.append((srv, r))
# get_active_target: per-target failover resolution — a node selected on the
# command line wins, otherwise the config's 'active' ref.
2547 def get_active_target(db):
2548 target_uuid = db.getUUID()
2549 target_name = db.getName()
2550 node_name = get_select(target_name)
2552 tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
2554 tgt_dev_uuid = db.get_first_ref('active')
2557 def get_server_by_nid_uuid(db, nid_uuid):
2558 for n in db.lookup_class("network"):
2560 if net.nid_uuid == nid_uuid:
2564 ############################################################
# Interior of newService(db) (the 'def' line is missing from this listing):
# factory that maps a config db class name to the corresponding Module
# subclass instance.  The placeholder uuid marks objects whose uuid should
# never surface.
2568 type = db.get_class()
2569 debug('Service:', type, db.getName(), db.getUUID())
2574 n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
2575 elif type == 'network':
2577 elif type == 'routetbl':
2581 elif type == 'cobd':
2582 n = COBD(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
2583 elif type == 'mdsdev':
2585 elif type == 'mountpoint':
2587 elif type == 'echoclient':
2589 elif type == 'mgmt':
2594 panic ("unknown service type:", type)
2598 # Prepare the system to run lustre using a particular profile
2599 # in a the configuration.
2600 # * load & the modules
2601 # * setup networking for the current node
2602 # * make sure partitions are in place and prepared
2603 # * initialize devices with lctl
2604 # Levels is important, and needs to be enforced.
# Apply 'operation' (one of the do* functions below) to the services of every
# profile uuid in prof_list.
2605 def for_each_profile(db, prof_list, operation):
2606 for prof_uuid in prof_list:
2607 prof_db = db.lookup(prof_uuid)
# NOTE(review): panic references 'profile', not 'prof_uuid' — likely a stale
# variable name; would NameError if this branch ran.
2609 panic("profile:", profile, "not found.")
2610 services = getServices(prof_db)
# Profile operations, each taking the [(level, db), ...] list from
# getServices().  NOTE(review): listing has gaps — sort calls and loop
# headers are missing from this view.
# doWriteconf: run write_conf on every mdsdev service.
2613 def doWriteconf(services):
2617 if s[1].get_class() == 'mdsdev':
2618 n = newService(s[1])
# doSetup: instantiate services, re-sort by corrected level, then prepare.
2621 def doSetup(services):
2626 n = newService(s[1])
2628 slist.append((n.level, n))
2631 nl = n[1].correct_level(n[0])
2632 nlist.append((nl, n[1]))
2637 def doModules(services):
2641 n = newService(s[1])
# doCleanup: like doSetup but in teardown order, honoring safe_to_clean().
2644 def doCleanup(services):
2649 n = newService(s[1])
2651 slist.append((n.level, n))
2654 nl = n[1].correct_level(n[0])
2655 nlist.append((nl, n[1]))
2659 if n[1].safe_to_clean():
2662 def doUnloadModules(services):
2667 n = newService(s[1])
2668 if n.safe_to_clean_modules():
# doHost: top-level per-node driver.  Looks this host up in the config,
# reads its tunables, then runs the requested mode: --write_conf, --recover,
# --cleanup, or normal two-phase setup (load modules, then configure).
# NOTE(review): listing has gaps; loop/else lines are missing from this view.
2673 def doHost(lustreDB, hosts):
2674 global is_router, local_node_name
2677 node_db = lustreDB.lookup_name(h, 'node')
2681 panic('No host entry found.')
2683 local_node_name = node_db.get_val('name', 0)
2684 is_router = node_db.get_val_int('router', 0)
2685 lustre_upcall = node_db.get_val('lustreUpcall', '')
2686 portals_upcall = node_db.get_val('portalsUpcall', '')
2687 timeout = node_db.get_val_int('timeout', 0)
2688 ptldebug = node_db.get_val('ptldebug', '')
2689 subsystem = node_db.get_val('subsystem', '')
2691 find_local_clusters(node_db)
2693 find_local_routes(lustreDB)
2695 # Two step process: (1) load modules, (2) setup lustre
2696 # if not cleaning, load modules first.
2697 prof_list = node_db.get_refs('profile')
2699 if config.write_conf:
2701 for_each_profile(node_db, prof_list, doModules)
2703 for_each_profile(node_db, prof_list, doWriteconf)
2704 for_each_profile(node_db, prof_list, doUnloadModules)
2706 elif config.recover:
2707 if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
2708 raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
2709 "--client_uuid <UUID> --conn_uuid <UUID>")
2710 doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
2712 elif config.cleanup:
2714 # the command line can override this value
2716 # ugly hack, only need to run lctl commands for --dump
2717 if config.lctl_dump or config.record:
2718 for_each_profile(node_db, prof_list, doCleanup)
2721 sys_set_timeout(timeout)
2722 sys_set_ptldebug(ptldebug)
2723 sys_set_subsystem(subsystem)
2724 sys_set_lustre_upcall(lustre_upcall)
2725 sys_set_portals_upcall(portals_upcall)
2727 for_each_profile(node_db, prof_list, doCleanup)
2728 for_each_profile(node_db, prof_list, doUnloadModules)
2732 # ugly hack, only need to run lctl commands for --dump
2733 if config.lctl_dump or config.record:
2734 sys_set_timeout(timeout)
2735 sys_set_lustre_upcall(lustre_upcall)
2736 for_each_profile(node_db, prof_list, doSetup)
# Bump TCP buffer ceilings before loading modules (MAXTCPBUF defined earlier
# in the file, near DEFAULT_TCPBUF).
2740 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
2741 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
2743 for_each_profile(node_db, prof_list, doModules)
2745 sys_set_debug_path()
2746 sys_set_ptldebug(ptldebug)
2747 sys_set_subsystem(subsystem)
2748 script = config.gdb_script
2749 run(lctl.lctl, ' modules >', script)
2751 log ("The GDB module script is in", script)
2752 # pause, so user has time to break and
2755 sys_set_timeout(timeout)
2756 sys_set_lustre_upcall(lustre_upcall)
2757 sys_set_portals_upcall(portals_upcall)
2759 for_each_profile(node_db, prof_list, doSetup)
# doRecovery: fail a client connection over to the currently-active target.
# Disconnects the old nid (best-effort), then tells lctl to recover the
# client onto the new server's nid.
2762 def doRecovery(lustreDB, lctl, tgt_uuid, client_uuid, nid_uuid):
2763 tgt = lustreDB.lookup(tgt_uuid)
2765 raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
2766 new_uuid = get_active_target(tgt)
2768 raise Lustre.LconfError("doRecovery: no active target found for: " +
2770 net = choose_local_server(get_ost_net(lustreDB, new_uuid))
2772 raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
2774 log("Reconnecting", tgt_uuid, " to ", net.nid_uuid);
2776 oldnet = get_server_by_nid_uuid(lustreDB, nid_uuid)
# Best-effort disconnect of the stale connection; failure is only logged.
2779 lctl.disconnect(oldnet)
2780 except CommandError, e:
2781 log("recover: disconnect", nid_uuid, "failed: ")
2786 except CommandError, e:
2787 log("recover: connect failed")
2790 lctl.recover(client_uuid, net.nid_uuid)
# NOTE(review): line-sampled listing -- original lines 2800 and 2806 are
# missing, so some of the conditions below are incomplete as shown.
2793 def setupModulePath(cmd, portals_dir = PORTALS_DIR):
# Derive the --lustre/--portals module search directories.  In development
# (source-tree) mode they default to locations relative to the running
# command; otherwise explicit --lustre/--portals values are normalized.
2794 base = os.path.dirname(cmd)
2795 if development_mode():
2796 if not config.lustre:
2797 debug('using objdir module paths')
2798 config.lustre = (os.path.join(base, ".."))
2799 # normalize the portals dir, using command line arg if set
2801 portals_dir = config.portals
2802 dir = os.path.join(config.lustre, portals_dir)
2803 config.portals = dir
2804 debug('config.portals', config.portals)
2805 elif config.lustre and config.portals:
2807 # if --lustre and --portals, normalize portals
2808 # can ignore PORTALS_DIR here, since it is probably useless here
2809 config.portals = os.path.join(config.lustre, config.portals)
2810 debug('config.portals B', config.portals)
2812 def sysctl(path, val):
# Write val to /proc/sys/<path>.  NOTE(review): lines 2814-2816 and the
# tail are missing from this listing -- presumably a config.noexec guard
# plus the fp.write/fp.close; confirm against the full file.
2813 debug("+ sysctl", path, val)
2817 fp = open(os.path.join('/proc/sys', path), 'w')
def sys_set_debug_path():
    """Point the portals debug-log dump file at the configured path."""
    dump_path = config.debug_path
    sysctl('portals/debug_path', dump_path)
2827 def sys_set_lustre_upcall(upcall):
# Install the lustre recovery upcall through lctl.  Priority: the
# --lustre_upcall option, then (presumably via the missing line 2831, an
# elif on config.upcall -- TODO confirm) the generic --upcall option,
# then the node-config value passed in.
2828 # the command overrides the value in the node config
2829 if config.lustre_upcall:
2830 upcall = config.lustre_upcall
2832 upcall = config.upcall
2834 lctl.set_lustre_upcall(upcall)
2836 def sys_set_portals_upcall(upcall):
# Install the portals upcall script via /proc/sys/portals/upcall.  Same
# override order as sys_set_lustre_upcall: --portals_upcall first, then
# (presumably via the missing line 2840 -- TODO confirm) --upcall, then
# the node-config value.
2837 # the command overrides the value in the node config
2838 if config.portals_upcall:
2839 upcall = config.portals_upcall
2841 upcall = config.upcall
2843 sysctl('portals/upcall', upcall)
def sys_set_timeout(timeout):
    """Set the lustre recovery timeout through lctl.

    timeout -- value taken from the node configuration; the --timeout
    command-line option (config.timeout) overrides it when positive.
    Nothing is done when the resulting value is missing or non-positive.
    """
    # the command line overrides the value in the node config
    if config.timeout and config.timeout > 0:
        timeout = config.timeout
    # fix: compare against None with 'is not' rather than '!='
    if timeout is not None and timeout > 0:
        lctl.set_timeout(timeout)
def sys_tweak_socknal():
    """Apply socknal tuning: --single_socket turns off typed sockets."""
    if not config.single_socket:
        return
    sysctl("socknal/typed", 0)
2856 def sys_optimize_elan ():
# Enable event-interrupt punting on any Quadrics Elan NIC variants that
# expose a config file under /proc.  NOTE(review): the loop header over
# procfiles (line 2860, presumably "for p in procfiles:") is missing
# from this line-sampled listing.
2857 procfiles = ["/proc/elan/config/eventint_punt_loops",
2858 "/proc/qsnet/elan3/config/eventint_punt_loops",
2859 "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]
2861 if os.access(p, os.R_OK):
2862 run ("echo 1 > " + p)
2864 def sys_set_ptldebug(ptldebug):
# Translate a symbolic portals debug mask into hex and write it to
# portals/debug.  The --ptldebug option overrides the node-config value.
# NOTE(review): lines 2865, 2867-2868 and the NameError handler body
# (2873+) are missing from this line-sampled listing.
2866 ptldebug = config.ptldebug
# eval with ptldebug_names as the namespace lets expressions like
# "trace|inode" resolve to the bitmask constants defined at file top.
2869 val = eval(ptldebug, ptldebug_names)
2870 val = "0x%x" % (val)
2871 sysctl('portals/debug', val)
2872 except NameError, e:
2875 def sys_set_subsystem(subsystem):
# Translate a symbolic subsystem mask into hex and write it to
# portals/subsystem_debug; --subsystem overrides the node-config value.
# NOTE(review): lines 2878-2879 (presumably "if subsystem:" / "try:") and
# the NameError handler body (2884+) are missing from this listing.
2876 if config.subsystem:
2877 subsystem = config.subsystem
# Same eval-against-name-table trick as sys_set_ptldebug.
2880 val = eval(subsystem, subsystem_names)
2881 val = "0x%x" % (val)
2882 sysctl('portals/subsystem_debug', val)
2883 except NameError, e:
2886 def sys_set_netmem_max(path, max):
# Raise a /proc/sys/net/core/* buffer limit to at least `max` (callers
# pass rmem_max/wmem_max with MAXTCPBUF).  NOTE(review): lines 2888-2894
# (presumably reading the current value and returning early when it is
# already large enough) and the fp.close tail are missing from this
# line-sampled listing.
2887 debug("setting", path, "to at least", max)
2895 fp = open(path, 'w')
2896 fp.write('%d\n' %(max))
def sys_make_devices():
    """Create the portals and obd character device nodes when absent."""
    devices = (
        ('/dev/portals', 'mknod /dev/portals c 10 240'),
        ('/dev/obd', 'mknod /dev/obd c 10 241'),
    )
    for node, mknod_cmd in devices:
        if not os.access(node, os.R_OK):
            run(mknod_cmd)
2907 # Add dir to the global PATH, if not already there.
def add_to_path(new_dir):
    """Append new_dir to os.environ['PATH'] unless it is already there.

    Matches the header comment: duplicates are never appended.
    """
    syspath = os.environ['PATH'].split(':')
    if new_dir in syspath:
        # already present -- avoid duplicate PATH entries
        return
    os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
def default_debug_path():
    """Return the default debug-dump file path, preferring a /r root.

    Mirrors default_gdb_script(): when a '/r' directory exists on this
    node the path is placed under it, otherwise under /tmp directly.
    """
    path = '/tmp/lustre-log'
    if os.path.isdir('/r'):
        return '/r' + path
    # no /r root: use the plain /tmp path
    return path
def default_gdb_script():
    """Return the default gdb module-script path, preferring a /r root.

    Fix: always return a string -- without the final return the function
    would yield None whenever '/r' does not exist.
    """
    script = '/tmp/ogdb'
    if os.path.isdir('/r'):
        return '/r' + script
    return script
# Minimal set of directories every invocation should have on PATH.
2929 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
2930 # ensure basic elements are in the system path
2931 def sanitise_path():
# NOTE(review): the loop body (line 2933+, presumably a call to
# add_to_path(dir) -- TODO confirm) is missing from this listing.
2932 for dir in DEFAULT_PATH:
2935 # global hack for the --select handling
# NOTE(review): lines 2936 (presumably "tgt_select = {}"), 2939-2940 and
# 2942 (the two loop headers over args and list) are missing from this
# line-sampled listing.
2937 def init_select(args):
# Parse --select values of the form service=node[,service2=node2 ...]
# into the global tgt_select mapping used by get_select().
2938 # args = [service=nodeA,service2=nodeB service3=nodeC]
2941 list = string.split(arg, ',')
2943 srv, node = string.split(entry, '=')
2944 tgt_select[srv] = node
def get_select(srv):
    """Return the node chosen via --select for service srv, else None."""
    return tgt_select.get(srv)
# Shorthand aliases for the option-kind constants used in the table below.
2952 FLAG = Lustre.Options.FLAG
2953 PARAM = Lustre.Options.PARAM
2954 INTPARAM = Lustre.Options.INTPARAM
2955 PARAMLIST = Lustre.Options.PARAMLIST
# Command-line option table handed to Lustre.Options in main():
# (name[,short], help-text[, kind[, default]]) tuples.
# NOTE(review): this listing is line-sampled -- the "lconf_options = ["
# opener (line 2956) and several continuation/closing lines (e.g. 2964,
# 2966, 2971, 2983, 2988, 2990, 2993-2996, 2998, 3018, 3021-3022) are
# missing, so some tuples below appear truncated.
2957 ('verbose,v', "Print system commands as they are run"),
2958 ('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM),
2959 ('config', "Cluster config name used for LDAP query", PARAM),
2960 ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
2961 ('node', "Load config for <nodename>", PARAM),
2962 ('cleanup,d', "Cleans up config. (Shutdown)"),
2963 ('force,f', "Forced unmounting and/or obd detach during cleanup",
2965 ('single_socket', "socknal option: only use one socket instead of bundle",
2967 ('failover',"""Used to shut down without saving state.
2968 This will allow this node to "give up" a service to a
2969 another node for failover purposes. This will not
2970 be a clean shutdown.""",
2972 ('gdb', """Prints message after creating gdb module script
2973 and sleeps for 5 seconds."""),
2974 ('noexec,n', """Prints the commands and steps that will be run for a
2975 config without executing them. This can used to check if a
2976 config file is doing what it should be doing"""),
2977 ('nomod', "Skip load/unload module step."),
2978 ('nosetup', "Skip device setup/cleanup step."),
2979 ('reformat', "Reformat all devices (without question)"),
2980 ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
2981 ('mountfsoptions', "Additional options for mount fs command line", PARAM),
2982 ('dump', "Dump the kernel debug log to file before portals is unloaded",
2984 ('write_conf', "Save all the client config information on mds."),
2985 ('record', "Write config information on mds."),
2986 ('record_log', "Name of config record log.", PARAM),
2987 ('record_device', "MDS device name that will record the config commands",
2989 ('minlevel', "Minimum level of services to configure/cleanup",
2991 ('maxlevel', """Maximum level of services to configure/cleanup
2992 Levels are aproximatly like:
2997 70 - mountpoint, echo_client, osc, mdc, lov""",
2999 ('lustre', """Base directory of lustre sources. This parameter will
3000 cause lconf to load modules from a source tree.""", PARAM),
3001 ('portals', """Portals source directory. If this is a relative path,
3002 then it is assumed to be relative to lustre. """, PARAM),
3003 ('timeout', "Set recovery timeout", INTPARAM),
3004 ('upcall', "Set both portals and lustre upcall script", PARAM),
3005 ('lustre_upcall', "Set lustre upcall script", PARAM),
3006 ('portals_upcall', "Set portals upcall script", PARAM),
3007 ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
3008 ('ptldebug', "Set the portals debug level", PARAM),
3009 ('subsystem', "Set the portals debug subsystem", PARAM),
3010 ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
3011 ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
3012 # Client recovery options
3013 ('recover', "Recover a device"),
3014 ('group', "The group of devices to configure or cleanup", PARAM),
3015 ('tgt_uuid', "The failed target (required for recovery)", PARAM),
3016 ('client_uuid', "The failed client (required for recovery)", PARAM),
3017 ('conn_uuid', "The failed connection (required for recovery)", PARAM),
3019 ('inactive', """The name of an inactive service, to be ignored during
3020 mounting (currently OST-only). Can be repeated.""",
# NOTE(review): the enclosing "def main():" header (circa line 3023) and
# many interior lines are missing from this line-sampled listing; the
# try/except and if/else structures below are incomplete as shown.
3025 global lctl, config, toplustreDB, CONFIG_FILE
3027 # in the upcall this is set to SIG_IGN
3028 signal.signal(signal.SIGCHLD, signal.SIG_DFL)
# Parse the command line according to the lconf_options table.
3030 cl = Lustre.Options("lconf", "config.xml", lconf_options)
3032 config, args = cl.parse(sys.argv[1:])
3033 except Lustre.OptionError, e:
3037 setupModulePath(sys.argv[0])
3039 host = socket.gethostname()
3041 # the PRNG is normally seeded with time(), which is not so good for starting
3042 # time-synchronized clusters
3043 input = open('/dev/urandom', 'r')
3045 print 'Unable to open /dev/urandom!'
3047 seed = input.read(32)
3053 init_select(config.select)
3056 # allow config to be fetched via HTTP, but only with python2
3057 if sys.version[0] != '1' and args[0].startswith('http://'):
3060 config_file = urllib2.urlopen(args[0])
3061 except (urllib2.URLError, socket.error), err:
3062 if hasattr(err, 'args'):
3064 print "Could not access '%s': %s" %(args[0], err)
3066 elif not os.access(args[0], os.R_OK):
3067 print 'File not found or readable:', args[0]
3071 config_file = open(args[0], 'r')
# Parse the XML config; a parse failure is fatal even in debug mode.
3073 dom = xml.dom.minidom.parse(config_file)
3075 panic("%s does not appear to be a config file." % (args[0]))
3076 sys.exit(1) # make sure to die here, even in debug mode.
3078 CONFIG_FILE = args[0]
3079 lustreDB = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
# Default --config to the config file's basename, stripping any .xml suffix.
3080 if not config.config:
3081 config.config = os.path.basename(args[0])# use full path?
3082 if config.config[-4:] == '.xml':
3083 config.config = config.config[:-4]
3084 elif config.ldapurl:
3085 if not config.config:
3086 panic("--ldapurl requires --config name")
3087 dn = "config=%s,fs=lustre" % (config.config)
3088 lustreDB = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
3089 elif config.ptldebug or config.subsystem:
# No config source, but debug flags were given: just apply them.
3090 sys_set_ptldebug(None)
3091 sys_set_subsystem(None)
3094 print 'Missing config file or ldap URL.'
3095 print 'see lconf --help for command summary'
3098 toplustreDB = lustreDB
# Refuse to run against config data produced by a different lconf version.
3100 ver = lustreDB.get_version()
3102 panic("No version found in config data, please recreate.")
3103 if ver != Lustre.CONFIG_VERSION:
3104 panic("Config version", ver, "does not match lconf version",
3105 Lustre.CONFIG_VERSION)
# Build the list of node names this host may match in the config.
3109 node_list.append(config.node)
3112 node_list.append(host)
3113 node_list.append('localhost')
3115 debug("configuring for host: ", node_list)
# Per-host suffixes keep debug/gdb output apart on shared filesystems.
3118 config.debug_path = config.debug_path + '-' + host
3119 config.gdb_script = config.gdb_script + '-' + host
3121 lctl = LCTLInterface('lctl')
3123 if config.lctl_dump:
3124 lctl.use_save_file(config.lctl_dump)
# --record requires both the device and the log name to write config to.
3127 if not (config.record_device and config.record_log):
3128 panic("When recording, both --record_log and --record_device must be specified.")
3129 lctl.clear_log(config.record_device, config.record_log)
3130 lctl.record(config.record_device, config.record_log)
3132 doHost(lustreDB, node_list)
# Script entry point: run main() and translate known error types into a
# nonzero exit status.  NOTE(review): lines 3138-3139, 3141, 3143 and
# 3145-3147 (the try header and handler bodies) are missing from this
# line-sampled listing.
3137 if __name__ == "__main__":
3140 except Lustre.LconfError, e:
3142 # traceback.print_exc(file=sys.stdout)
3144 except CommandError, e:
# Propagate the first error seen during cleanup as the process exit code.
3148 if first_cleanup_error:
3149 sys.exit(first_cleanup_error)