3 # Copyright (C) 2002-2003 Cluster File Systems, Inc.
4 # Authors: Robert Read <rread@clusterfs.com>
5 # Mike Shaver <shaver@clusterfs.com>
6 # This file is part of Lustre, http://www.lustre.org.
8 # Lustre is free software; you can redistribute it and/or
9 # modify it under the terms of version 2 of the GNU General Public
10 # License as published by the Free Software Foundation.
12 # Lustre is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Lustre; if not, write to the Free Software
19 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 # lconf - lustre configuration tool
23 # lconf is the main driver script for starting and stopping
24 # lustre filesystem services.
26 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import sys, getopt, types
29 import string, os, stat, popen2, socket, time, random, fcntl, select
30 import re, exceptions, signal, traceback
31 import xml.dom.minidom
33 if sys.version[0] == '1':
34 from FCNTL import F_GETFL, F_SETFL
36 from fcntl import F_GETFL, F_SETFL
38 PYMOD_DIR = "/usr/lib/lustre/python"
40 def development_mode():
41 base = os.path.dirname(sys.argv[0])
42 if os.access(base+"/Makefile", os.R_OK):
46 if development_mode():
47 sys.path.append('../utils')
49 sys.path.append(PYMOD_DIR)
55 DEFAULT_TCPBUF = 8388608
58 # Maximum number of devices to search for.
59 # (the /dev/loop* nodes need to be created beforehand)
60 MAX_LOOP_DEVICES = 256
61 PORTALS_DIR = 'portals'
63 # Needed to call lconf --record
66 # Please keep these in sync with the values in portals/kp30.h
78 "warning" : (1 << 10),
82 "portals" : (1 << 14),
84 "dlmtrace" : (1 << 16),
88 "rpctrace" : (1 << 20),
89 "vfstrace" : (1 << 21),
94 "undefined" : (1 << 0),
104 "portals" : (1 << 10),
105 "socknal" : (1 << 11),
106 "qswnal" : (1 << 12),
107 "pinger" : (1 << 13),
108 "filter" : (1 << 14),
114 "ptlrouter" : (1 << 20),
# Remember the first non-zero cleanup failure so the final exit status
# can report it even when later cleanup steps also fail.
first_cleanup_error = 0

def cleanup_error(rc):
    """Record *rc* as the overall cleanup status, keeping only the first error."""
    global first_cleanup_error
    if first_cleanup_error == 0:
        first_cleanup_error = rc
126 # ============================================================
127 # debugging and error funcs
129 def fixme(msg = "this feature"):
130 raise Lustre.LconfError, msg + ' not implmemented yet.'
133 msg = string.join(map(str,args))
134 if not config.noexec:
135 raise Lustre.LconfError(msg)
140 msg = string.join(map(str,args))
145 print string.strip(s)
149 msg = string.join(map(str,args))
152 # ack, python's builtin int() does not support '0x123' syntax.
153 # eval can do it, although what a hack!
157 return eval(s, {}, {})
160 except SyntaxError, e:
161 raise ValueError("not a number")
163 raise ValueError("not a number")
165 # ============================================================
166 # locally defined exceptions
167 class CommandError (exceptions.Exception):
168 def __init__(self, cmd_name, cmd_err, rc=None):
169 self.cmd_name = cmd_name
170 self.cmd_err = cmd_err
175 if type(self.cmd_err) == types.StringType:
177 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
179 print "! %s: %s" % (self.cmd_name, self.cmd_err)
180 elif type(self.cmd_err) == types.ListType:
182 print "! %s (error %d):" % (self.cmd_name, self.rc)
184 print "! %s:" % (self.cmd_name)
185 for s in self.cmd_err:
186 print "> %s" %(string.strip(s))
191 # ============================================================
192 # handle daemons, like the acceptor
194 """ Manage starting and stopping a daemon. Assumes daemon manages
195 it's own pid file. """
197 def __init__(self, cmd):
203 log(self.command, "already running.")
205 self.path = find_prog(self.command)
207 panic(self.command, "not found.")
208 ret, out = runcmd(self.path +' '+ self.command_line())
210 raise CommandError(self.path, out, ret)
214 pid = self.read_pidfile()
216 log ("killing process", pid)
218 #time.sleep(1) # let daemon die
220 log("unable to kill", self.command, e)
222 log("unable to kill", self.command)
225 pid = self.read_pidfile()
235 def read_pidfile(self):
237 fp = open(self.pidfile(), 'r')
244 def clean_pidfile(self):
245 """ Remove a stale pidfile """
246 log("removing stale pidfile:", self.pidfile())
248 os.unlink(self.pidfile())
250 log(self.pidfile(), e)
252 class AcceptorHandler(DaemonHandler):
253 def __init__(self, port, net_type, send_mem, recv_mem, irq_aff):
254 DaemonHandler.__init__(self, "acceptor")
257 self.send_mem = send_mem
258 self.recv_mem = recv_mem
261 self.flags = self.flags + ' -i'
264 return "/var/run/%s-%d.pid" % (self.command, self.port)
def command_line(self):
    """Build the acceptor's argument string: send/recv buffer sizes, flags, port."""
    args = ('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port)
    return ' '.join([str(a) for a in args])
271 # start the acceptors
273 if config.lctl_dump or config.record:
275 for port in acceptors.keys():
276 daemon = acceptors[port]
277 if not daemon.running():
280 def run_one_acceptor(port):
281 if config.lctl_dump or config.record:
283 if acceptors.has_key(port):
284 daemon = acceptors[port]
285 if not daemon.running():
288 panic("run_one_acceptor: No acceptor defined for port:", port)
290 def stop_acceptor(port):
291 if acceptors.has_key(port):
292 daemon = acceptors[port]
297 # ============================================================
298 # handle lctl interface
301 Manage communication with lctl
304 def __init__(self, cmd):
306 Initialize close by finding the lctl binary.
308 self.lctl = find_prog(cmd)
310 self.record_device = ''
313 debug('! lctl not found')
316 raise CommandError('lctl', "unable to find lctl binary.")
def use_save_file(self, file):
    """Remember *file* as the dump target for subsequent lctl commands."""
    self.save_file = file
def record(self, dev_name, logname):
    """Start recording lctl commands into config log *logname* on *dev_name*."""
    log("Recording log", logname, "on", dev_name)
    self.record_log = logname
    self.record_device = dev_name
def end_record(self):
    """Stop recording: announce the log being closed and clear recording state."""
    log("End recording log", self.record_log, "on", self.record_device)
    self.record_log = None
    self.record_device = None
def set_nonblock(self, fd):
    """Switch *fd* to non-blocking I/O (used on the pipes from the lctl child)."""
    flags = fcntl.fcntl(fd, F_GETFL)
    fcntl.fcntl(fd, F_SETFL, flags | os.O_NDELAY)
338 the cmds are written to stdin of lctl
339 lctl doesn't return errors when run in script mode, so
341 should modify command line to accept multiple commands, or
342 create complex command line options
346 cmds = '\n dump ' + self.save_file + '\n' + cmds
347 elif self.record_device:
351 %s""" % (self.record_device, self.record_log, cmds)
353 debug("+", cmd_line, cmds)
354 if config.noexec: return (0, [])
356 child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
357 child.tochild.write(cmds + "\n")
358 child.tochild.close()
359 # print "LCTL:", cmds
361 # From "Python Cookbook" from O'Reilly
362 outfile = child.fromchild
363 outfd = outfile.fileno()
364 self.set_nonblock(outfd)
365 errfile = child.childerr
366 errfd = errfile.fileno()
367 self.set_nonblock(errfd)
369 outdata = errdata = ''
372 ready = select.select([outfd,errfd],[],[]) # Wait for input
373 if outfd in ready[0]:
374 outchunk = outfile.read()
375 if outchunk == '': outeof = 1
376 outdata = outdata + outchunk
377 if errfd in ready[0]:
378 errchunk = errfile.read()
379 if errchunk == '': erreof = 1
380 errdata = errdata + errchunk
381 if outeof and erreof: break
382 # end of "borrowed" code
385 if os.WIFEXITED(ret):
386 rc = os.WEXITSTATUS(ret)
389 if rc or len(errdata):
390 raise CommandError(self.lctl, errdata, rc)
393 def runcmd(self, *args):
395 run lctl using the command line
397 cmd = string.join(map(str,args))
398 debug("+", self.lctl, cmd)
399 rc, out = run(self.lctl, cmd)
401 raise CommandError(self.lctl, out, rc)
405 def clear_log(self, dev, log):
406 """ clear an existing log """
411 quit """ % (dev, log)
414 def network(self, net, nid):
419 quit """ % (net, nid)
422 # create a new connection
423 def add_uuid(self, net_type, uuid, nid):
424 cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type)
427 def add_autoconn(self, net_type, send_mem, recv_mem, nid, hostaddr,
429 if net_type in ('tcp',) and not config.lctl_dump:
434 add_autoconn %s %s %d %s
438 nid, hostaddr, port, flags )
441 def connect(self, srv):
442 self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
443 if srv.net_type in ('tcp',) and not config.lctl_dump:
447 self.add_autoconn(srv.net_type, srv.send_mem, srv.recv_mem,
448 srv.nid, srv.hostaddr, srv.port, flags)
451 def recover(self, dev_name, new_conn):
454 recover %s""" %(dev_name, new_conn)
457 # add a route to a range
458 def add_route(self, net, gw, lo, hi):
466 except CommandError, e:
470 def del_route(self, net, gw, lo, hi):
475 quit """ % (net, gw, lo, hi)
478 # add a route to a host
479 def add_route_host(self, net, uuid, gw, tgt):
480 self.add_uuid(net, uuid, tgt)
488 except CommandError, e:
492 # add a route to a range
493 def del_route_host(self, net, uuid, gw, tgt):
499 quit """ % (net, gw, tgt)
503 def del_autoconn(self, net_type, nid, hostaddr):
504 if net_type in ('tcp',) and not config.lctl_dump:
513 # disconnect one connection
514 def disconnect(self, srv):
515 self.del_uuid(srv.nid_uuid)
516 if srv.net_type in ('tcp',) and not config.lctl_dump:
517 self.del_autoconn(srv.net_type, srv.nid, srv.hostaddr)
519 def del_uuid(self, uuid):
527 def disconnectAll(self, net):
535 def attach(self, type, name, uuid):
538 quit""" % (type, name, uuid)
541 def setup(self, name, setup = ""):
545 quit""" % (name, setup)
549 # create a new device with lctl
550 def newdev(self, type, name, uuid, setup = ""):
551 self.attach(type, name, uuid);
553 self.setup(name, setup)
554 except CommandError, e:
555 self.cleanup(name, uuid, 0)
560 def cleanup(self, name, uuid, force, failover = 0):
561 if failover: force = 1
567 quit""" % (name, ('', 'force')[force],
568 ('', 'failover')[failover])
572 def lov_setup(self, name, uuid, desc_uuid, mdsuuid, stripe_cnt,
573 stripe_sz, stripe_off,
577 lov_setup %s %d %d %d %s %s
578 quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off,
582 def lmv_setup(self, name, uuid, desc_uuid, devlist):
586 quit""" % (name, uuid, desc_uuid, devlist)
589 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off,
593 lov_setconfig %s %d %d %d %s %s
594 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
598 def dump(self, dump_file):
601 quit""" % (dump_file)
604 # get list of devices
605 def device_list(self):
606 devices = '/proc/fs/lustre/devices'
608 if os.access(devices, os.R_OK):
610 fp = open(devices, 'r')
618 def lustre_version(self):
619 rc, out = self.runcmd('version')
623 def mount_option(self, profile, osc, mdc):
625 mount_option %s %s %s
626 quit""" % (profile, osc, mdc)
629 # delete mount options
630 def del_mount_option(self, profile):
636 def set_timeout(self, timeout):
642 # delete mount options
643 def set_lustre_upcall(self, upcall):
648 # ============================================================
649 # Various system-level functions
650 # (ideally moved to their own module)
652 # Run a command and return the output and status.
653 # stderr is sent to /dev/null, could use popen3 to
654 # save it if necessary
657 if config.noexec: return (0, [])
658 f = os.popen(cmd + ' 2>&1')
668 cmd = string.join(map(str,args))
671 # Run a command in the background.
672 def run_daemon(*args):
673 cmd = string.join(map(str,args))
675 if config.noexec: return 0
676 f = os.popen(cmd + ' 2>&1')
684 # Determine full path to use for an external command
685 # searches dirname(argv[0]) first, then PATH
687 syspath = string.split(os.environ['PATH'], ':')
688 cmdpath = os.path.dirname(sys.argv[0])
689 syspath.insert(0, cmdpath);
691 syspath.insert(0, os.path.join(config.portals, 'utils/'))
693 prog = os.path.join(d,cmd)
694 if os.access(prog, os.X_OK):
698 # Recursively look for file starting at base dir
699 def do_find_file(base, mod):
700 fullname = os.path.join(base, mod)
701 if os.access(fullname, os.R_OK):
703 for d in os.listdir(base):
704 dir = os.path.join(base,d)
705 if os.path.isdir(dir):
706 module = do_find_file(dir, mod)
710 def find_module(src_dir, dev_dir, modname):
711 modbase = src_dir +'/'+ dev_dir +'/'+ modname
712 for modext in '.ko', '.o':
713 module = modbase + modext
715 if os.access(module, os.R_OK):
721 # is the path a block device?
728 return stat.S_ISBLK(s[stat.ST_MODE])
730 # build fs according to type
732 def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
738 panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
740 # devsize is in 1k, and fs block count is in 4k
741 block_cnt = devsize/4
743 if fstype in ('ext3', 'extN', 'ldiskfs'):
744 # ext3 journal size is in megabytes
747 if not is_block(dev):
748 ret, out = runcmd("ls -l %s" %dev)
749 devsize = int(string.split(out[0])[4]) / 1024
751 ret, out = runcmd("sfdisk -s %s" %dev)
752 devsize = int(out[0])
753 if devsize > 1024 * 1024:
754 jsize = ((devsize / 102400) * 4)
757 if jsize: jopt = "-J size=%d" %(jsize,)
758 if isize: iopt = "-I %d" %(isize,)
759 mkfs = 'mkfs.ext2 -j -b 4096 '
760 if not isblock or config.force:
762 elif fstype == 'reiserfs':
763 # reiserfs journal size is in blocks
764 if jsize: jopt = "--journal_size %d" %(jsize,)
765 mkfs = 'mkreiserfs -ff'
767 panic('unsupported fs type: ', fstype)
769 if config.mkfsoptions != None:
770 mkfs = mkfs + ' ' + config.mkfsoptions
771 if mkfsoptions != None:
772 mkfs = mkfs + ' ' + mkfsoptions
773 (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt)
775 panic("Unable to build fs:", dev, string.join(out))
776 # enable hash tree indexing on fsswe
777 if fstype in ('ext3', 'extN', 'ldiskfs'):
778 htree = 'echo "feature FEATURE_C5" | debugfs -w'
779 (ret, out) = run (htree, dev)
781 panic("Unable to enable htree:", dev)
783 # some systems use /dev/loopN, some /dev/loop/N
787 if not os.access(loop + str(0), os.R_OK):
789 if not os.access(loop + str(0), os.R_OK):
790 panic ("can't access loop devices")
793 # find loop device assigned to the file
794 def find_assigned_loop(file):
796 for n in xrange(0, MAX_LOOP_DEVICES):
798 if os.access(dev, os.R_OK):
799 (stat, out) = run('losetup', dev)
800 if out and stat == 0:
801 m = re.search(r'\((.*)\)', out[0])
802 if m and file == m.group(1):
808 # create file if necessary and assign the first free loop device
809 def init_loop(file, size, fstype, journal_size, inode_size,
810 mkfsoptions, reformat, autoformat, backfstype, backfile):
813 realfstype = backfstype
814 if is_block(backfile):
815 if reformat or (need_format(realfstype, backfile) and autoformat == 'yes'):
816 mkfs(realfile, size, realfstype, journal_size, inode_size, mkfsoptions, isblock=0)
822 dev = find_assigned_loop(realfile)
824 print 'WARNING file:', realfile, 'already mapped to', dev
827 if reformat or not os.access(realfile, os.R_OK | os.W_OK):
829 panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (realfile, size))
830 (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, realfile))
832 panic("Unable to create backing store:", realfile)
834 mkfs(realfile, size, realfstype, journal_size, inode_size,
835 mkfsoptions, isblock=0)
838 # find next free loop
839 for n in xrange(0, MAX_LOOP_DEVICES):
841 if os.access(dev, os.R_OK):
842 (stat, out) = run('losetup', dev)
844 run('losetup', dev, realfile)
847 print "out of loop devices"
849 print "out of loop devices"
852 # undo loop assignment
853 def clean_loop(file):
854 dev = find_assigned_loop(file)
856 ret, out = run('losetup -d', dev)
858 log('unable to clean loop device:', dev, 'for file:', file)
861 # determine if dev is formatted as a <fstype> filesystem
862 def need_format(fstype, dev):
863 # FIXME don't know how to implement this
866 # initialize a block device if needed
867 def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
868 inode_size, mkfsoptions, backfstype, backdev):
872 if fstype == 'smfs' or not is_block(dev):
873 dev = init_loop(dev, size, fstype, journal_size, inode_size,
874 mkfsoptions, reformat, autoformat, backfstype, backdev)
875 elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
876 mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
879 # panic("device:", dev,
880 # "not prepared, and autoformat is not set.\n",
881 # "Rerun with --reformat option to format ALL filesystems")
886 """lookup IP address for an interface"""
887 rc, out = run("/sbin/ifconfig", iface)
890 addr = string.split(out[1])[1]
891 ip = string.split(addr, ':')[1]
def def_mount_options(fstype, target):
    """Return the default mount-option string for *fstype* on *target*.

    target is the service type, 'mds' or 'ost'.  ext3/ldiskfs devices are
    remounted read-only on errors; OSTs on 2.4-series kernels additionally
    get 'asyncdel'.  Other fstypes are not handled by this branch.
    """
    if fstype == 'ext3' or fstype == 'ldiskfs':
        mountfsoptions = "errors=remount-ro"
        if target == 'ost' and sys_get_branch() == '2.4':
            mountfsoptions = "%s,asyncdel" % (mountfsoptions)
        return mountfsoptions
903 def sys_get_elan_position_file():
904 procfiles = ["/proc/elan/device0/position",
905 "/proc/qsnet/elan4/device0/position",
906 "/proc/qsnet/elan3/device0/position"]
908 if os.access(p, os.R_OK):
912 def sys_get_local_nid(net_type, wildcard, cluster_id):
913 """Return the local nid."""
915 if sys_get_elan_position_file():
916 local = sys_get_local_address('elan', '*', cluster_id)
918 local = sys_get_local_address(net_type, wildcard, cluster_id)
921 def sys_get_local_address(net_type, wildcard, cluster_id):
922 """Return the local address for the network type."""
924 if net_type in ('tcp',):
926 iface, star = string.split(wildcard, ':')
927 local = if2addr(iface)
929 panic ("unable to determine ip for:", wildcard)
931 host = socket.gethostname()
932 local = socket.gethostbyname(host)
933 elif net_type == 'elan':
934 # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()'
935 f = sys_get_elan_position_file()
937 panic ("unable to determine local Elan ID")
940 lines = fp.readlines()
948 nid = my_int(cluster_id) + my_int(elan_id)
950 except ValueError, e:
954 elif net_type == 'gm':
955 fixme("automatic local address for GM")
959 def sys_get_branch():
960 """Returns kernel release"""
962 fp = open('/proc/sys/kernel/osrelease')
963 lines = fp.readlines()
967 version = string.split(l)
968 a = string.split(version[0], '.')
969 return a[0] + '.' + a[1]
975 def mod_loaded(modname):
976 """Check if a module is already loaded. Look in /proc/modules for it."""
978 fp = open('/proc/modules')
979 lines = fp.readlines()
981 # please forgive my tired fingers for this one
982 ret = filter(lambda word, mod=modname: word == mod,
983 map(lambda line: string.split(line)[0], lines))
988 # XXX: instead of device_list, ask for $name and see what we get
989 def is_prepared(name):
990 """Return true if a device exists for the name"""
993 if (config.noexec or config.record) and config.cleanup:
996 # expect this format:
997 # 1 UP ldlm ldlm ldlm_UUID 2
998 out = lctl.device_list()
1000 if name == string.split(s)[3]:
1002 except CommandError, e:
1006 def is_network_prepared():
1007 """If the any device exists, then assume that all networking
1008 has been configured"""
1009 out = lctl.device_list()
1012 def fs_is_mounted(path):
1013 """Return true if path is a mounted lustre filesystem"""
1015 fp = open('/proc/mounts')
1016 lines = fp.readlines()
1020 if a[1] == path and a[2] == 'lustre_lite':
1028 """Manage kernel modules"""
def __init__(self, lustre_dir, portals_dir):
    """Remember the module source roots; start with an empty load list."""
    self.portals_dir = portals_dir
    self.lustre_dir = lustre_dir
    self.kmodule_list = []
def add_portals_module(self, dev_dir, modname):
    """Queue a portals-tree module as (src_dir, dev_dir, modname) for loading."""
    entry = (self.portals_dir, dev_dir, modname)
    self.kmodule_list.append(entry)
def add_lustre_module(self, dev_dir, modname):
    """Queue a lustre-tree module as (src_dir, dev_dir, modname) for loading."""
    entry = (self.lustre_dir, dev_dir, modname)
    self.kmodule_list.append(entry)
1042 def load_module(self):
1043 """Load all the modules in the list in the order they appear."""
1044 for src_dir, dev_dir, mod in self.kmodule_list:
1045 if mod_loaded(mod) and not config.noexec:
1047 log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
1049 module = find_module(src_dir, dev_dir, mod)
1051 panic('module not found:', mod)
1052 (rc, out) = run('/sbin/insmod', module)
1054 raise CommandError('insmod', out, rc)
1056 (rc, out) = run('/sbin/modprobe', mod)
1058 raise CommandError('modprobe', out, rc)
1060 def cleanup_module(self):
1061 """Unload the modules in the list in reverse order."""
1062 rev = self.kmodule_list
1064 for src_dir, dev_dir, mod in rev:
1065 if not mod_loaded(mod) and not config.noexec:
1068 if mod == 'portals' and config.dump:
1069 lctl.dump(config.dump)
1070 log('unloading module:', mod)
1071 (rc, out) = run('/sbin/rmmod', mod)
1073 log('! unable to unload module:', mod)
1076 # ============================================================
1077 # Classes to prepare and cleanup the various objects
1080 """ Base class for the rest of the modules. The default cleanup method is
1081 defined here, as well as some utilitiy funcs.
1083 def __init__(self, module_name, db):
1085 self.module_name = module_name
1086 self.name = self.db.getName()
1087 self.uuid = self.db.getUUID()
1090 self.kmod = kmod(config.lustre, config.portals)
1092 def info(self, *args):
1093 msg = string.join(map(str,args))
1094 print self.module_name + ":", self.name, self.uuid, msg
1097 """ default cleanup, used for most modules """
1100 lctl.cleanup(self.name, self.uuid, config.force)
1101 except CommandError, e:
1102 log(self.module_name, "cleanup failed: ", self.name)
def add_portals_module(self, dev_dir, modname):
    """Delegate portals-module registration to this service's kmod manager."""
    self.kmod.add_portals_module(dev_dir, modname)
def add_lustre_module(self, dev_dir, modname):
    """Delegate lustre-module registration to this service's kmod manager."""
    self.kmod.add_lustre_module(dev_dir, modname)
def load_module(self):
    """Load every kernel module queued for this service, in queue order."""
    self.kmod.load_module()
def cleanup_module(self):
    """Unload this service's kernel modules (reverse order), unless unsafe."""
    if not self.safe_to_clean():
        return
    self.kmod.cleanup_module()
1123 def safe_to_clean(self):
def safe_to_clean_modules(self):
    """Module unload safety defaults to the device-cleanup check."""
    return self.safe_to_clean()
1129 class Network(Module):
1130 def __init__(self,db):
1131 Module.__init__(self, 'NETWORK', db)
1132 self.net_type = self.db.get_val('nettype')
1133 self.nid = self.db.get_val('nid', '*')
1134 self.cluster_id = self.db.get_val('clusterid', "0")
1135 self.port = self.db.get_val_int('port', 0)
1136 self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
1137 self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
1138 self.irq_affinity = self.db.get_val_int('irqaffinity', 0)
1141 self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
1143 panic("unable to set nid for", self.net_type, self.nid, cluster_id)
1144 self.generic_nid = 1
1145 debug("nid:", self.nid)
1147 self.generic_nid = 0
1149 self.nid_uuid = self.nid_to_uuid(self.nid)
1151 self.hostaddr = self.db.get_val('hostaddr', self.nid)
1152 if '*' in self.hostaddr:
1153 self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id)
1154 if not self.hostaddr:
1155 panic("unable to set hostaddr for", self.net_type, self.hostaddr, self.cluster_id)
1156 debug("hostaddr:", self.hostaddr)
1158 self.add_portals_module("libcfs", 'libcfs')
1159 self.add_portals_module("portals", 'portals')
1160 if node_needs_router():
1161 self.add_portals_module("router", 'kptlrouter')
1162 if self.net_type == 'tcp':
1163 self.add_portals_module("knals/socknal", 'ksocknal')
1164 if self.net_type == 'elan':
1165 self.add_portals_module("knals/qswnal", 'kqswnal')
1166 if self.net_type == 'gm':
1167 self.add_portals_module("knals/gmnal", 'kgmnal')
def nid_to_uuid(self, nid):
    """Derive the canonical UUID name used for a peer NID."""
    return 'NID_' + str(nid) + '_UUID'
1173 if is_network_prepared():
1175 self.info(self.net_type, self.nid, self.port)
1176 if not (config.record and self.generic_nid):
1177 lctl.network(self.net_type, self.nid)
1178 if self.net_type == 'tcp':
1180 if self.net_type == 'elan':
1182 if self.port and node_is_router():
1183 run_one_acceptor(self.port)
1184 self.connect_peer_gateways()
1186 def connect_peer_gateways(self):
1187 for router in self.db.lookup_class('node'):
1188 if router.get_val_int('router', 0):
1189 for netuuid in router.get_networks():
1190 net = self.db.lookup(netuuid)
1192 if (gw.cluster_id == self.cluster_id and
1193 gw.net_type == self.net_type):
1194 if gw.nid != self.nid:
1197 def disconnect_peer_gateways(self):
1198 for router in self.db.lookup_class('node'):
1199 if router.get_val_int('router', 0):
1200 for netuuid in router.get_networks():
1201 net = self.db.lookup(netuuid)
1203 if (gw.cluster_id == self.cluster_id and
1204 gw.net_type == self.net_type):
1205 if gw.nid != self.nid:
1208 except CommandError, e:
1209 print "disconnect failed: ", self.name
1213 def safe_to_clean(self):
1214 return not is_network_prepared()
1217 self.info(self.net_type, self.nid, self.port)
1219 stop_acceptor(self.port)
1220 if node_is_router():
1221 self.disconnect_peer_gateways()
1223 def correct_level(self, level, op=None):
1226 class RouteTable(Module):
1227 def __init__(self,db):
1228 Module.__init__(self, 'ROUTES', db)
1230 def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
1232 # only setup connections for tcp NALs
1234 if not net_type in ('tcp',):
1237 # connect to target if route is to single node and this node is the gw
1238 if lo == hi and local_interface(net_type, gw_cluster_id, gw):
1239 if not local_cluster(net_type, tgt_cluster_id):
1240 panic("target", lo, " not on the local cluster")
1241 srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
1242 # connect to gateway if this node is not the gw
1243 elif (local_cluster(net_type, gw_cluster_id)
1244 and not local_interface(net_type, gw_cluster_id, gw)):
1245 srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
1250 panic("no server for nid", lo)
1253 return Network(srvdb)
1256 if is_network_prepared():
1259 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1260 lctl.add_route(net_type, gw, lo, hi)
1261 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1265 def safe_to_clean(self):
1266 return not is_network_prepared()
1269 if is_network_prepared():
1270 # the network is still being used, don't clean it up
1272 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1273 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1276 lctl.disconnect(srv)
1277 except CommandError, e:
1278 print "disconnect failed: ", self.name
1283 lctl.del_route(net_type, gw, lo, hi)
1284 except CommandError, e:
1285 print "del_route failed: ", self.name
1289 class Management(Module):
1290 def __init__(self, db):
1291 Module.__init__(self, 'MGMT', db)
1292 self.add_lustre_module('lvfs', 'lvfs')
1293 self.add_lustre_module('obdclass', 'obdclass')
1294 self.add_lustre_module('ptlrpc', 'ptlrpc')
1295 self.add_lustre_module('mgmt', 'mgmt_svc')
1298 if is_prepared(self.name):
1301 lctl.newdev("mgmt", self.name, self.uuid)
1303 def safe_to_clean(self):
1307 if is_prepared(self.name):
1308 Module.cleanup(self)
1310 def correct_level(self, level, op=None):
1313 # This is only needed to load the modules; the LDLM device
1314 # is now created automatically.
1316 def __init__(self,db):
1317 Module.__init__(self, 'LDLM', db)
1318 self.add_lustre_module('lvfs', 'lvfs')
1319 self.add_lustre_module('obdclass', 'obdclass')
1320 self.add_lustre_module('ptlrpc', 'ptlrpc')
1328 def correct_level(self, level, op=None):
1333 def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
1334 Module.__init__(self, 'LOV', db)
1335 if name_override != None:
1336 self.name = "lov_%s" % name_override
1337 self.add_lustre_module('lov', 'lov')
1338 self.mds_uuid = self.db.get_first_ref('mds')
1339 self.stripe_sz = self.db.get_val_int('stripesize', 1048576)
1340 self.stripe_off = self.db.get_val_int('stripeoffset', 0)
1341 self.pattern = self.db.get_val_int('stripepattern', 0)
1342 self.devlist = self.db.get_refs('obd')
1343 self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
1345 self.desc_uuid = self.uuid
1346 self.uuid = generate_client_uuid(self.name)
1347 self.fs_name = fs_name
1349 self.config_only = 1
1351 self.config_only = None
1352 mds= self.db.lookup(self.mds_uuid)
1353 self.mds_name = mds.getName()
1354 for obd_uuid in self.devlist:
1355 obd = self.db.lookup(obd_uuid)
1356 osc = get_osc(obd, self.uuid, fs_name)
1358 self.osclist.append(osc)
1360 panic('osc not found:', obd_uuid)
1366 if is_prepared(self.name):
1368 if self.config_only:
1369 panic("Can't prepare config_only LOV ", self.name)
1371 for osc in self.osclist:
1373 # Only ignore connect failures with --force, which
1374 # isn't implemented here yet.
1375 osc.prepare(ignore_connect_failure=0)
1376 except CommandError, e:
1377 print "Error preparing OSC %s\n" % osc.uuid
1379 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
1380 self.stripe_off, self.pattern, self.devlist, self.mds_name)
1381 lctl.lov_setup(self.name, self.uuid,
1382 self.desc_uuid, self.mds_name, self.stripe_cnt,
1383 self.stripe_sz, self.stripe_off, self.pattern,
1384 string.join(self.devlist))
1387 if is_prepared(self.name):
1388 Module.cleanup(self)
1389 if self.config_only:
1390 panic("Can't clean up config_only LOV ", self.name)
1391 for osc in self.osclist:
1394 def load_module(self):
1395 if self.config_only:
1396 panic("Can't load modules for config_only LOV ", self.name)
1397 for osc in self.osclist:
1400 Module.load_module(self)
1402 def cleanup_module(self):
1403 if self.config_only:
1404 panic("Can't cleanup modules for config_only LOV ", self.name)
1405 Module.cleanup_module(self)
1406 for osc in self.osclist:
1407 osc.cleanup_module()
1410 def correct_level(self, level, op=None):
1414 def __init__(self, db, uuid, fs_name, name_override = None):
1415 Module.__init__(self, 'LMV', db)
1416 if name_override != None:
1417 self.name = "lmv_%s" % name_override
1418 self.add_lustre_module('lmv', 'lmv')
1419 self.devlist = self.db.get_refs('mds')
1421 self.desc_uuid = self.uuid
1423 self.fs_name = fs_name
1424 for mds_uuid in self.devlist:
1425 mds = self.db.lookup(mds_uuid)
1427 panic("MDS not found!")
1428 mdc = MDC(mds, self.uuid, fs_name)
1430 self.mdclist.append(mdc)
1432 panic('mdc not found:', mds_uuid)
1435 if is_prepared(self.name):
1437 for mdc in self.mdclist:
1439 # Only ignore connect failures with --force, which
1440 # isn't implemented here yet.
1441 mdc.prepare(ignore_connect_failure=0)
1442 except CommandError, e:
1443 print "Error preparing LMV %s\n" % mdc.uuid
1445 lctl.lmv_setup(self.name, self.uuid, self.desc_uuid,
1446 string.join(self.devlist))
1449 for mdc in self.mdclist:
1451 if is_prepared(self.name):
1452 Module.cleanup(self)
1454 def load_module(self):
1455 for mdc in self.mdclist:
1458 Module.load_module(self)
1460 def cleanup_module(self):
1461 Module.cleanup_module(self)
1462 for mds in self.mdclist:
1463 mdc.cleanup_module()
1466 def correct_level(self, level, op=None):
class MDSDEV(Module):
    """MDS device: formats/mounts the MDS backing filesystem and configures
    it via lctl; write_conf() additionally records client config logs.

    NOTE(review): this excerpt is line-sampled — many guard/else/try lines
    are missing; the indentation below is reconstructed.
    """
    def __init__(self,db):
        Module.__init__(self, 'MDSDEV', db)
        self.devpath = self.db.get_val('devpath','')
        self.backdevpath = self.db.get_val('backdevpath','')
        self.size = self.db.get_val_int('devsize', 0)
        self.journal_size = self.db.get_val_int('journalsize', 0)
        self.fstype = self.db.get_val('fstype', '')
        self.backfstype = self.db.get_val('backfstype', '')
        self.nspath = self.db.get_val('nspath', '')
        self.mkfsoptions = self.db.get_val('mkfsoptions', '')
        self.mountfsoptions = self.db.get_val('mountfsoptions', '')
        self.cachetype = self.db.get_val('cachetype', '')
        # overwrite the original MDSDEV name and uuid with the MDS name and uuid
        target_uuid = self.db.get_first_ref('target')
        mds = self.db.lookup(target_uuid)
        self.name = mds.getName()
        self.filesystem_uuids = mds.get_refs('filesystem')
        self.master_mds = ""
        # No filesystems referenced directly: resolve them through the LMV.
        if not self.filesystem_uuids:
            self.lmv_uuid = self.db.get_first_ref('lmv')
            if not self.lmv_uuid:
                panic("ALERT: can't find lvm uuid")
            self.lmv = self.db.lookup(self.lmv_uuid)
            self.filesystem_uuids = self.lmv.get_refs('filesystem')
            self.master_mds = self.lmv_uuid
        # FIXME: if fstype not set, then determine based on kernel version
        self.format = self.db.get_val('autoformat', "no")
        if mds.get_val('failover', 0):
            self.failover_mds = 'f'
        # NOTE(review): "else:" line missing in excerpt.
        self.failover_mds = 'n'
        active_uuid = get_active_target(mds)
        # NOTE(review): guard (likely "if not active_uuid:") missing here.
        panic("No target device found:", target_uuid)
        # NOTE(review): the bodies of the next two conditionals are missing.
        if active_uuid == self.uuid:
        if self.active and config.group and config.group != mds.get_val('group'):
        self.inode_size = self.db.get_val_int('inodesize', 0)
        if self.inode_size == 0:
            # find the LOV for this MDS
            lovconfig_uuid = mds.get_first_ref('lovconfig')
            # NOTE(review): the if/else structure below lost several lines;
            # statement order preserved as found.
            if not lovconfig_uuid:
                if not self.lmv_uuid:
                    panic("No LOV found for lovconfig ", lovconfig.name)
                panic("No LMV initialized and not lovconfig_uuid found")
                lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
                lovconfig = self.lmv.lookup(lovconfig_uuid)
                lov_uuid = lovconfig.get_first_ref('lov')
                panic("No LOV found for lovconfig ", lovconfig.name)
            lovconfig = mds.lookup(lovconfig_uuid)
            lov_uuid = lovconfig.get_first_ref('lov')
            panic("No LOV found for lovconfig ", lovconfig.name)
            lovconfig_uuid = self.lmv.get_first_ref('lovconfig')
            lovconfig = self.lmv.lookup(lovconfig_uuid)
            lov_uuid = lovconfig.get_first_ref('lov')
            lov = LOV(self.db.lookup(lov_uuid), lov_uuid, 'FS_name', config_only = 1)
            # default stripe count controls default inode_size
            stripe_count = lov.stripe_cnt
            if stripe_count > 77:
                self.inode_size = 4096
            elif stripe_count > 35:
                self.inode_size = 2048
            elif stripe_count > 13:
                self.inode_size = 1024
            elif stripe_count > 3:
                self.inode_size = 512
            # NOTE(review): "else:" line missing in excerpt.
            self.inode_size = 256
        self.target_dev_uuid = self.uuid
        self.uuid = target_uuid
        # NOTE(review): the guard selecting the LMV-master path (and the
        # condition around the first client_uuid assignment) is missing.
        client_uuid = generate_client_uuid(self.name)
        client_uuid = self.name + "_lmv_" + "UUID"
        self.master = LMV(self.db.lookup(self.lmv_uuid), client_uuid, self.name, self.name)
        self.master_mds = self.master.name
        # modules the MDS node needs loaded
        self.add_lustre_module('mdc', 'mdc')
        self.add_lustre_module('osc', 'osc')
        self.add_lustre_module('lov', 'lov')
        self.add_lustre_module('lmv', 'lmv')
        self.add_lustre_module('ost', 'ost')
        self.add_lustre_module('mds', 'mds')
        if self.fstype == 'smfs':
            self.add_lustre_module('smfs', 'smfs')
        if self.fstype == 'ldiskfs':
            self.add_lustre_module('ldiskfs', 'ldiskfs')
        # NOTE(review): likely guarded by "if self.fstype:" in the original.
        self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
        # if fstype is smfs, then we should also take care about backing
        if self.fstype == 'smfs':
            self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.backfstype))

    def load_module(self):
        # NOTE(review): surrounding guard lines missing in excerpt.
        Module.load_module(self)

    # NOTE(review): "def prepare(self):" is missing from the excerpt; the
    # statements below are the visible remainder of MDSDEV.prepare().
    if is_prepared(self.name):
    debug(self.uuid, "not active")
    # run write_conf automatically, if --reformat used
    self.info(self.devpath, self.fstype, self.size, self.format)
    self.master.prepare()
    # never reformat here
    blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
                       self.format, self.journal_size, self.inode_size,
                       self.mkfsoptions, self.backfstype, self.backdevpath)
    if not is_prepared('MDT'):
        lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
    # Merge default, command-line and per-device mount options.
    # NOTE(review): the else branches of this merge logic lost their lines.
    mountfsoptions = def_mount_options(self.fstype, 'mds')
    if config.mountfsoptions:
        mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
        mountfsoptions = config.mountfsoptions
        if self.mountfsoptions:
            mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
    if self.mountfsoptions:
        mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
        mountfsoptions = self.mountfsoptions
    if self.fstype == 'smfs':
        realdev = self.fstype
        # NOTE(review): the continuation lines of the two format calls
        # below are missing in the excerpt.
        mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
        mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
    print 'MDS mount options: ' + mountfsoptions
    # 'dumb' is a sentinel the MDS code recognizes and skips.
    if not self.master_mds:
        self.master_mds = 'dumb'
    if not self.cachetype:
        self.cachetype = 'dumb'
    lctl.newdev("mds", self.name, self.uuid,
                setup ="%s %s %s %s %s %s" %(realdev, self.fstype,
                                             self.name, mountfsoptions,
                                             self.master_mds, self.cachetype))
    # NOTE(review): the matching "try:" line is missing in the excerpt.
    except CommandError, e:
        panic("MDS is missing the config log. Need to run " +
              "lconf --write_conf.")

    def write_conf(self):
        """Set up the MDS device and record client config logs on it."""
        if is_prepared(self.name):
        self.info(self.devpath, self.fstype, self.format)
        blkdev = block_dev(self.devpath, self.size, self.fstype,
                           config.reformat, self.format, self.journal_size,
                           self.inode_size, self.mkfsoptions, self.backfstype,
        # Even for writing logs we mount mds with supplied mount options
        # because it will not mount smfs (if used) otherwise.
        # NOTE(review): same sampled option-merge structure as prepare().
        mountfsoptions = def_mount_options(self.fstype, 'mds')
        if config.mountfsoptions:
            mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
            mountfsoptions = config.mountfsoptions
            if self.mountfsoptions:
                mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
        if self.mountfsoptions:
            mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
            mountfsoptions = self.mountfsoptions
        if self.fstype == 'smfs':
            realdev = self.fstype
            mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
            mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
        print 'MDS mount options: ' + mountfsoptions
        # As mount options are passed by 4th param to config tool, we need
        # to pass something in 3rd param. But we do not want this 3rd param
        # be counted as a profile name for reading log on MDS setup, thus,
        # we pass there some predefined sign like 'dumb', which will be
        # checked in MDS code and skipped. Probably there is more nice way
        # like pass empty string and check it in config tool and pass null
        lctl.newdev("mds", self.name, self.uuid,
                    setup ="%s %s %s %s" %(realdev, self.fstype,
                                           'dumb', mountfsoptions))
        # record logs for the MDS lov
        for uuid in self.filesystem_uuids:
            log("recording clients for filesystem:", uuid)
            fs = self.db.lookup(uuid)
            # this is ugly, should be organized nice later.
            target_uuid = self.db.get_first_ref('target')
            mds = self.db.lookup(target_uuid)
            lovconfig_uuid = mds.get_first_ref('lovconfig')
            lovconfig = mds.lookup(lovconfig_uuid)
            obd_uuid = lovconfig.get_first_ref('lov')
            obd_uuid = fs.get_first_ref('obd')
            client_uuid = generate_client_uuid(self.name)
            client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
            # Record the client-setup log...
            lctl.clear_log(self.name, self.name)
            lctl.record(self.name, self.name)
            lctl.mount_option(self.name, client.get_name(), "")
            # ...and the matching cleanup log.
            lctl.clear_log(self.name, self.name + '-clean')
            lctl.record(self.name, self.name + '-clean')
            lctl.del_mount_option(self.name)
        # record logs for each client
        # NOTE(review): the guard choosing LDAP vs file configuration is
        # missing in this excerpt.
        config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
        config_options = CONFIG_FILE
        for node_db in self.db.lookup_class('node'):
            client_name = node_db.getName()
            for prof_uuid in node_db.get_refs('profile'):
                prof_db = node_db.lookup(prof_uuid)
                # refactor this into a function to test "clientness"
                for ref_class, ref_uuid in prof_db.get_all_refs():
                    if ref_class in ('mountpoint','echoclient'):
                        debug("recording", client_name)
                        old_noexec = config.noexec
                        # re-invoke lconf itself to record this client's log
                        noexec_opt = ('', '-n')
                        ret, out = run (sys.argv[0],
                                        noexec_opt[old_noexec == 1],
                                        " -v --record --nomod",
                                        "--record_log", client_name,
                                        "--record_device", self.name,
                                        "--node", client_name,
                        for s in out: log("record> ", string.strip(s))
                        ret, out = run (sys.argv[0],
                                        noexec_opt[old_noexec == 1],
                                        "--cleanup -v --record --nomod",
                                        "--record_log", client_name + "-clean",
                                        "--record_device", self.name,
                                        "--node", client_name,
                        for s in out: log("record> ", string.strip(s))
                        config.noexec = old_noexec
        # Tear down the MDS device that was set up just for recording.
        lctl.cleanup(self.name, self.uuid, 0, 0)
        except CommandError, e:
            log(self.module_name, "cleanup failed: ", self.name)
        Module.cleanup(self)
        if self.fstype == 'smfs':
            clean_loop(self.backdevpath)
        # NOTE(review): "else:" line missing in excerpt.
        clean_loop(self.devpath)

    def msd_remaining(self):
        # True if any 'mds' devices remain in lctl's device list.
        out = lctl.device_list()
        # NOTE(review): "for s in out:" and return lines missing here.
        if string.split(s)[2] in ('mds',):

    def safe_to_clean(self):
        # NOTE(review): body missing from this excerpt.

    def safe_to_clean_modules(self):
        return not self.msd_remaining()

    # NOTE(review): "def cleanup(self):" line missing; visible remainder of
    # MDSDEV.cleanup() below.
    debug(self.uuid, "not active")
    if is_prepared(self.name):
        lctl.cleanup(self.name, self.uuid, config.force,
        except CommandError, e:
            log(self.module_name, "cleanup failed: ", self.name)
        Module.cleanup(self)
    self.master.cleanup()
    # Remove the shared MDT device once no MDS devices remain.
    if not self.msd_remaining() and is_prepared('MDT'):
        lctl.cleanup("MDT", "MDT_UUID", config.force,
        except CommandError, e:
            print "cleanup failed: ", self.name
    if self.fstype == 'smfs':
        clean_loop(self.backdevpath)
    clean_loop(self.devpath)

    def correct_level(self, level, op=None):
        #if self.master_mds:
def __init__(self, db):
    """OST device: reads target config and queues the needed modules.

    NOTE(review): excerpt is line-sampled; guard/else lines are missing.
    """
    Module.__init__(self, 'OSD', db)
    self.osdtype = self.db.get_val('osdtype')
    self.devpath = self.db.get_val('devpath', '')
    self.backdevpath = self.db.get_val('backdevpath', '')
    self.size = self.db.get_val_int('devsize', 0)
    self.journal_size = self.db.get_val_int('journalsize', 0)
    self.inode_size = self.db.get_val_int('inodesize', 0)
    self.mkfsoptions = self.db.get_val('mkfsoptions', '')
    self.mountfsoptions = self.db.get_val('mountfsoptions', '')
    self.fstype = self.db.get_val('fstype', '')
    self.backfstype = self.db.get_val('backfstype', '')
    self.nspath = self.db.get_val('nspath', '')
    # overwrite the OSD name/uuid with the OST target's
    target_uuid = self.db.get_first_ref('target')
    ost = self.db.lookup(target_uuid)
    self.name = ost.getName()
    self.format = self.db.get_val('autoformat', 'yes')
    if ost.get_val('failover', 0):
        self.failover_ost = 'f'
    # NOTE(review): "else:" line missing in excerpt.
    self.failover_ost = 'n'
    active_uuid = get_active_target(ost)
    # NOTE(review): guard (likely "if not active_uuid:") missing here.
    panic("No target device found:", target_uuid)
    # NOTE(review): the bodies of the next two conditionals are missing.
    if active_uuid == self.uuid:
    if self.active and config.group and config.group != ost.get_val('group'):
    self.target_dev_uuid = self.uuid
    self.uuid = target_uuid
    self.add_lustre_module('ost', 'ost')
    if self.fstype == 'smfs':
        self.add_lustre_module('smfs', 'smfs')
    # FIXME: should we default to ext3 here?
    if self.fstype == 'ldiskfs':
        self.add_lustre_module('ldiskfs', 'ldiskfs')
    # NOTE(review): likely guarded by "if self.fstype:" in the original.
    self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
    if self.fstype == 'smfs':
        self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.backfstype))
    self.add_lustre_module(self.osdtype, self.osdtype)
def load_module(self):
    # NOTE(review): surrounding guard lines missing in excerpt.
    Module.load_module(self)

# NOTE(review): "def prepare(self):" is missing from this excerpt; the
# statements below are the visible remainder of OSD.prepare().
# need to check /proc/mounts and /etc/mtab before
# formatting anything.
# FIXME: check if device is already formatted.
if is_prepared(self.name):
debug(self.uuid, "not active")
self.info(self.osdtype, self.devpath, self.size, self.fstype,
          self.format, self.journal_size, self.inode_size)
# obdecho needs no backing device; otherwise set up the block device.
if self.osdtype == 'obdecho':
blkdev = block_dev(self.devpath, self.size, self.fstype,
                   config.reformat, self.format, self.journal_size,
                   self.inode_size, self.mkfsoptions, self.backfstype,
# NOTE(review): the option-merge below lost its else-branch lines.
mountfsoptions = def_mount_options(self.fstype, 'ost')
if config.mountfsoptions:
    mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
    mountfsoptions = config.mountfsoptions
    if self.mountfsoptions:
        mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
if self.mountfsoptions:
    mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
    mountfsoptions = self.mountfsoptions
if self.fstype == 'smfs':
    realdev = self.fstype
    # NOTE(review): continuation lines of the format calls are missing.
    mountfsoptions = "%s,type=%s,dev=%s" % (mountfsoptions,
    mountfsoptions = "type=%s,dev=%s" % (self.backfstype,
print 'OSD mount options: ' + mountfsoptions
lctl.newdev(self.osdtype, self.name, self.uuid,
            setup ="%s %s %s %s" %(realdev, self.fstype,
if not is_prepared('OSS'):
    lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")

def osd_remaining(self):
    # True if any obdfilter/obdecho devices remain configured.
    out = lctl.device_list()
    # NOTE(review): "for s in out:" and return lines missing here.
    if string.split(s)[2] in ('obdfilter', 'obdecho'):

def safe_to_clean(self):
    # NOTE(review): body missing from this excerpt.

def safe_to_clean_modules(self):
    return not self.osd_remaining()

# NOTE(review): "def cleanup(self):" line missing; visible remainder of
# OSD.cleanup() below (try lines also missing).
debug(self.uuid, "not active")
if is_prepared(self.name):
    lctl.cleanup(self.name, self.uuid, config.force,
    except CommandError, e:
        log(self.module_name, "cleanup failed: ", self.name)
if not self.osd_remaining() and is_prepared('OSS'):
    lctl.cleanup("OSS", "OSS_UUID", config.force,
    except CommandError, e:
        print "cleanup failed: ", self.name
if not self.osdtype == 'obdecho':
    if self.fstype == 'smfs':
        clean_loop(self.backdevpath)
    clean_loop(self.devpath)

def correct_level(self, level, op=None):
    # NOTE(review): body missing from this excerpt.
def mgmt_uuid_for_fs(mtpt_name):
    # Resolve a mountpoint name to its filesystem's management-service UUID.
    # NOTE(review): guard lines are missing in this line-sampled excerpt.
    mtpt_db = toplustreDB.lookup_name(mtpt_name)
    fs_uuid = mtpt_db.get_first_ref('filesystem')
    fs = toplustreDB.lookup(fs_uuid)
    return fs.get_first_ref('mgmt')
# Generic client module, used by OSC and MDC
class Client(Module):
    """Base for client-side devices (OSC, MDC, management client).

    NOTE(review): excerpt is line-sampled; guard/else/try lines are
    missing throughout this class.
    """
    def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
        # NOTE(review): the rest of the signature (presumably
        # "module_dir=None):") is missing from the excerpt.
        self.target_name = tgtdb.getName()
        self.target_uuid = tgtdb.getUUID()
        self.tgt_dev_uuid = get_active_target(tgtdb)
        if not self.tgt_dev_uuid:
            panic("No target device found for target(1):", self.target_name)
        self.kmod = kmod(config.lustre, config.portals)
        self.module = module
        self.module_name = string.upper(module)
        # NOTE(review): "if not self_name:" guard missing around the
        # generated-name assignment below.
        self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
                                     self.target_name, fs_name)
        self.name = self_name
        self.lookup_server(self.tgt_dev_uuid)
        mgmt_uuid = mgmt_uuid_for_fs(fs_name)
        # NOTE(review): "if mgmt_uuid:" guard missing here.
        self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
        self.fs_name = fs_name
        # NOTE(review): the default for module_dir is not visible here.
        self.add_lustre_module(module_dir, module)

    def lookup_server(self, srv_uuid):
        """ Lookup a server's network information """
        self._server_nets = get_ost_net(self.db, srv_uuid)
        if len(self._server_nets) == 0:
            panic ("Unable to find a server for:", srv_uuid)

    def get_servers(self):
        return self._server_nets

    def prepare(self, ignore_connect_failure = 0):
        self.info(self.target_uuid)
        if is_prepared(self.name):
        # Prefer a directly reachable server; otherwise add routes.
        srv = choose_local_server(self.get_servers())
        # NOTE(review): if/else and try lines missing around this section.
        routes = find_route(self.get_servers())
        if len(routes) == 0:
            panic ("no route to", self.target_uuid)
        for (srv, r) in routes:
            lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
        except CommandError, e:
            if not ignore_connect_failure:
        # Targets listed in --inactive start disabled (if permitted).
        if self.target_uuid in config.inactive and self.permits_inactive():
            debug("%s inactive" % self.target_uuid)
            inactive_p = "inactive"
        # NOTE(review): "else:" line missing in excerpt.
        debug("%s active" % self.target_uuid)
        lctl.newdev(self.module, self.name, self.uuid,
                    setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
                                            inactive_p, self.mgmt_name))

    # NOTE(review): "def cleanup(self):" line missing; visible remainder of
    # Client.cleanup() below.
    if is_prepared(self.name):
        Module.cleanup(self)
        srv = choose_local_server(self.get_servers())
        lctl.disconnect(srv)
        for (srv, r) in find_route(self.get_servers()):
            lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
        except CommandError, e:
            log(self.module_name, "cleanup failed: ", self.name)

    def correct_level(self, level, op=None):
        # NOTE(review): body missing from this excerpt.
def __init__(self, db, uuid, fs_name):
    # MDC: metadata client for one MDS.
    # NOTE(review): the "class MDC(Client):" header is missing from this
    # line-sampled excerpt.
    Client.__init__(self, db, uuid, 'mdc', fs_name)

def permits_inactive(self):
    # NOTE(review): body missing from this excerpt.

def __init__(self, db, uuid, fs_name):
    # OSC: object storage client for one OST.
    # NOTE(review): the "class OSC(Client):" header is missing from this
    # line-sampled excerpt.
    Client.__init__(self, db, uuid, 'osc', fs_name)

def permits_inactive(self):
    # NOTE(review): body missing from this excerpt.
def mgmtcli_name_for_uuid(uuid):
    """Derive the management-client device name for a management UUID."""
    return "MGMTCLI_%s" % (uuid,)
class ManagementClient(Client):
    """Client device for the management service.

    Differs from OSC/MDC only in its module ('mgmt_cli', found under the
    'mgmt' module dir) and in deriving its name from the mgmt DB UUID.
    """
    def __init__(self, db, uuid):
        # Name the device deterministically from the management entry's UUID.
        own_name = mgmtcli_name_for_uuid(db.getUUID())
        Client.__init__(self, db, uuid, 'mgmt_cli', '',
                        self_name = own_name,
                        module_dir = 'mgmt')
def __init__(self, db, uuid, name, type, name_override = None):
    """Cache OBD: pairs a 'real' device with a 'cache' device.

    NOTE(review): the "class COBD(Module):" header and the if/else lines
    choosing LOV vs MDC wrappers are missing from this excerpt.
    """
    Module.__init__(self, 'COBD', db)
    self.name = self.db.getName();
    self.uuid = generate_client_uuid(self.name)
    self.real_uuid = self.db.get_first_ref('realobd')
    self.cache_uuid = self.db.get_first_ref('cacheobd')
    self.add_lustre_module('cobd', 'cobd')
    real_obd = self.db.lookup(self.real_uuid)
    # NOTE(review): guard (likely "if not real_obd:") missing here.
    panic('real obd not found:', self.real_uuid)
    cache_obd = self.db.lookup(self.cache_uuid)
    # NOTE(review): guard missing here as well.
    panic('cache obd not found:', self.cache_uuid)
    # OST-side pair: wrap both ends in LOV clients...
    self.real = LOV(real_obd, self.real_uuid, name,
                    "%s_real" % (self.name));
    self.cache = LOV(cache_obd, self.cache_uuid, name,
                     "%s_cache" % (self.name));
    # ...MDS-side pair: wrap both ends in MDC clients.
    self.real = get_mdc(db, uuid, name, self.real_uuid)
    self.cache = get_mdc(db, uuid, name, self.cache_uuid)
    # need to check /proc/mounts and /etc/mtab before
    # formatting anything.
    # FIXME: check if device is already formatted.

def get_real_name(self):
    return self.real.name

def get_cache_name(self):
    return self.cache.name

# NOTE(review): "def prepare(self):" line missing; visible remainder below.
self.cache.prepare()
if is_prepared(self.name):
self.info(self.real_uuid, self.cache_uuid)
lctl.newdev("cobd", self.name, self.uuid,
            setup ="%s %s" %(self.real.name,

# NOTE(review): "def cleanup(self):" line missing; visible remainder below.
if is_prepared(self.name):
    Module.cleanup(self)
self.cache.cleanup()

def load_module(self):
    self.real.load_module()
    Module.load_module(self)

def cleanup_module(self):
    Module.cleanup_module(self)
    self.real.cleanup_module()
# virtual interface for OSC and LOV
def __init__(self, db, client_uuid, name, name_override = None):
    """Wrap a LOV, COBD, or plain OSC behind a single 'osc' attribute.

    NOTE(review): the "class VOSC(Module):" header is missing from this
    excerpt, as is the final else-branch (plain OSC case).
    """
    Module.__init__(self, 'VOSC', db)
    if db.get_class() == 'lov':
        self.osc = LOV(db, client_uuid, name, name_override)
    elif db.get_class() == 'cobd':
        self.osc = COBD(db, client_uuid, name, 'obd')

# NOTE(review): the accessor "def" lines are missing around the two
# bare returns below (get_uuid / get_name delegates).
return self.osc.get_uuid()
return self.osc.get_name()

def load_module(self):
    self.osc.load_module()

def cleanup_module(self):
    self.osc.cleanup_module()

def correct_level(self, level, op=None):
    return self.osc.correct_level(level, op)
# virtual interface for MDC and LMV
def __init__(self, db, uuid, fs_name, name_override = None):
    """Wrap an LMV, COBD, or plain MDC behind a single 'mdc' attribute.

    NOTE(review): the "class VMDC(Module):" header is missing from this
    excerpt.
    """
    Module.__init__(self, 'VMDC', db)
    if db.get_class() == 'lmv':
        self.mdc = LMV(db, uuid, fs_name)
    elif db.get_class() == 'cobd':
        self.mdc = COBD(db, uuid, fs_name, 'mds')
    # NOTE(review): "else:" line missing before the plain-MDC fallback.
    self.mdc = MDC(db, uuid, fs_name)

# NOTE(review): the accessor "def" lines are missing around the two
# bare returns below (get_uuid / get_name delegates).
return self.mdc.uuid
return self.mdc.name

def load_module(self):
    self.mdc.load_module()

def cleanup_module(self):
    self.mdc.cleanup_module()

def correct_level(self, level, op=None):
    return self.mdc.correct_level(level, op)
class ECHO_CLIENT(Module):
    """Echo client device for testing; drives a VOSC-wrapped OBD.

    NOTE(review): excerpt is line-sampled; prepare/cleanup lost their
    "def" lines below.
    """
    def __init__(self,db):
        Module.__init__(self, 'ECHO_CLIENT', db)
        self.add_lustre_module('obdecho', 'obdecho')
        self.obd_uuid = self.db.get_first_ref('obd')
        obd = self.db.lookup(self.obd_uuid)
        self.uuid = generate_client_uuid(self.name)
        self.osc = VOSC(obd, self.uuid, self.name)

    # NOTE(review): "def prepare(self):" missing; visible remainder below.
    if is_prepared(self.name):
    self.osc.prepare() # XXX This is so cheating. -p
    self.info(self.obd_uuid)
    lctl.newdev("echo_client", self.name, self.uuid,
                setup = self.osc.get_name())

    # NOTE(review): "def cleanup(self):" missing; visible remainder below.
    if is_prepared(self.name):
        Module.cleanup(self)

    def load_module(self):
        self.osc.load_module()
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        self.osc.cleanup_module()

    def correct_level(self, level, op=None):
        # NOTE(review): body missing from this excerpt.
def generate_client_uuid(name):
    """Build a pseudo-random client UUID string (truncated to 36 chars).

    NOTE(review): the middle format argument line (presumably "name,") is
    missing from this excerpt, leaving the argument tuple one short.
    """
    client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
                                           int(random.random() * 1048576),
                                           int(random.random() * 1048576))
    return client_uuid[:36]
class Mountpoint(Module):
    """Client mountpoint: assembles VOSC + VMDC and mounts lustre_lite.

    NOTE(review): excerpt is line-sampled; guard/else/try lines and the
    prepare/cleanup "def" lines are missing below.
    """
    def __init__(self,db):
        Module.__init__(self, 'MTPT', db)
        self.path = self.db.get_val('path')
        self.fs_uuid = self.db.get_first_ref('filesystem')
        fs = self.db.lookup(self.fs_uuid)
        # Prefer an LMV reference; fall back to a single MDS.
        self.mds_uuid = fs.get_first_ref('lmv')
        if not self.mds_uuid:
            self.mds_uuid = fs.get_first_ref('mds')
        self.obd_uuid = fs.get_first_ref('obd')
        self.mgmt_uuid = fs.get_first_ref('mgmt')
        client_uuid = generate_client_uuid(self.name)
        ost = self.db.lookup(self.obd_uuid)
        self.vosc = VOSC(ost, client_uuid, self.name)
        self.mds = self.db.lookup(self.mds_uuid)
        # NOTE(review): guard (likely "if not self.mds:") missing here.
        panic("no mds: ", self.mds_uuid)
        self.add_lustre_module('mdc', 'mdc')
        self.add_lustre_module('lmv', 'lmv')
        self.vmdc = VMDC(self.mds, client_uuid, self.name, self.mds_uuid)
        self.mdc = self.vmdc.mdc
        self.add_lustre_module('llite', 'llite')
        # NOTE(review): continuation/guard lines around the mgmt client
        # construction are missing.
        self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),

    # NOTE(review): "def prepare(self):" missing; visible remainder below.
    if fs_is_mounted(self.path):
        log(self.path, "already mounted.")
    self.mgmtcli.prepare()
    vmdc_name = self.vmdc.get_name()
    self.info(self.path, self.mds_uuid, self.obd_uuid)
    if config.record or config.lctl_dump:
        lctl.mount_option(local_node_name, self.vosc.get_name(), vmdc_name)
    cmd = "mount -t lustre_lite -o osc=%s,mdc=%s %s %s" % \
          (self.vosc.get_name(), vmdc_name, config.config, self.path)
    run("mkdir", self.path)
    # NOTE(review): the run(cmd) call and its rc check are missing.
    panic("mount failed:", self.path, ":", string.join(val))

    # NOTE(review): "def cleanup(self):" missing; visible remainder below.
    self.info(self.path, self.mds_uuid,self.obd_uuid)
    if config.record or config.lctl_dump:
        lctl.del_mount_option(local_node_name)
    if fs_is_mounted(self.path):
        # --force path first, then the plain umount fallback.
        (rc, out) = run("umount", "-f", self.path)
        (rc, out) = run("umount", self.path)
        raise CommandError('umount', out, rc)
    if fs_is_mounted(self.path):
        panic("fs is still mounted:", self.path)
    self.mgmtcli.cleanup()

    def load_module(self):
        self.mgmtcli.load_module()
        self.vosc.load_module()
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        self.vosc.cleanup_module()
        self.mgmtcli.cleanup_module()

    def correct_level(self, level, op=None):
        # NOTE(review): body missing from this excerpt.
# ============================================================
# misc query functions

def get_ost_net(self, osd_uuid):
    # Return the Network objects for the node hosting osd_uuid.
    # NOTE(review): guard and srv_list initialization/return lines are
    # missing from this line-sampled excerpt.
    osd = self.lookup(osd_uuid)
    node_uuid = osd.get_first_ref('node')
    node = self.lookup(node_uuid)
    panic("unable to find node for osd_uuid:", osd_uuid,
          " node_ref:", node_uuid_)
    for net_uuid in node.get_networks():
        db = node.lookup(net_uuid)
        srv_list.append(Network(db))

# the order of initialization is based on level.
def getServiceLevel(self):
    # Map a config class to its startup level, clamped by min/maxlevel.
    type = self.get_class()
    # NOTE(review): the "ret = NN" assignment lines between branches are
    # missing in this excerpt.
    if type in ('network',):
    elif type in ('routetbl',):
    elif type in ('ldlm',):
    elif type in ('mgmt',):
    elif type in ('osd', 'cobd'):
    elif type in ('mdsdev',):
    elif type in ('lmv',):
    elif type in ('mountpoint', 'echoclient'):
    panic("Unknown type: ", type)
    if ret < config.minlevel or ret > config.maxlevel:

# return list of services in a profile. list is a list of tuples
# [(level, db_object),]
def getServices(self):
    # NOTE(review): list initialization, guards, and return are missing.
    for ref_class, ref_uuid in self.get_all_refs():
        servdb = self.lookup(ref_uuid)
        level = getServiceLevel(servdb)
        list.append((level, servdb))
        panic('service not found: ' + ref_uuid)
############################################################
# FIXME: clean this mess up!
# OSC is no longer in the xml, so we have to fake it.
# this is getting ugly and begging for another refactoring
def get_osc(ost_db, uuid, fs_name):
    osc = OSC(ost_db, uuid, fs_name)
    # NOTE(review): "return osc" line missing in this excerpt.

def get_mdc(db, uuid, fs_name, mds_uuid):
    mds_db = db.lookup(mds_uuid);
    # NOTE(review): guard (likely "if not mds_db:") missing before error().
    error("no mds:", mds_uuid)
    mdc = MDC(mds_db, mds_uuid, fs_name)
    # NOTE(review): "return mdc" line missing in this excerpt.
############################################################
# routing ("rooting")
# list of (nettype, cluster_id, nid)

def find_local_clusters(node_db):
    # Record this node's networks in local_clusters; register acceptors.
    global local_clusters
    for netuuid in node_db.get_networks():
        net = node_db.lookup(netuuid)
        # NOTE(review): the line constructing 'srv' from 'net' is missing
        # in this excerpt.
        debug("add_local", netuuid)
        local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
        # NOTE(review): a guard (likely "if srv.port > 0:") is missing.
        if acceptors.has_key(srv.port):
            panic("duplicate port:", srv.port)
        acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
                                              srv.send_mem, srv.recv_mem,

# This node is a gateway.
def node_is_router():
    # NOTE(review): body (presumably "return is_router") missing.

# If there are any routers found in the config, then this will be true
# and all nodes will load kptlrouter.
def node_needs_router():
    return needs_router or is_router

# list of (nettype, gw, tgt_cluster_id, lo, hi)
# Currently, these local routes are only added to kptlrouter route
# table if they are needed to connect to a specific server.  This
# should be changed so all available routes are loaded, and the
# ptlrouter can make all the decisions.

def find_local_routes(lustre):
    """ Scan the lustre config looking for routers .  Build list of
    routes. """
    global local_routes, needs_router
    list = lustre.lookup_class('node')
    # NOTE(review): the "for router in list:" loop line (and the
    # needs_router update) are missing in this excerpt.
    if router.get_val_int('router', 0):
        for (local_type, local_cluster_id, local_nid) in local_clusters:
            for netuuid in router.get_networks():
                db = router.lookup(netuuid)
                # A router network matching one of our local clusters
                # is a usable gateway.
                if (local_type == db.get_val('nettype') and
                    local_cluster_id == db.get_val('clusterid')):
                    gw = db.get_val('nid')
                    debug("find_local_routes: gw is", gw)
                    for route in router.get_local_routes(local_type, gw):
                        local_routes.append(route)
    debug("find_local_routes:", local_routes)

def choose_local_server(srv_list):
    # First server that sits on one of our local clusters.
    for srv in srv_list:
        if local_cluster(srv.net_type, srv.cluster_id):
            # NOTE(review): "return srv" line missing in this excerpt.

def local_cluster(net_type, cluster_id):
    for cluster in local_clusters:
        if net_type == cluster[0] and cluster_id == cluster[1]:
            # NOTE(review): return lines missing in this excerpt.

def local_interface(net_type, cluster_id, nid):
    for cluster in local_clusters:
        if (net_type == cluster[0] and cluster_id == cluster[1]
            and nid == cluster[2]):
            # NOTE(review): return lines missing in this excerpt.

def find_route(srv_list):
    # Return (srv, route) pairs connecting this node to each server.
    # NOTE(review): result initialization and several lines missing.
    frm_type = local_clusters[0][0]
    for srv in srv_list:
        debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
        to_type = srv.net_type
        # NOTE(review): the "to = srv.nid" assignment line is missing.
        cluster_id = srv.cluster_id
        debug ('looking for route to', to_type, to)
        for r in local_routes:
            debug("find_route: ", r)
            # route tuple: (nettype, gw, tgt_cluster_id, lo, hi)
            if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
                result.append((srv, r))

def get_active_target(db):
    # Active target device uuid, honoring --select overrides.
    target_uuid = db.getUUID()
    target_name = db.getName()
    node_name = get_select(target_name)
    # NOTE(review): the if/else choosing between the two lookups below,
    # and the final return, are missing in this excerpt.
    tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
    tgt_dev_uuid = db.get_first_ref('active')

def get_server_by_nid_uuid(db, nid_uuid):
    for n in db.lookup_class("network"):
        # NOTE(review): the "net = Network(n)" line and returns are missing.
        if net.nid_uuid == nid_uuid:
############################################################
# NOTE(review): the "def newService(db):" line is missing from this
# line-sampled excerpt; the statements below are its visible remainder.
# Maps a config class name to the Module subclass that manages it.
type = db.get_class()
debug('Service:', type, db.getName(), db.getUUID())
# NOTE(review): several branch bodies (and the 'lov' branch header) are
# missing between the lines below.
n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
elif type == 'network':
elif type == 'routetbl':
elif type == 'cobd':
    n = COBD(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
elif type == 'mdsdev':
elif type == 'mountpoint':
elif type == 'echoclient':
elif type == 'mgmt':
panic ("unknown service type:", type)
# Prepare the system to run lustre using a particular profile
# in the configuration.
#  * load the modules
#  * setup networking for the current node
#  * make sure partitions are in place and prepared
#  * initialize devices with lctl
# Levels are important, and need to be enforced.
def for_each_profile(db, prof_list, operation):
    # Look up each profile and apply 'operation' to its service list.
    for prof_uuid in prof_list:
        prof_db = db.lookup(prof_uuid)
        # NOTE(review): guard line missing before this panic, and the
        # "operation(services)" call is missing after getServices().
        panic("profile:", profile, "not found.")
        services = getServices(prof_db)

def doWriteconf(services):
    # Only mdsdev services take part in --write_conf.
    # NOTE(review): the loop line and the write_conf() call are missing.
    if s[1].get_class() == 'mdsdev':
        n = newService(s[1])

def doSetup(services):
    # Prepare each service in (corrected) level order.
    # NOTE(review): loop lines, sorting, and the prepare() call missing.
    n = newService(s[1])
    slist.append((n.level, n))
    nl = n[1].correct_level(n[0])
    nlist.append((nl, n[1]))

def doModules(services):
    # NOTE(review): loop line and the load_module() call are missing.
    n = newService(s[1])

def doCleanup(services):
    # Clean services honoring safe_to_clean(); level-ordered.
    # NOTE(review): loop lines, sorting/reversal, and cleanup() missing.
    n = newService(s[1])
    slist.append((n.level, n))
    nl = n[1].correct_level(n[0])
    nlist.append((nl, n[1]))
    if n[1].safe_to_clean():

def doUnloadModules(services):
    # NOTE(review): loop line and the cleanup_module() call are missing.
    n = newService(s[1])
    if n.safe_to_clean_modules():
def doHost(lustreDB, hosts):
    """Configure (or clean up) this host: locate its node entry, then run
    the profile operations for the requested mode.

    NOTE(review): excerpt is line-sampled; the hostname-candidate loop,
    several guards, and the final "else:" (normal setup) line are missing.
    """
    global is_router, local_node_name
    node_db = lustreDB.lookup_name(h, 'node')
    panic('No host entry found.')

    # Per-node settings from the config database.
    local_node_name = node_db.get_val('name', 0)
    is_router = node_db.get_val_int('router', 0)
    lustre_upcall = node_db.get_val('lustreUpcall', '')
    portals_upcall = node_db.get_val('portalsUpcall', '')
    timeout = node_db.get_val_int('timeout', 0)
    ptldebug = node_db.get_val('ptldebug', '')
    subsystem = node_db.get_val('subsystem', '')

    find_local_clusters(node_db)
    # NOTE(review): router-only short-circuit line missing here.
    find_local_routes(lustreDB)

    # Two step process: (1) load modules, (2) setup lustre
    # if not cleaning, load modules first.
    prof_list = node_db.get_refs('profile')

    if config.write_conf:
        for_each_profile(node_db, prof_list, doModules)
        for_each_profile(node_db, prof_list, doWriteconf)
        for_each_profile(node_db, prof_list, doUnloadModules)
    elif config.recover:
        if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
            raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
                                     "--client_uuid <UUID> --conn_uuid <UUID>")
        # NOTE(review): continuation line of this call is missing.
        doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
    elif config.cleanup:
        # the command line can override this value
        # ugly hack, only need to run lctl commands for --dump
        if config.lctl_dump or config.record:
            for_each_profile(node_db, prof_list, doCleanup)
        # NOTE(review): the early "return" and else lines are missing here.
        sys_set_timeout(timeout)
        sys_set_ptldebug(ptldebug)
        sys_set_subsystem(subsystem)
        sys_set_lustre_upcall(lustre_upcall)
        sys_set_portals_upcall(portals_upcall)
        for_each_profile(node_db, prof_list, doCleanup)
        for_each_profile(node_db, prof_list, doUnloadModules)
    # NOTE(review): the final "else:" (normal setup path) line is missing;
    # the remainder below belongs to that branch.
        # ugly hack, only need to run lctl commands for --dump
        if config.lctl_dump or config.record:
            sys_set_timeout(timeout)
            sys_set_lustre_upcall(lustre_upcall)
            for_each_profile(node_db, prof_list, doSetup)
        sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
        sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
        for_each_profile(node_db, prof_list, doModules)
        sys_set_debug_path()
        sys_set_ptldebug(ptldebug)
        sys_set_subsystem(subsystem)
        script = config.gdb_script
        run(lctl.lctl, ' modules >', script)
        # NOTE(review): the "if config.gdb:" guard is missing here.
        log ("The GDB module script is in", script)
        # pause, so user has time to break and
        sys_set_timeout(timeout)
        sys_set_lustre_upcall(lustre_upcall)
        sys_set_portals_upcall(portals_upcall)
        for_each_profile(node_db, prof_list, doSetup)
def doRecovery(lustreDB, lctl, tgt_uuid, client_uuid, nid_uuid):
    """Reconnect a client to the (possibly failed-over) active target.

    NOTE(review): excerpt is line-sampled; the guards before each raise
    and the try/connect lines are missing.
    """
    tgt = lustreDB.lookup(tgt_uuid)
    raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
    new_uuid = get_active_target(tgt)
    raise Lustre.LconfError("doRecovery: no active target found for: " +
    net = choose_local_server(get_ost_net(lustreDB, new_uuid))
    raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
    log("Reconnecting", tgt_uuid, " to ",  net.nid_uuid);
    oldnet = get_server_by_nid_uuid(lustreDB, nid_uuid)
    # Best-effort teardown/reconnect: failures are logged, not fatal.
    lctl.disconnect(oldnet)
    except CommandError, e:
        log("recover: disconnect", nid_uuid, "failed: ")
    except CommandError, e:
        log("recover: connect failed")
    lctl.recover(client_uuid, net.nid_uuid)
# setupModulePath: derive the module search directories from where lconf was
# invoked.  In development mode (run from a build tree) config.lustre
# defaults to the parent of the lconf directory, and config.portals is then
# normalized relative to it; otherwise --lustre/--portals are normalized
# together.
# NOTE(review): listing gaps -- line 2793 (presumably "if config.portals:")
# and line 2799 are missing, so the indentation below cannot be trusted.
2786 def setupModulePath(cmd, portals_dir = PORTALS_DIR):
2787 base = os.path.dirname(cmd)
2788 if development_mode():
2789 if not config.lustre:
2790 debug('using objdir module paths')
2791 config.lustre = (os.path.join(base, ".."))
2792 # normalize the portals dir, using command line arg if set
2794 portals_dir = config.portals
2795 dir = os.path.join(config.lustre, portals_dir)
2796 config.portals = dir
2797 debug('config.portals', config.portals)
2798 elif config.lustre and config.portals:
2800 # if --lustre and --portals, normalize portals
2801 # can ignore PORTALS_DIR here, since it is probably useless here
2802 config.portals = os.path.join(config.lustre, config.portals)
2803 debug('config.portals B', config.portals)
# sysctl: write 'val' to /proc/sys/<path> (the procfs sysctl interface).
# NOTE(review): the listing omits lines 2807-2809 (likely the --noexec
# short-circuit and a try: header) and everything after the open() -- the
# write/close and error handling are not visible here.
2805 def sysctl(path, val):
2806 debug("+ sysctl", path, val)
2810 fp = open(os.path.join('/proc/sys', path), 'w')
def sys_set_debug_path():
    """Point the kernel's Lustre debug-dump output at config.debug_path."""
    dump_path = config.debug_path
    sysctl('portals/debug_path', dump_path)
# sys_set_lustre_upcall: choose the lustre upcall script and hand it to lctl.
# Priority: --lustre_upcall, then --upcall, then the value from the node
# config that was passed in.
# NOTE(review): listing gaps -- line 2824 (presumably "elif config.upcall:")
# and line 2826 are missing, so the two assignments below are alternatives,
# not sequential statements.
2820 def sys_set_lustre_upcall(upcall):
2821 # the command overrides the value in the node config
2822 if config.lustre_upcall:
2823 upcall = config.lustre_upcall
2825 upcall = config.upcall
2827 lctl.set_lustre_upcall(upcall)
# sys_set_portals_upcall: choose the portals upcall script and write it to
# the portals/upcall sysctl.  Priority: --portals_upcall, then --upcall,
# then the node-config value passed in.
# NOTE(review): listing gaps -- line 2833 (presumably "elif config.upcall:")
# and line 2835 are missing; the two assignments are alternatives.
2829 def sys_set_portals_upcall(upcall):
2830 # the command overrides the value in the node config
2831 if config.portals_upcall:
2832 upcall = config.portals_upcall
2834 upcall = config.upcall
2836 sysctl('portals/upcall', upcall)
def sys_set_timeout(timeout):
    """Set the lustre recovery timeout via lctl.

    The --timeout command line option overrides the value from the node
    configuration; None or a non-positive timeout is silently ignored.
    """
    # the command line overrides the value in the node config
    if config.timeout and config.timeout > 0:
        timeout = config.timeout
    # 'is not None' instead of '!= None': identity test is the correct
    # (and Python-idiomatic) way to check for the missing-value sentinel
    if timeout is not None and timeout > 0:
        lctl.set_timeout(timeout)
def sys_tweak_socknal():
    """Apply socknal tweaks: with --single_socket, disable typed sockets."""
    if not config.single_socket:
        return
    sysctl("socknal/typed", 0)
# sys_optimize_elan: tune Elan/Quadrics interconnect punt-loop settings by
# echoing 1 into whichever of the known procfs config files exist.
# NOTE(review): line 2853 (the "for p in procfiles:" header) is missing from
# this listing -- 'p' below is that loop variable.
2849 def sys_optimize_elan ():
2850 procfiles = ["/proc/elan/config/eventint_punt_loops",
2851 "/proc/qsnet/elan3/config/eventint_punt_loops",
2852 "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]
2854 if os.access(p, os.R_OK):
2855 run ("echo 1 > " + p)
# sys_set_ptldebug: resolve a symbolic portals debug mask (names from
# ptldebug_names, see HEAD) into a hex bitmask and write it to the
# portals/debug sysctl.  --ptldebug overrides the node-config value.
# NOTE(review): listing gaps -- line 2858 (presumably "if config.ptldebug:"),
# lines 2860-2861 (None guard and try: header) and the NameError handler
# body after 2865 are missing.
2857 def sys_set_ptldebug(ptldebug):
2859 ptldebug = config.ptldebug
# eval of the user expression against the name table turns e.g.
# "trace|malloc" into an integer mask; unknown names raise NameError
2862 val = eval(ptldebug, ptldebug_names)
2863 val = "0x%x" % (val)
2864 sysctl('portals/debug', val)
2865 except NameError, e:
# sys_set_subsystem: same scheme as sys_set_ptldebug, but for the subsystem
# mask (names from subsystem_names in HEAD) written to
# portals/subsystem_debug.  --subsystem overrides the node-config value.
# NOTE(review): listing gaps -- lines 2871-2872 (None guard / try: header)
# and the NameError handler body after 2876 are missing.
2868 def sys_set_subsystem(subsystem):
2869 if config.subsystem:
2870 subsystem = config.subsystem
2873 val = eval(subsystem, subsystem_names)
2874 val = "0x%x" % (val)
2875 sysctl('portals/subsystem_debug', val)
2876 except NameError, e:
# sys_set_netmem_max: raise a /proc/sys/net/core/*mem_max value to at least
# 'max' (used for rmem_max/wmem_max TCP buffer sizing, see MAXTCPBUF use in
# the setup path above).
# NOTE(review): lines 2881-2887 are missing from this listing -- presumably
# they read the current value and return early when it is already >= max;
# the fp.close() after the write is also not visible.  Confirm against the
# full source.
2879 def sys_set_netmem_max(path, max):
2880 debug("setting", path, "to at least", max)
2888 fp = open(path, 'w')
2889 fp.write('%d\n' %(max))
def sys_make_devices():
    """Create the portals and obd character device nodes if missing."""
    # both are char-major 10 ("misc") devices; only create what is absent
    for dev, mknod_cmd in (('/dev/portals', 'mknod /dev/portals c 10 240'),
                           ('/dev/obd', 'mknod /dev/obd c 10 241')):
        if not os.access(dev, os.R_OK):
            run(mknod_cmd)
# Add dir to the global PATH, if not already there.
def add_to_path(new_dir):
    """Append new_dir to os.environ['PATH'] unless already present.

    The listing had lost the early 'return' guard (original line 2904);
    without it the duplicate check did nothing.  Also uses the str.split
    method instead of the long-deprecated string-module function.
    """
    syspath = os.environ['PATH'].split(':')
    if new_dir in syspath:
        return
    os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
def default_debug_path():
    """Return the default debug-dump file path.

    Prefers the ramdisk root '/r' when it exists (restores the return
    statements lost from this listing -- without them the function fell
    through and returned None).
    """
    path = '/tmp/lustre-log'
    if os.path.isdir('/r'):
        return '/r' + path
    return path
def default_gdb_script():
    """Return the default gdb module-script path.

    Prefers the ramdisk root '/r' when it exists; always returns a path
    (the listing's fall-through, which returned None when '/r' was absent,
    is fixed by the final return).
    """
    script = '/tmp/ogdb'
    if os.path.isdir('/r'):
        return '/r' + script
    return script
DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
# ensure basic elements are in the system path
def sanitise_path():
    """Append each standard sbin/bin directory to $PATH if missing.

    Restores the loop body (original line 2926, the add_to_path call) that
    was lost from this listing; also renames the loop variable so it no
    longer shadows the builtin 'dir'.
    """
    for d in DEFAULT_PATH:
        add_to_path(d)
# global hack for the --select handling
tgt_select = {}

def init_select(args):
    """Record --select service->node mappings in the global tgt_select.

    Each argument looks like "service=nodeA,service2=nodeB"; every
    service=node pair is stored for later lookup by get_select().
    Restores the tgt_select initializer and the loop headers that were
    lost from this listing, and uses str.split instead of the deprecated
    string-module function.
    """
    # args = [service=nodeA,service2=nodeB service3=nodeC]
    for arg in args:
        for entry in arg.split(','):
            srv, node = entry.split('=')
            tgt_select[srv] = node
def get_select(srv):
    """Return the node selected for service 'srv' via --select, or None.

    dict.get replaces the deprecated has_key()+index pattern (one lookup
    instead of two, and portable to Python 3); the None default matches
    the original's fall-through return.
    """
    return tgt_select.get(srv)
# Command-line option table for Lustre.Options.  The FLAG/PARAM/INTPARAM/
# PARAMLIST aliases name the option kinds; each tuple is
# (name[,shortflag], help-text[, kind[, default]]).
# NOTE(review): this listing is missing the "lconf_options = [" opener
# (original line 2949), several continuation/closing lines of multi-line
# entries (e.g. after 'force,f', 'single_socket', 'dump', 'minlevel') and
# the closing bracket -- do not treat the fragment below as syntactically
# complete.
2945 FLAG = Lustre.Options.FLAG
2946 PARAM = Lustre.Options.PARAM
2947 INTPARAM = Lustre.Options.INTPARAM
2948 PARAMLIST = Lustre.Options.PARAMLIST
2950 ('verbose,v', "Print system commands as they are run"),
2951 ('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM),
2952 ('config', "Cluster config name used for LDAP query", PARAM),
2953 ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
2954 ('node', "Load config for <nodename>", PARAM),
2955 ('cleanup,d', "Cleans up config. (Shutdown)"),
2956 ('force,f', "Forced unmounting and/or obd detach during cleanup",
2958 ('single_socket', "socknal option: only use one socket instead of bundle",
2960 ('failover',"""Used to shut down without saving state.
2961 This will allow this node to "give up" a service to a
2962 another node for failover purposes. This will not
2963 be a clean shutdown.""",
2965 ('gdb', """Prints message after creating gdb module script
2966 and sleeps for 5 seconds."""),
2967 ('noexec,n', """Prints the commands and steps that will be run for a
2968 config without executing them. This can used to check if a
2969 config file is doing what it should be doing"""),
2970 ('nomod', "Skip load/unload module step."),
2971 ('nosetup', "Skip device setup/cleanup step."),
2972 ('reformat', "Reformat all devices (without question)"),
2973 ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
2974 ('mountfsoptions', "Additional options for mount fs command line", PARAM),
2975 ('dump', "Dump the kernel debug log to file before portals is unloaded",
2977 ('write_conf', "Save all the client config information on mds."),
2978 ('record', "Write config information on mds."),
2979 ('record_log', "Name of config record log.", PARAM),
2980 ('record_device', "MDS device name that will record the config commands",
2982 ('minlevel', "Minimum level of services to configure/cleanup",
2984 ('maxlevel', """Maximum level of services to configure/cleanup
2985 Levels are aproximatly like:
2990 70 - mountpoint, echo_client, osc, mdc, lov""",
2992 ('lustre', """Base directory of lustre sources. This parameter will
2993 cause lconf to load modules from a source tree.""", PARAM),
2994 ('portals', """Portals source directory. If this is a relative path,
2995 then it is assumed to be relative to lustre. """, PARAM),
2996 ('timeout', "Set recovery timeout", INTPARAM),
2997 ('upcall', "Set both portals and lustre upcall script", PARAM),
2998 ('lustre_upcall', "Set lustre upcall script", PARAM),
2999 ('portals_upcall', "Set portals upcall script", PARAM),
3000 ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
3001 ('ptldebug', "Set the portals debug level", PARAM),
3002 ('subsystem', "Set the portals debug subsystem", PARAM),
3003 ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
3004 ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
3005 # Client recovery options
3006 ('recover', "Recover a device"),
3007 ('group', "The group of devices to configure or cleanup", PARAM),
3008 ('tgt_uuid', "The failed target (required for recovery)", PARAM),
3009 ('client_uuid', "The failed client (required for recovery)", PARAM),
3010 ('conn_uuid', "The failed connection (required for recovery)", PARAM),
3012 ('inactive', """The name of an inactive service, to be ignored during
3013 mounting (currently OST-only). Can be repeated.""",
# Body of main(): parse options, load the cluster config (XML file, HTTP
# URL, or LDAP), build the node list for this host, set up the LCTLInterface
# (optionally in --lctl_dump or --record mode) and dispatch to doHost.
# NOTE(review): the "def main():" line itself (~original line 3017) and many
# interior lines (try: headers, sys.exit calls, blank guards such as
# "if not tgt:" / "if config.node:") fell in this listing's gaps -- the
# fragment below is not syntactically complete.
3018 global lctl, config, toplustreDB, CONFIG_FILE
3020 # in the upcall this is set to SIG_IGN
3021 signal.signal(signal.SIGCHLD, signal.SIG_DFL)
3023 cl = Lustre.Options("lconf", "config.xml", lconf_options)
3025 config, args = cl.parse(sys.argv[1:])
3026 except Lustre.OptionError, e:
3030 setupModulePath(sys.argv[0])
3032 host = socket.gethostname()
3034 # the PRNG is normally seeded with time(), which is not so good for starting
3035 # time-synchronized clusters
3036 input = open('/dev/urandom', 'r')
3038 print 'Unable to open /dev/urandom!'
3040 seed = input.read(32)
3046 init_select(config.select)
3049 # allow config to be fetched via HTTP, but only with python2
3050 if sys.version[0] != '1' and args[0].startswith('http://'):
3053 config_file = urllib2.urlopen(args[0])
3054 except (urllib2.URLError, socket.error), err:
3055 if hasattr(err, 'args'):
3057 print "Could not access '%s': %s" %(args[0], err)
3059 elif not os.access(args[0], os.R_OK):
3060 print 'File not found or readable:', args[0]
3064 config_file = open(args[0], 'r')
# parse the XML config; a parse failure is fatal even under --noexec
3066 dom = xml.dom.minidom.parse(config_file)
3068 panic("%s does not appear to be a config file." % (args[0]))
3069 sys.exit(1) # make sure to die here, even in debug mode.
3071 CONFIG_FILE = args[0]
3072 lustreDB = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
# default the cluster config name from the file name, minus any .xml suffix
3073 if not config.config:
3074 config.config = os.path.basename(args[0])# use full path?
3075 if config.config[-4:] == '.xml':
3076 config.config = config.config[:-4]
3077 elif config.ldapurl:
3078 if not config.config:
3079 panic("--ldapurl requires --config name")
3080 dn = "config=%s,fs=lustre" % (config.config)
3081 lustreDB = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
3082 elif config.ptldebug or config.subsystem:
3083 sys_set_ptldebug(None)
3084 sys_set_subsystem(None)
3087 print 'Missing config file or ldap URL.'
3088 print 'see lconf --help for command summary'
3091 toplustreDB = lustreDB
# refuse to run against a config generated by a different lconf version
3093 ver = lustreDB.get_version()
3095 panic("No version found in config data, please recreate.")
3096 if ver != Lustre.CONFIG_VERSION:
3097 panic("Config version", ver, "does not match lconf version",
3098 Lustre.CONFIG_VERSION)
3102 node_list.append(config.node)
3105 node_list.append(host)
3106 node_list.append('localhost')
3108 debug("configuring for host: ", node_list)
# make per-host debug/gdb file names so shared /r dirs don't collide
3111 config.debug_path = config.debug_path + '-' + host
3112 config.gdb_script = config.gdb_script + '-' + host
3114 lctl = LCTLInterface('lctl')
3116 if config.lctl_dump:
3117 lctl.use_save_file(config.lctl_dump)
# --record needs both a device and a log name to write the config to
3120 if not (config.record_device and config.record_log):
3121 panic("When recording, both --record_log and --record_device must be specified.")
3122 lctl.clear_log(config.record_device, config.record_log)
3123 lctl.record(config.record_device, config.record_log)
3125 doHost(lustreDB, node_list)
# Script entry point: run main() and map the tool's exception hierarchy to
# exit codes.
# NOTE(review): the listing omits the "try:" / "main()" lines (3131-3132)
# and the handler bodies (3134, 3136, 3138-3140), so the except clauses
# below appear detached.  first_cleanup_error is presumably set during a
# failed cleanup pass elsewhere -- confirm against the full source.
3130 if __name__ == "__main__":
3133 except Lustre.LconfError, e:
3135 # traceback.print_exc(file=sys.stdout)
3137 except CommandError, e:
3141 if first_cleanup_error:
3142 sys.exit(first_cleanup_error)