# Copyright (C) 2002-2003 Cluster File Systems, Inc.
# Authors: Robert Read <rread@clusterfs.com>
#          Mike Shaver <shaver@clusterfs.com>
# This file is part of Lustre, http://www.lustre.org.
#
# Lustre is free software; you can redistribute it and/or
# modify it under the terms of version 2 of the GNU General Public
# License as published by the Free Software Foundation.
#
# Lustre is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Lustre; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# lconf - lustre configuration tool
#
# lconf is the main driver script for starting and stopping
# lustre filesystem services.
#
# Based in part on the XML obdctl modifications done by Brian Behlendorf
import sys, getopt, types
import string, os, stat, popen2, socket, time, random, fcntl, select
import re, exceptions, signal, traceback
import xml.dom.minidom

if sys.version[0] == '1':
    from FCNTL import F_GETFL, F_SETFL
else:
    from fcntl import F_GETFL, F_SETFL

PYMOD_DIR = "/usr/lib/lustre/python"

def development_mode():
    base = os.path.dirname(sys.argv[0])
    if os.access(base+"/Makefile.am", os.R_OK):
        return 1
    return 0

if not development_mode():
    sys.path.append(PYMOD_DIR)
import Lustre

MAXTCPBUF = 16777216
DEFAULT_TCPBUF = 8388608

# Maximum number of devices to search for.
# (the /dev/loop* nodes need to be created beforehand)
MAX_LOOP_DEVICES = 256
PORTALS_DIR = 'portals'

# Needed to call lconf --record
CONFIG_FILE = ""
# Please keep these in sync with the values in portals/kp30.h
ptldebug_names = {
    "warning" :    (1 << 10),
    "portals" :    (1 << 14),
    "dlmtrace" :   (1 << 16),
    "rpctrace" :   (1 << 20),
    "vfstrace" :   (1 << 21),
    }

subsystem_names = {
    "undefined" :   (0 << 24),
    "ext2obd" :     (9 << 24),
    "portals" :    (10 << 24),
    "socknal" :    (11 << 24),
    "qswnal" :     (12 << 24),
    "pinger" :     (13 << 24),
    "filter" :     (14 << 24),
    "trace" :      (15 << 24),
    "gmnal" :      (19 << 24),
    "ptlrouter" :  (20 << 24),
    "ptlbd" :      (22 << 24),
    }

first_cleanup_error = 0
def cleanup_error(rc):
    global first_cleanup_error
    if not first_cleanup_error:
        first_cleanup_error = rc
# ============================================================
# debugging and error funcs

def fixme(msg = "this feature"):
    raise Lustre.LconfError, msg + ' not implemented yet.'

def panic(*args):
    msg = string.join(map(str,args))
    if not config.noexec:
        raise Lustre.LconfError(msg)

def log(*args):
    msg = string.join(map(str,args))
    print msg

def logall(msgs):
    for s in msgs:
        print string.strip(s)

def debug(*args):
    msg = string.join(map(str,args))
    print msg

# ack, python's builtin int() does not support '0x123' syntax.
# eval can do it, although what a hack!
def my_int(s):
    try:
        return eval(s, {}, {})
    except SyntaxError, e:
        raise ValueError("not a number")
    except NameError, e:
        raise ValueError("not a number")
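
# Illustrative examples (not part of the original source):
#   my_int('26')    -> 26
#   my_int('0x1a')  -> 26
#   my_int('elan')  -> raises ValueError("not a number") (via NameError)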

# ============================================================
# locally defined exceptions
class CommandError (exceptions.Exception):
    def __init__(self, cmd_name, cmd_err, rc=None):
        self.cmd_name = cmd_name
        self.cmd_err = cmd_err
        self.rc = rc

    def dump(self):
        if type(self.cmd_err) == types.StringType:
            if self.rc:
                print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
            else:
                print "! %s: %s" % (self.cmd_name, self.cmd_err)
        elif type(self.cmd_err) == types.ListType:
            if self.rc:
                print "! %s (error %d):" % (self.cmd_name, self.rc)
            else:
                print "! %s:" % (self.cmd_name)
            for s in self.cmd_err:
                print "> %s" %(string.strip(s))
        else:
            print self.cmd_err

# ============================================================
# handle daemons, like the acceptor
class DaemonHandler:
    """ Manage starting and stopping a daemon. Assumes daemon manages
    its own pid file. """

    def __init__(self, cmd):
        self.command = cmd
        self.path = None

    def start(self):
        if self.running():
            log(self.command, "already running.")
        if not self.path:
            self.path = find_prog(self.command)
        if not self.path:
            panic(self.command, "not found.")
        ret, out = runcmd(self.path +' '+ self.command_line())
        if ret:
            raise CommandError(self.path, out, ret)

    def stop(self):
        if self.running():
            pid = self.read_pidfile()
            try:
                log ("killing process", pid)
                os.kill(pid, 15)
                #time.sleep(1) # let daemon die
            except OSError, e:
                log("unable to kill", self.command, e)
            if self.running():
                log("unable to kill", self.command)

    def running(self):
        pid = self.read_pidfile()
        if pid:
            try:
                os.kill(pid, 0)
            except OSError:
                self.clean_pidfile()
            else:
                return 1
        return 0

    def read_pidfile(self):
        try:
            fp = open(self.pidfile(), 'r')
            pid = int(fp.read())
            fp.close()
            return pid
        except IOError:
            return 0

    def clean_pidfile(self):
        """ Remove a stale pidfile """
        log("removing stale pidfile:", self.pidfile())
        try:
            os.unlink(self.pidfile())
        except OSError, e:
            log(self.pidfile(), e)

class AcceptorHandler(DaemonHandler):
    def __init__(self, port, net_type, send_mem, recv_mem, irq_aff):
        DaemonHandler.__init__(self, "acceptor")
        self.port = port
        self.flags = ''
        self.send_mem = send_mem
        self.recv_mem = recv_mem

        if net_type == 'toe':
            self.flags = self.flags + ' -N 4'
        if irq_aff:
            self.flags = self.flags + ' -i'

    def pidfile(self):
        return "/var/run/%s-%d.pid" % (self.command, self.port)

    def command_line(self):
        return string.join(map(str,('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port)))
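
# Illustrative sketch (port number is hypothetical): for a tcp network on
# port 988 with irq affinity enabled and default buffers, start() runs
# roughly:
#   acceptor -s 8388608 -r 8388608 -i 988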

acceptors = {}

# start the acceptors
def run_acceptors():
    if config.lctl_dump or config.record:
        return
    for port in acceptors.keys():
        daemon = acceptors[port]
        if not daemon.running():
            daemon.start()

def run_one_acceptor(port):
    if config.lctl_dump or config.record:
        return
    if acceptors.has_key(port):
        daemon = acceptors[port]
        if not daemon.running():
            daemon.start()
    else:
        panic("run_one_acceptor: No acceptor defined for port:", port)

def stop_acceptor(port):
    if acceptors.has_key(port):
        daemon = acceptors[port]
        daemon.stop()

# ============================================================
# handle lctl interface
class LCTLInterface:
    """
    Manage communication with lctl
    """

    def __init__(self, cmd):
        """
        Initialize by finding the lctl binary.
        """
        self.lctl = find_prog(cmd)
        self.save_file = ''
        self.record_device = ''
        if not self.lctl:
            if config.noexec:
                debug('! lctl not found')
                self.lctl = 'lctl'
            else:
                raise CommandError('lctl', "unable to find lctl binary.")

    def use_save_file(self, file):
        self.save_file = file

    def record(self, dev_name, logname):
        log("Recording log", logname, "on", dev_name)
        self.record_device = dev_name
        self.record_log = logname

    def end_record(self):
        log("End recording log", self.record_log, "on", self.record_device)
        self.record_device = None
        self.record_log = None

    def set_nonblock(self, fd):
        fl = fcntl.fcntl(fd, F_GETFL)
        fcntl.fcntl(fd, F_SETFL, fl | os.O_NDELAY)

    def run(self, cmds):
        """
        run lctl
        the cmds are written to stdin of lctl
        lctl doesn't return errors when run in script mode, so
        stderr is checked
        should modify command line to accept multiple commands, or
        create complex command line options
        """
        cmd_line = self.lctl
        if self.save_file:
            cmds = '\n dump ' + self.save_file + '\n' + cmds
        elif self.record_device:
            cmds = """
    device $%s
    record %s
    %s""" % (self.record_device, self.record_log, cmds)

        debug("+", cmd_line, cmds)
        if config.noexec: return (0, [])

        child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
        child.tochild.write(cmds + "\n")
        child.tochild.close()

        # From "Python Cookbook" from O'Reilly
        outfile = child.fromchild
        outfd = outfile.fileno()
        self.set_nonblock(outfd)
        errfile = child.childerr
        errfd = errfile.fileno()
        self.set_nonblock(errfd)

        outdata = errdata = ''
        outeof = erreof = 0
        while 1:
            ready = select.select([outfd,errfd],[],[]) # Wait for input
            if outfd in ready[0]:
                outchunk = outfile.read()
                if outchunk == '': outeof = 1
                outdata = outdata + outchunk
            if errfd in ready[0]:
                errchunk = errfile.read()
                if errchunk == '': erreof = 1
                errdata = errdata + errchunk
            if outeof and erreof: break
        # end of "borrowed" code

        ret = child.wait()
        if os.WIFEXITED(ret):
            rc = os.WEXITSTATUS(ret)
        else:
            rc = 0
        if rc or len(errdata):
            raise CommandError(self.lctl, errdata, rc)
        return rc, outdata
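
    # Illustrative sketch (not part of the original source): callers of
    # run() feed a small script to lctl's stdin, e.g. network() below
    # effectively executes something like:
    #   lctl << EOF
    #   network tcp
    #   mynid 192.168.0.10
    #   quit
    #   EOF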

    def runcmd(self, *args):
        """
        run lctl using the command line
        """
        cmd = string.join(map(str,args))
        debug("+", self.lctl, cmd)
        rc, out = run(self.lctl, cmd)
        if rc:
            raise CommandError(self.lctl, out, rc)
        return rc, out

    def network(self, net, nid):
        """ set mynid """
        cmds = """
  network %s
  mynid %s
  quit """ % (net, nid)
        self.run(cmds)

    # create a new connection
    def add_uuid(self, net_type, uuid, nid):
        cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type)
        self.run(cmds)

    def add_autoconn(self, net_type, send_mem, recv_mem, nid, hostaddr,
                     port, flags):
        if net_type in ('tcp', 'toe') and not config.lctl_dump:
            cmds = """
  network %s
  send_mem %d
  recv_mem %d
  add_autoconn %s %s %d %s
  quit""" % (net_type,
             send_mem,
             recv_mem,
             nid, hostaddr, port, flags )
            self.run(cmds)

    def connect(self, srv):
        self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
        if srv.net_type in ('tcp', 'toe') and not config.lctl_dump:
            flags = 's'
            if srv.irq_affinity:
                flags = flags + 'i'
            self.add_autoconn(srv.net_type, srv.send_mem, srv.recv_mem,
                              srv.nid, srv.hostaddr, srv.port, flags)

    # recover a device
    def recover(self, dev_name, new_conn):
        cmds = """
    device $%s
    recover %s""" %(dev_name, new_conn)
        self.run(cmds)

    # add a route to a range
    def add_route(self, net, gw, lo, hi):
        cmds = """
  network %s
  add_route %s %s %s
  quit  """ % (net, gw, lo, hi)
        try:
            self.run(cmds)
        except CommandError, e:
            log("ignore: ")
            e.dump()

    def del_route(self, net, gw, lo, hi):
        cmds = """
  ignore_errors
  network %s
  del_route %s %s %s
  quit """ % (net, gw, lo, hi)
        self.run(cmds)

    # add a route to a host
    def add_route_host(self, net, uuid, gw, tgt):
        self.add_uuid(net, uuid, tgt)
        cmds = """
  network %s
  add_route %s %s
  quit """ % (net, gw, tgt)
        try:
            self.run(cmds)
        except CommandError, e:
            log("ignore: ")
            e.dump()

    # delete a route to a host
    def del_route_host(self, net, uuid, gw, tgt):
        self.del_uuid(uuid)
        cmds = """
  ignore_errors
  network %s
  del_route %s %s
  quit """ % (net, gw, tgt)
        self.run(cmds)

    def del_autoconn(self, net_type, nid, hostaddr):
        if net_type in ('tcp', 'toe') and not config.lctl_dump:
            cmds = """
  ignore_errors
  network %s
  del_autoconn %s %s s
  quit""" % (net_type, nid, hostaddr)
            self.run(cmds)

    # disconnect one connection
    def disconnect(self, srv):
        self.del_uuid(srv.nid_uuid)
        if srv.net_type in ('tcp', 'toe') and not config.lctl_dump:
            self.del_autoconn(srv.net_type, srv.nid, srv.hostaddr)

    def del_uuid(self, uuid):
        cmds = """
  ignore_errors
  del_uuid %s
  quit""" % (uuid,)
        self.run(cmds)

    def disconnectAll(self, net):
        cmds = """
  ignore_errors
  network %s
  disconnect
  quit""" % (net)
        self.run(cmds)

    def attach(self, type, name, uuid):
        cmds = """
  attach %s %s %s
  quit""" % (type, name, uuid)
        self.run(cmds)

    def setup(self, name, setup = ""):
        cmds = """
  cfg_device %s
  setup %s
  quit""" % (name, setup)
        self.run(cmds)

    # create a new device with lctl
    def newdev(self, type, name, uuid, setup = ""):
        self.attach(type, name, uuid)
        try:
            self.setup(name, setup)
        except CommandError, e:
            self.cleanup(name, uuid, 0)
            raise e

    # cleanup a device
    def cleanup(self, name, uuid, force, failover = 0):
        if failover: force = 1
        cmds = """
  ignore_errors
  cfg_device $%s
  cleanup %s %s
  detach
  quit""" % (name, ('', 'force')[force],
             ('', 'failover')[failover])
        self.run(cmds)

    # create an lov
    def lov_setup(self, name, uuid, desc_uuid, mdsuuid, stripe_cnt,
                  stripe_sz, stripe_off,
                  pattern, devlist):
        cmds = """
  attach lov %s %s
  lov_setup %s %d %d %d %s %s
  quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off,
             pattern, devlist)
        self.run(cmds)

    # create an lov
    def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off,
                      pattern, devlist):
        cmds = """
  cfg_device $%s
  lov_setconfig %s %d %d %d %s %s
  quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
        self.run(cmds)

    # dump the log file
    def dump(self, dump_file):
        cmds = """
  debug_kernel %s 1
  quit""" % (dump_file)
        self.run(cmds)

    # get list of devices
    def device_list(self):
        devices = '/proc/fs/lustre/devices'
        ret = []
        if os.access(devices, os.R_OK):
            try:
                fp = open(devices, 'r')
                ret = fp.readlines()
                fp.close()
            except IOError, e:
                log(e)
        return ret

    # get lustre version
    def lustre_version(self):
        rc, out = self.runcmd('version')
        return out

    # dump mount options
    def mount_option(self, profile, osc, mdc):
        cmds = """
  mount_option %s %s %s
  quit""" % (profile, osc, mdc)
        self.run(cmds)

    # delete mount options
    def del_mount_option(self, profile):
        cmds = """
  del_mount_option %s
  quit""" % (profile,)
        self.run(cmds)

    # set the recovery timeout
    def set_timeout(self, timeout):
        cmds = """
  set_timeout %s
  quit""" % (timeout,)
        self.run(cmds)

    # set the lustre upcall
    def set_lustre_upcall(self, upcall):
        cmds = """
  set_lustre_upcall %s
  quit""" % (upcall,)
        self.run(cmds)

# ============================================================
# Various system-level functions
# (ideally moved to their own module)

# Run a command and return the output and status.
# stderr is merged into stdout; could use popen3 to
# keep it separate if necessary
def runcmd(cmd):
    debug("+", cmd)
    if config.noexec: return (0, [])
    f = os.popen(cmd + ' 2>&1')
    out = f.readlines()
    ret = f.close()
    if ret:
        ret = ret >> 8
    else:
        ret = 0
    return (ret, out)

def run(*args):
    cmd = string.join(map(str,args))
    return runcmd(cmd)

# Run a command in the background.
def run_daemon(*args):
    cmd = string.join(map(str,args))
    debug("+", cmd)
    if config.noexec: return 0
    f = os.popen(cmd + ' 2>&1')
    ret = f.close()
    if ret:
        ret = ret >> 8
    else:
        ret = 0
    return ret

# Determine full path to use for an external command
# searches dirname(argv[0]) first, then PATH
def find_prog(cmd):
    syspath = string.split(os.environ['PATH'], ':')
    cmdpath = os.path.dirname(sys.argv[0])
    syspath.insert(0, cmdpath)
    if config.portals:
        syspath.insert(0, os.path.join(config.portals, 'utils/'))
    for d in syspath:
        prog = os.path.join(d,cmd)
        if os.access(prog, os.X_OK):
            return prog
    return ''

# Recursively look for file starting at base dir
def do_find_file(base, mod):
    fullname = os.path.join(base, mod)
    if os.access(fullname, os.R_OK):
        return fullname
    for d in os.listdir(base):
        dir = os.path.join(base,d)
        if os.path.isdir(dir):
            module = do_find_file(dir, mod)
            if module:
                return module

def find_module(src_dir, dev_dir, modname):
    mod = '%s.o' % (modname)
    module = src_dir +'/'+ dev_dir +'/'+ mod
    try:
        if os.access(module, os.R_OK):
            return module
    except OSError:
        pass
    return None

# is the path a block device?
def is_block(path):
    s = ()
    try:
        s = os.stat(path)
    except OSError:
        return 0
    return stat.S_ISBLK(s[stat.ST_MODE])

# build fs according to type
def mkfs(dev, devsize, fstype, jsize, mkfsoptions, isblock=1):
    block_cnt = ''
    jopt = ''
    if devsize:
        if devsize < 8000:
            panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
                  (dev, devsize))
        # devsize is in 1k, and fs block count is in 4k
        block_cnt = devsize/4

    if fstype in ('ext3', 'extN'):
        # ext3 journal size is in megabytes
        if jsize: jopt = "-J size=%d" %(jsize,)
        mkfs = 'mkfs.ext2 -j -b 4096 '
        if not isblock or config.force:
            mkfs = mkfs + ' -F '
    elif fstype == 'reiserfs':
        # reiserfs journal size is in blocks
        if jsize: jopt = "--journal_size %d" %(jsize,)
        mkfs = 'mkreiserfs -ff'
    else:
        panic('unsupported fs type: ', fstype)

    if config.mkfsoptions != None:
        mkfs = mkfs + ' ' + config.mkfsoptions
    if mkfsoptions != None:
        mkfs = mkfs + ' ' + mkfsoptions
    (ret, out) = run (mkfs, jopt, dev, block_cnt)
    if ret:
        panic("Unable to build fs:", dev, string.join(out))
    # enable hash tree indexing on the fs
    if fstype in ('ext3', 'extN'):
        htree = 'echo "feature FEATURE_C5" | debugfs -w'
        (ret, out) = run (htree, dev)
        if ret:
            panic("Unable to enable htree:", dev)

# some systems use /dev/loopN, some /dev/loop/N
def loop_base():
    loop = '/dev/loop'
    if not os.access(loop + str(0), os.R_OK):
        loop = loop + '/'
        if not os.access(loop + str(0), os.R_OK):
            panic ("can't access loop devices")
    return loop

# find loop device assigned to the file
def find_loop(file):
    loop = loop_base()
    for n in xrange(0, MAX_LOOP_DEVICES):
        dev = loop + str(n)
        if os.access(dev, os.R_OK):
            (stat, out) = run('losetup', dev)
            if out and stat == 0:
                m = re.search(r'\((.*)\)', out[0])
                if m and file == m.group(1):
                    return dev
        else:
            break
    return ''

# create file if necessary and assign the first free loop device
def init_loop(file, size, fstype, journal_size, mkfsoptions, reformat):
    dev = find_loop(file)
    if dev:
        print 'WARNING file:', file, 'already mapped to', dev
        return dev
    if reformat or not os.access(file, os.R_OK | os.W_OK):
        if size < 8000:
            panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (file, size))
        (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size,
                                                                        file))
        if ret:
            panic("Unable to create backing store:", file)
        mkfs(file, size, fstype, journal_size, mkfsoptions, isblock=0)

    loop = loop_base()
    # find next free loop
    for n in xrange(0, MAX_LOOP_DEVICES):
        dev = loop + str(n)
        if os.access(dev, os.R_OK):
            (stat, out) = run('losetup', dev)
            if stat:
                run('losetup', dev, file)
                return dev
        else:
            print "out of loop devices"
            return ''
    print "out of loop devices"
    return ''

# undo loop assignment
def clean_loop(file):
    dev = find_loop(file)
    if dev:
        ret, out = run('losetup -d', dev)
        if ret:
            log('unable to clean loop device:', dev, 'for file:', file)
            logall(out)

# determine if dev is formatted as a <fstype> filesystem
def need_format(fstype, dev):
    # FIXME don't know how to implement this
    return 0

# initialize a block device if needed
def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
              mkfsoptions):
    if config.noexec: return dev
    if not is_block(dev):
        dev = init_loop(dev, size, fstype, journal_size, mkfsoptions, reformat)
    elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
        mkfs(dev, size, fstype, journal_size, mkfsoptions, isblock=0)
    return dev
#    else:
#        panic("device:", dev,
#              "not prepared, and autoformat is not set.\n",
#              "Rerun with --reformat option to format ALL filesystems")
839 """lookup IP address for an interface"""
840 rc, out = run("/sbin/ifconfig", iface)
843 addr = string.split(out[1])[1]
844 ip = string.split(addr, ':')[1]
847 def sys_get_local_nid(net_type, wildcard, cluster_id):
848 """Return the local nid."""
850 if os.access('/proc/elan/device0/position', os.R_OK):
851 local = sys_get_local_address('elan', '*', cluster_id)
853 local = sys_get_local_address(net_type, wildcard, cluster_id)
856 def sys_get_local_address(net_type, wildcard, cluster_id):
857 """Return the local address for the network type."""
859 if net_type in ('tcp', 'toe'):
861 iface, star = string.split(wildcard, ':')
862 local = if2addr(iface)
864 panic ("unable to determine ip for:", wildcard)
866 host = socket.gethostname()
867 local = socket.gethostbyname(host)
868 elif net_type == 'elan':
869 # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
871 fp = open('/proc/elan/device0/position', 'r')
872 lines = fp.readlines()
880 nid = my_int(cluster_id) + my_int(elan_id)
882 except ValueError, e:
886 elif net_type == 'gm':
887 fixme("automatic local address for GM")
888 elif net_type == 'scimac':
889 scinode="/opt/scali/sbin/scinode"
890 if os.path.exists(scinode):
891 (rc,local) = run(scinode)
893 panic (scinode, " not found on node with scimac networking")
895 panic (scinode, " failed")
896 local=string.rstrip(local[0])

def mod_loaded(modname):
    """Check if a module is already loaded. Look in /proc/modules for it."""
    try:
        fp = open('/proc/modules')
        lines = fp.readlines()
        fp.close()
        # please forgive my tired fingers for this one
        ret = filter(lambda word, mod=modname: word == mod,
                     map(lambda line: string.split(line)[0], lines))
        return ret
    except Exception, e:
        return 0
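
# Illustrative sketch (not part of the original source): /proc/modules
# lines start with the module name, e.g. "ksocknal 40960 0 (unused)",
# so matching on the first column means mod_loaded('ksocknal') returns a
# non-empty (true) list exactly when the module is loaded.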

# XXX: instead of device_list, ask for $name and see what we get
def is_prepared(name):
    """Return true if a device exists for the name"""
    if config.lctl_dump:
        return 0
    if (config.noexec or config.record) and config.cleanup:
        return 1
    try:
        # expect this format:
        # 1 UP ldlm ldlm ldlm_UUID 2
        out = lctl.device_list()
        for s in out:
            if name == string.split(s)[3]:
                return 1
    except CommandError, e:
        e.dump()
    return 0

def is_network_prepared():
    """If any device exists, then assume that all networking
       has been configured"""
    out = lctl.device_list()
    return len(out) > 0

def fs_is_mounted(path):
    """Return true if path is a mounted lustre filesystem"""
    try:
        fp = open('/proc/mounts')
        lines = fp.readlines()
        fp.close()
        for l in lines:
            a = string.split(l)
            if a[1] == path and a[2] == 'lustre_lite':
                return 1
    except IOError, e:
        log(e)
    return 0
953 """Manage kernel modules"""
954 def __init__(self, lustre_dir, portals_dir):
955 self.lustre_dir = lustre_dir
956 self.portals_dir = portals_dir
957 self.kmodule_list = []
959 def add_portals_module(self, dev_dir, modname):
960 """Append a module to list of modules to load."""
961 self.kmodule_list.append((self.portals_dir, dev_dir, modname))
963 def add_lustre_module(self, dev_dir, modname):
964 """Append a module to list of modules to load."""
965 self.kmodule_list.append((self.lustre_dir, dev_dir, modname))
967 def load_module(self):
968 """Load all the modules in the list in the order they appear."""
969 for src_dir, dev_dir, mod in self.kmodule_list:
970 if mod_loaded(mod) and not config.noexec:
972 log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
974 module = find_module(src_dir, dev_dir, mod)
976 panic('module not found:', mod)
977 (rc, out) = run('/sbin/insmod', module)
979 raise CommandError('insmod', out, rc)
981 (rc, out) = run('/sbin/modprobe', mod)
983 raise CommandError('modprobe', out, rc)
985 def cleanup_module(self):
986 """Unload the modules in the list in reverse order."""
987 rev = self.kmodule_list
989 for src_dir, dev_dir, mod in rev:
990 if not mod_loaded(mod) and not config.noexec:
993 if mod == 'portals' and config.dump:
994 lctl.dump(config.dump)
995 log('unloading module:', mod)
996 (rc, out) = run('/sbin/rmmod', mod)
998 log('! unable to unload module:', mod)

# ============================================================
# Classes to prepare and cleanup the various objects
#
class Module:
    """ Base class for the rest of the modules. The default cleanup method is
    defined here, as well as some utility funcs.
    """
    def __init__(self, module_name, db):
        self.db = db
        self.module_name = module_name
        self.name = self.db.getName()
        self.uuid = self.db.getUUID()
        self.kmod = kmod(config.lustre, config.portals)

    def info(self, *args):
        msg = string.join(map(str,args))
        print self.module_name + ":", self.name, self.uuid, msg

    def cleanup(self):
        """ default cleanup, used for most modules """
        self.info()
        try:
            lctl.cleanup(self.name, self.uuid, config.force)
        except CommandError, e:
            log(self.module_name, "cleanup failed: ", self.name)
            e.dump()
            cleanup_error(e.rc)

    def add_portals_module(self, dev_dir, modname):
        """Append a module to list of modules to load."""
        self.kmod.add_portals_module(dev_dir, modname)

    def add_lustre_module(self, dev_dir, modname):
        """Append a module to list of modules to load."""
        self.kmod.add_lustre_module(dev_dir, modname)

    def load_module(self):
        """Load all the modules in the list in the order they appear."""
        self.kmod.load_module()

    def cleanup_module(self):
        """Unload the modules in the list in reverse order."""
        if self.safe_to_clean():
            self.kmod.cleanup_module()

    def safe_to_clean(self):
        return 1

    def safe_to_clean_modules(self):
        return self.safe_to_clean()

class Network(Module):
    def __init__(self,db):
        Module.__init__(self, 'NETWORK', db)
        self.net_type = self.db.get_val('nettype')
        self.nid = self.db.get_val('nid', '*')
        self.cluster_id = self.db.get_val('clusterid', "0")
        self.port = self.db.get_val_int('port', 0)
        self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
        self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
        self.irq_affinity = self.db.get_val_int('irqaffinity', 0)

        if '*' in self.nid:
            self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
            if not self.nid:
                panic("unable to set nid for", self.net_type, self.nid, self.cluster_id)
            self.generic_nid = 1
            debug("nid:", self.nid)
        else:
            self.generic_nid = 0

        self.nid_uuid = self.nid_to_uuid(self.nid)

        self.hostaddr = self.db.get_val('hostaddr', self.nid)
        if '*' in self.hostaddr:
            self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id)
            if not self.hostaddr:
                panic("unable to set hostaddr for", self.net_type, self.hostaddr, self.cluster_id)
            debug("hostaddr:", self.hostaddr)

        self.add_portals_module("libcfs", 'portals')
        if node_needs_router():
            self.add_portals_module("router", 'kptlrouter')
        if self.net_type == 'tcp':
            self.add_portals_module("knals/socknal", 'ksocknal')
        if self.net_type == 'toe':
            self.add_portals_module("knals/toenal", 'ktoenal')
        if self.net_type == 'elan':
            self.add_portals_module("knals/qswnal", 'kqswnal')
        if self.net_type == 'gm':
            self.add_portals_module("knals/gmnal", 'kgmnal')
        if self.net_type == 'scimac':
            self.add_portals_module("knals/scimacnal", 'kscimacnal')

    def nid_to_uuid(self, nid):
        return "NID_%s_UUID" %(nid,)

    def prepare(self):
        if is_network_prepared():
            return
        self.info(self.net_type, self.nid, self.port)
        if not (config.record and self.generic_nid):
            lctl.network(self.net_type, self.nid)
        if self.net_type == 'tcp':
            sys_tweak_socknal()
        if self.net_type == 'elan':
            sys_optimize_elan()
        if self.port and node_is_router():
            run_one_acceptor(self.port)
            self.connect_peer_gateways()

    def connect_peer_gateways(self):
        for router in self.db.lookup_class('node'):
            if router.get_val_int('router', 0):
                for netuuid in router.get_networks():
                    net = self.db.lookup(netuuid)
                    gw = Network(net)
                    if (gw.cluster_id == self.cluster_id and
                        gw.net_type == self.net_type):
                        if gw.nid != self.nid:
                            lctl.connect(gw)

    def disconnect_peer_gateways(self):
        for router in self.db.lookup_class('node'):
            if router.get_val_int('router', 0):
                for netuuid in router.get_networks():
                    net = self.db.lookup(netuuid)
                    gw = Network(net)
                    if (gw.cluster_id == self.cluster_id and
                        gw.net_type == self.net_type):
                        if gw.nid != self.nid:
                            try:
                                lctl.disconnect(gw)
                            except CommandError, e:
                                print "disconnect failed: ", self.name
                                e.dump()
                                cleanup_error(e.rc)

    def safe_to_clean(self):
        return not is_network_prepared()

    def cleanup(self):
        self.info(self.net_type, self.nid, self.port)
        if self.port:
            stop_acceptor(self.port)
        if node_is_router():
            self.disconnect_peer_gateways()

class RouteTable(Module):
    def __init__(self,db):
        Module.__init__(self, 'ROUTES', db)

    def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
                         lo, hi):
        # only setup connections for tcp NALs
        srvdb = None
        if not net_type in ('tcp', 'toe'):
            return None

        # connect to target if route is to single node and this node is the gw
        if lo == hi and local_interface(net_type, gw_cluster_id, gw):
            if not local_cluster(net_type, tgt_cluster_id):
                panic("target", lo, "not on the local cluster")
            srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
        # connect to gateway if this node is not the gw
        elif (local_cluster(net_type, gw_cluster_id)
              and not local_interface(net_type, gw_cluster_id, gw)):
            srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
        else:
            return None

        if not srvdb:
            panic("no server for nid", lo)
            return None

        return Network(srvdb)

    def prepare(self):
        if is_network_prepared():
            return
        self.info()
        for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
            lctl.add_route(net_type, gw, lo, hi)
            srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
            if srv:
                lctl.connect(srv)

    def safe_to_clean(self):
        return not is_network_prepared()

    def cleanup(self):
        if is_network_prepared():
            # the network is still being used, don't clean it up
            return
        for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
            srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
            if srv:
                try:
                    lctl.disconnect(srv)
                except CommandError, e:
                    print "disconnect failed: ", self.name
                    e.dump()
                    cleanup_error(e.rc)
            try:
                lctl.del_route(net_type, gw, lo, hi)
            except CommandError, e:
                print "del_route failed: ", self.name
                e.dump()
                cleanup_error(e.rc)

class Management(Module):
    def __init__(self, db):
        Module.__init__(self, 'MGMT', db)
        self.add_lustre_module('lvfs', 'lvfs')
        self.add_lustre_module('obdclass', 'obdclass')
        self.add_lustre_module('ptlrpc', 'ptlrpc')
        self.add_lustre_module('mgmt', 'mgmt_svc')

    def prepare(self):
        if is_prepared(self.name):
            return
        self.info()
        lctl.newdev("mgmt", self.name, self.uuid)

    def safe_to_clean(self):
        return 1

    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)

# This is only needed to load the modules; the LDLM device
# is now created automatically.
class LDLM(Module):
    def __init__(self,db):
        Module.__init__(self, 'LDLM', db)
        self.add_lustre_module('lvfs', 'lvfs')
        self.add_lustre_module('obdclass', 'obdclass')
        self.add_lustre_module('ptlrpc', 'ptlrpc')

    def prepare(self):
        return

    def cleanup(self):
        return

class LOV(Module):
    def __init__(self, db, uuid, fs_name, name_override = None):
        Module.__init__(self, 'LOV', db)
        if name_override != None:
            self.name = "lov_%s" % name_override
        self.add_lustre_module('lov', 'lov')
        self.mds_uuid = self.db.get_first_ref('mds')
        mds = self.db.lookup(self.mds_uuid)
        self.mds_name = mds.getName()
        self.stripe_sz = self.db.get_val_int('stripesize', 65536)
        self.stripe_off = self.db.get_val_int('stripeoffset', 0)
        self.pattern = self.db.get_val_int('stripepattern', 0)
        self.devlist = self.db.get_refs('obd')
        self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
        self.osclist = []
        self.desc_uuid = self.uuid
        self.uuid = generate_client_uuid(self.name)
        self.fs_name = fs_name
        for obd_uuid in self.devlist:
            obd = self.db.lookup(obd_uuid)
            osc = get_osc(obd, self.uuid, fs_name)
            if osc:
                self.osclist.append(osc)
            else:
                panic('osc not found:', obd_uuid)

    def prepare(self):
        if is_prepared(self.name):
            return
        for osc in self.osclist:
            try:
                # Only ignore connect failures with --force, which
                # isn't implemented here yet.
                osc.prepare(ignore_connect_failure=0)
            except CommandError, e:
                print "Error preparing OSC %s\n" % osc.uuid
                raise e
        self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
                  self.stripe_off, self.pattern, self.devlist, self.mds_name)
        lctl.lov_setup(self.name, self.uuid,
                       self.desc_uuid, self.mds_name, self.stripe_cnt,
                       self.stripe_sz, self.stripe_off, self.pattern,
                       string.join(self.devlist))

    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
        for osc in self.osclist:
            osc.cleanup()

    def load_module(self):
        for osc in self.osclist:
            osc.load_module()
            break
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        for osc in self.osclist:
            osc.cleanup_module()
            break

class MDSDEV(Module):
    def __init__(self,db):
        Module.__init__(self, 'MDSDEV', db)
        self.devpath = self.db.get_val('devpath','')
        self.size = self.db.get_val_int('devsize', 0)
        self.journal_size = self.db.get_val_int('journalsize', 0)
        self.fstype = self.db.get_val('fstype', '')
        self.nspath = self.db.get_val('nspath', '')
        self.mkfsoptions = self.db.get_val('mkfsoptions', '')
        # overwrite the original MDSDEV name and uuid with the MDS name and uuid
        target_uuid = self.db.get_first_ref('target')
        mds = self.db.lookup(target_uuid)
        self.name = mds.getName()
        self.filesystem_uuids = mds.get_refs('filesystem')
        # FIXME: if fstype not set, then determine based on kernel version
        self.format = self.db.get_val('autoformat', "no")
        if mds.get_val('failover', 0):
            self.failover_mds = 'f'
        else:
            self.failover_mds = 'n'
        active_uuid = get_active_target(mds)
        if not active_uuid:
            panic("No target device found:", target_uuid)
        if active_uuid == self.uuid:
            self.active = 1
        else:
            self.active = 0
        if self.active and config.group and config.group != mds.get_val('group'):
            self.active = 0

        self.target_dev_uuid = self.uuid
        self.uuid = target_uuid
        # modules
        self.add_lustre_module('mdc', 'mdc')
        self.add_lustre_module('osc', 'osc')
        self.add_lustre_module('lov', 'lov')
        self.add_lustre_module('mds', 'mds')
        if self.fstype:
            self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))

    def load_module(self):
        if self.active:
            Module.load_module(self)

    def prepare(self):
        if is_prepared(self.name):
            return
        if not self.active:
            debug(self.uuid, "not active")
            return
        if config.reformat:
            # run write_conf automatically, if --reformat used
            self.write_conf()
        self.info(self.devpath, self.fstype, self.size, self.format)
        run_acceptors()
        # never reformat here
        blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
                           self.format, self.journal_size, self.mkfsoptions)
        if not is_prepared('MDT'):
            lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
        try:
            lctl.newdev("mds", self.name, self.uuid,
                        setup ="%s %s %s" %(blkdev, self.fstype, self.name))
        except CommandError, e:
            if e.rc == 2:
                panic("MDS is missing the config log. Need to run " +
                      "lconf --write_conf.")
            else:
                raise e

    def write_conf(self):
        if is_prepared(self.name):
            return
        self.info(self.devpath, self.fstype, self.format)
        blkdev = block_dev(self.devpath, self.size, self.fstype,
                           config.reformat, self.format, self.journal_size,
                           self.mkfsoptions)
        lctl.newdev("mds", self.name, self.uuid,
                    setup ="%s %s" %(blkdev, self.fstype))

        # record logs for the MDS lov
        for uuid in self.filesystem_uuids:
            log("recording clients for filesystem:", uuid)
            fs = self.db.lookup(uuid)
            obd_uuid = fs.get_first_ref('obd')
            client_uuid = generate_client_uuid(self.name)
            client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
                          self.name)
            config.record = 1
            lctl.record(self.name, self.name)
            client.prepare()
            lctl.mount_option(self.name, client.get_name(), "")
            lctl.end_record()

            config.cleanup = 1
            lctl.record(self.name, self.name + '-clean')
            client.cleanup()
            lctl.del_mount_option(self.name)
            lctl.end_record()
            config.cleanup = 0
            config.record = 0

        # record logs for each client
        if config.ldapurl:
            config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
        else:
            config_options = CONFIG_FILE

        for node_db in self.db.lookup_class('node'):
            client_name = node_db.getName()
            for prof_uuid in node_db.get_refs('profile'):
                prof_db = node_db.lookup(prof_uuid)
                # refactor this into a function to test "clientness"
                # of a node.
                for ref_class, ref_uuid in prof_db.get_all_refs():
                    if ref_class in ('mountpoint','echoclient'):
                        debug("recording", client_name)
                        old_noexec = config.noexec
                        config.noexec = 0
                        noexec_opt = ('', '-n')
                        ret, out = run (sys.argv[0],
                                        noexec_opt[old_noexec == 1],
                                        " -v --record --nomod",
                                        "--record_log", client_name,
                                        "--record_device", self.name,
                                        "--node", client_name,
                                        config_options)
                        if config.verbose:
                            for s in out: log("record> ", string.strip(s))
                        ret, out = run (sys.argv[0],
                                        noexec_opt[old_noexec == 1],
                                        "--cleanup -v --record --nomod",
                                        "--record_log", client_name + "-clean",
                                        "--record_device", self.name,
                                        "--node", client_name,
                                        config_options)
                        if config.verbose:
                            for s in out: log("record> ", string.strip(s))
                        config.noexec = old_noexec

        try:
            lctl.cleanup(self.name, self.uuid, 0, 0)
        except CommandError, e:
            log(self.module_name, "cleanup failed: ", self.name)
            e.dump()
            cleanup_error(e.rc)
        Module.cleanup(self)
        clean_loop(self.devpath)

    def mds_remaining(self):
        out = lctl.device_list()
        for s in out:
            if string.split(s)[2] in ('mds',):
                return 1
        return 0

    def safe_to_clean(self):
        return self.active

    def safe_to_clean_modules(self):
        return not self.mds_remaining()

    def cleanup(self):
        if not self.active:
            debug(self.uuid, "not active")
            return
        self.info()
        if is_prepared(self.name):
            try:
                lctl.cleanup(self.name, self.uuid, config.force,
                             config.failover)
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
                e.dump()
                cleanup_error(e.rc)
                Module.cleanup(self)
        if not self.mds_remaining() and is_prepared('MDT'):
            try:
                lctl.cleanup("MDT", "MDT_UUID", config.force,
                             config.failover)
            except CommandError, e:
                print "cleanup failed: ", self.name
                e.dump()
                cleanup_error(e.rc)
        clean_loop(self.devpath)

class OSD(Module):
    def __init__(self, db):
        Module.__init__(self, 'OSD', db)
        self.osdtype = self.db.get_val('osdtype')
        self.devpath = self.db.get_val('devpath', '')
        self.size = self.db.get_val_int('devsize', 0)
        self.journal_size = self.db.get_val_int('journalsize', 0)
        self.mkfsoptions = self.db.get_val('mkfsoptions', '')
        self.fstype = self.db.get_val('fstype', '')
        self.nspath = self.db.get_val('nspath', '')
        target_uuid = self.db.get_first_ref('target')
        ost = self.db.lookup(target_uuid)
        self.name = ost.getName()
        self.format = self.db.get_val('autoformat', 'yes')
        if ost.get_val('failover', 0):
            self.failover_ost = 'f'
        else:
            self.failover_ost = 'n'

        active_uuid = get_active_target(ost)
        if not active_uuid:
            panic("No target device found:", target_uuid)
        if active_uuid == self.uuid:
            self.active = 1
        else:
            self.active = 0
        if self.active and config.group and config.group != ost.get_val('group'):
            self.active = 0

        self.target_dev_uuid = self.uuid
        self.uuid = target_uuid
        # modules
        self.add_lustre_module('ost', 'ost')
        # FIXME: should we default to ext3 here?
        if self.fstype:
            self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
        self.add_lustre_module(self.osdtype, self.osdtype)

    def load_module(self):
        if self.active:
            Module.load_module(self)

    # need to check /proc/mounts and /etc/mtab before
    # formatting anything.
    # FIXME: check if device is already formatted.
    def prepare(self):
        if is_prepared(self.name):
            return
        if not self.active:
            debug(self.uuid, "not active")
            return
        self.info(self.osdtype, self.devpath, self.size, self.fstype,
                  self.format, self.journal_size)
        run_acceptors()
        if self.osdtype == 'obdecho':
            blkdev = ''
        else:
            blkdev = block_dev(self.devpath, self.size, self.fstype,
                               config.reformat, self.format, self.journal_size,
                               self.mkfsoptions)
        lctl.newdev(self.osdtype, self.name, self.uuid,
                    setup ="%s %s %s" %(blkdev, self.fstype,
                                        self.failover_ost))
        if not is_prepared('OSS'):
            lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")

    def osd_remaining(self):
        out = lctl.device_list()
        for s in out:
            if string.split(s)[2] in ('obdfilter', 'obdecho'):
                return 1
        return 0

    def safe_to_clean(self):
        return self.active

    def safe_to_clean_modules(self):
        return not self.osd_remaining()

    def cleanup(self):
        if not self.active:
            debug(self.uuid, "not active")
            return
        if is_prepared(self.name):
            self.info()
            try:
                lctl.cleanup(self.name, self.uuid, config.force,
                             config.failover)
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
                e.dump()
                cleanup_error(e.rc)
        if not self.osd_remaining() and is_prepared('OSS'):
            try:
                lctl.cleanup("OSS", "OSS_UUID", config.force,
                             config.failover)
            except CommandError, e:
                print "cleanup failed: ", self.name
                e.dump()
                cleanup_error(e.rc)
        if not self.osdtype == 'obdecho':
            clean_loop(self.devpath)

def mgmt_uuid_for_fs(mtpt_name):
    if not mtpt_name:
        return ''
    mtpt_db = toplevel.lookup_name(mtpt_name)
    fs_uuid = mtpt_db.get_first_ref('filesystem')
    fs = toplevel.lookup(fs_uuid)
    if not fs:
        return ''
    return fs.get_first_ref('mgmt')

# Generic client module, used by OSC and MDC
class Client(Module):
    def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
                 module_dir=None):
        self.target_name = tgtdb.getName()
        self.target_uuid = tgtdb.getUUID()
        self.db = tgtdb

        self.tgt_dev_uuid = get_active_target(tgtdb)
        if not self.tgt_dev_uuid:
            panic("No target device found for target:", self.target_name)

        self.kmod = kmod(config.lustre, config.portals)

        self.module = module
        self.module_name = string.upper(module)
        if not self_name:
            self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
                                         self.target_name, fs_name)
        else:
            self.name = self_name
        self.uuid = uuid
        self.lookup_server(self.tgt_dev_uuid)
        mgmt_uuid = mgmt_uuid_for_fs(fs_name)
        if mgmt_uuid:
            self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
        else:
            self.mgmt_name = ''
        self.fs_name = fs_name
        if not module_dir:
            module_dir = module
        self.add_lustre_module(module_dir, module)

    def lookup_server(self, srv_uuid):
        """ Lookup a server's network information """
        self._server_nets = get_ost_net(self.db, srv_uuid)
        if len(self._server_nets) == 0:
            panic ("Unable to find a server for:", srv_uuid)

    def get_servers(self):
        return self._server_nets

    def prepare(self, ignore_connect_failure = 0):
        self.info(self.target_uuid)
        if is_prepared(self.name):
            return
        try:
            srv = choose_local_server(self.get_servers())
            if srv:
                lctl.connect(srv)
            else:
                routes = find_route(self.get_servers())
                if len(routes) == 0:
                    panic ("no route to", self.target_uuid)
                for (srv, r) in routes:
                    lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
        except CommandError, e:
            if not ignore_connect_failure:
                raise e
        if srv:
            if self.target_uuid in config.inactive and self.permits_inactive():
                debug("%s inactive" % self.target_uuid)
                inactive_p = "inactive"
            else:
                debug("%s active" % self.target_uuid)
                inactive_p = ""
            lctl.newdev(self.module, self.name, self.uuid,
                        setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
                                                inactive_p, self.mgmt_name))

    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
            try:
                srv = choose_local_server(self.get_servers())
                if srv:
                    lctl.disconnect(srv)
                else:
                    for (srv, r) in find_route(self.get_servers()):
                        lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
                e.dump()
                cleanup_error(e.rc)

class MDC(Client):
    def __init__(self, db, uuid, fs_name):
        Client.__init__(self, db, uuid, 'mdc', fs_name)

    def permits_inactive(self):
        return 0

class OSC(Client):
    def __init__(self, db, uuid, fs_name):
        Client.__init__(self, db, uuid, 'osc', fs_name)

    def permits_inactive(self):
        return 1

def mgmtcli_name_for_uuid(uuid):
    return 'MGMTCLI_%s' % uuid

class ManagementClient(Client):
    def __init__(self, db, uuid):
        Client.__init__(self, db, uuid, 'mgmt_cli', '',
                        self_name = mgmtcli_name_for_uuid(db.getUUID()),
                        module_dir = 'mgmt')

class COBD(Module):
    def __init__(self, db):
        Module.__init__(self, 'COBD', db)
        self.real_uuid = self.db.get_first_ref('realobd')
        self.cache_uuid = self.db.get_first_ref('cacheobd')
        self.add_lustre_module('cobd', 'cobd')

    # need to check /proc/mounts and /etc/mtab before
    # formatting anything.
    # FIXME: check if device is already formatted.
    def prepare(self):
        if is_prepared(self.name):
            return
        self.info(self.real_uuid, self.cache_uuid)
        lctl.newdev("cobd", self.name, self.uuid,
                    setup ="%s %s" %(self.real_uuid, self.cache_uuid))

# virtual interface for OSC and LOV
class VOSC(Module):
    def __init__(self, db, uuid, fs_name, name_override = None):
        Module.__init__(self, 'VOSC', db)
        if db.get_class() == 'lov':
            self.osc = LOV(db, uuid, fs_name, name_override)
        else:
            self.osc = get_osc(db, uuid, fs_name)

    def get_uuid(self):
        return self.osc.uuid

    def get_name(self):
        return self.osc.name

    def prepare(self):
        self.osc.prepare()

    def cleanup(self):
        self.osc.cleanup()

    def load_module(self):
        self.osc.load_module()

    def cleanup_module(self):
        self.osc.cleanup_module()

class ECHO_CLIENT(Module):
    def __init__(self,db):
        Module.__init__(self, 'ECHO_CLIENT', db)
        self.add_lustre_module('obdecho', 'obdecho')
        self.obd_uuid = self.db.get_first_ref('obd')
        obd = self.db.lookup(self.obd_uuid)
        self.uuid = generate_client_uuid(self.name)
        self.osc = VOSC(obd, self.uuid, self.name)

    def prepare(self):
        if is_prepared(self.name):
            return
        run_acceptors()
        self.osc.prepare() # XXX This is so cheating. -p
        self.info(self.obd_uuid)

        lctl.newdev("echo_client", self.name, self.uuid,
                    setup = self.osc.get_name())

    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
        self.osc.cleanup()

    def load_module(self):
        self.osc.load_module()
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        self.osc.cleanup_module()

def generate_client_uuid(name):
    client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
                                           name,
                                           int(random.random() * 1048576),
                                           int(random.random() * 1048576))
    return client_uuid[:36]
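
# Illustrative sketch (values are random): generate_client_uuid('lov_fs1')
# returns something shaped like '0a3f2_lov_fs1_8c1d04e7b2', clipped to the
# 36-character UUID limit.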

class Mountpoint(Module):
    def __init__(self,db):
        Module.__init__(self, 'MTPT', db)
        self.path = self.db.get_val('path')
        self.fs_uuid = self.db.get_first_ref('filesystem')
        fs = self.db.lookup(self.fs_uuid)
        self.mds_uuid = fs.get_first_ref('mds')
        self.obd_uuid = fs.get_first_ref('obd')
        self.mgmt_uuid = fs.get_first_ref('mgmt')
        obd = self.db.lookup(self.obd_uuid)
        client_uuid = generate_client_uuid(self.name)
        self.vosc = VOSC(obd, client_uuid, self.name)
        self.mdc = get_mdc(db, client_uuid, self.name, self.mds_uuid)

        self.add_lustre_module('mdc', 'mdc')
        self.add_lustre_module('llite', 'llite')
        if self.mgmt_uuid:
            self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
                                            client_uuid)
        else:
            self.mgmtcli = None

    def prepare(self):
        if fs_is_mounted(self.path):
            log(self.path, "already mounted.")
            return
        run_acceptors()
        if self.mgmtcli:
            self.mgmtcli.prepare()
        self.vosc.prepare()
        self.mdc.prepare()
        mdc_name = self.mdc.name

        self.info(self.path, self.mds_uuid, self.obd_uuid)
        if config.record or config.lctl_dump:
            lctl.mount_option(local_node_name, self.vosc.get_name(), mdc_name)
            return
        cmd = "mount -t lustre_lite -o osc=%s,mdc=%s %s %s" % \
              (self.vosc.get_name(), mdc_name, config.config, self.path)
        run("mkdir", self.path)
        ret, val = run(cmd)
        if ret:
            self.mdc.cleanup()
            self.vosc.cleanup()
            panic("mount failed:", self.path, ":", string.join(val))

    def cleanup(self):
        self.info(self.path, self.mds_uuid, self.obd_uuid)

        if config.record or config.lctl_dump:
            lctl.del_mount_option(local_node_name)
        else:
            if fs_is_mounted(self.path):
                if config.force:
                    (rc, out) = run("umount", "-f", self.path)
                else:
                    (rc, out) = run("umount", self.path)
                if rc:
                    raise CommandError('umount', out, rc)

            if fs_is_mounted(self.path):
                panic("fs is still mounted:", self.path)

        self.mdc.cleanup()
        self.vosc.cleanup()
        if self.mgmtcli:
            self.mgmtcli.cleanup()

    def load_module(self):
        if self.mgmtcli:
            self.mgmtcli.load_module()
        self.vosc.load_module()
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        self.vosc.cleanup_module()
        if self.mgmtcli:
            self.mgmtcli.cleanup_module()

# ============================================================
# misc query functions

def get_ost_net(self, osd_uuid):
    srv_list = []
    if not osd_uuid:
        return srv_list
    osd = self.lookup(osd_uuid)
    node_uuid = osd.get_first_ref('node')
    node = self.lookup(node_uuid)
    if not node:
        panic("unable to find node for osd_uuid:", osd_uuid,
              " node_ref:", node_uuid)
    for net_uuid in node.get_networks():
        db = node.lookup(net_uuid)
        srv_list.append(Network(db))
    return srv_list

# the order of initialization is based on level.
def getServiceLevel(self):
    type = self.get_class()
    ret = 0
    if type in ('network',):
        ret = 5
    elif type in ('routetbl',):
        ret = 6
    elif type in ('ldlm',):
        ret = 20
    elif type in ('mgmt',):
        ret = 25
    elif type in ('osd', 'cobd'):
        ret = 30
    elif type in ('mdsdev',):
        ret = 40
    elif type in ('mountpoint', 'echoclient'):
        ret = 70
    else:
        panic("Unknown type: ", type)

    if ret < config.minlevel or ret > config.maxlevel:
        ret = 0
    return ret

#
# return list of services in a profile. list is a list of tuples
# [(level, db_object),]
def getServices(self):
    list = []
    for ref_class, ref_uuid in self.get_all_refs():
        servdb = self.lookup(ref_uuid)
        if servdb:
            level = getServiceLevel(servdb)
            if level > 0:
                list.append((level, servdb))
        else:
            panic('service not found: ' + ref_uuid)

    list.sort()
    return list

############################################################
# FIXME: clean this mess up!
#
# OSC is no longer in the xml, so we have to fake it.
# this is getting ugly and begging for another refactoring
def get_osc(ost_db, uuid, fs_name):
    osc = OSC(ost_db, uuid, fs_name)
    return osc

def get_mdc(db, uuid, fs_name, mds_uuid):
    mds_db = db.lookup(mds_uuid)
    if not mds_db:
        panic("no mds:", mds_uuid)
    mdc = MDC(mds_db, uuid, fs_name)
    return mdc

############################################################
# routing ("rooting")

# list of (nettype, cluster_id, nid)
local_clusters = []

def find_local_clusters(node_db):
    global local_clusters
    for netuuid in node_db.get_networks():
        net = node_db.lookup(netuuid)
        srv = Network(net)
        debug("add_local", netuuid)
        local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
        if srv.port > 0:
            if acceptors.has_key(srv.port):
                panic("duplicate port:", srv.port)
            acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
                                                  srv.send_mem, srv.recv_mem,
                                                  srv.irq_affinity)

# This node is a gateway.
is_router = 0
def node_is_router():
    return is_router

# If there are any routers found in the config, then this will be true
# and all nodes will load kptlrouter.
needs_router = 0
def node_needs_router():
    return needs_router or is_router

# list of (nettype, gw, tgt_cluster_id, lo, hi)
# Currently, these local routes are only added to kptlrouter route
# table if they are needed to connect to a specific server.  This
# should be changed so all available routes are loaded, and the
# ptlrouter can make all the decisions.
local_routes = []

def find_local_routes(lustre):
    """ Scan the lustre config looking for routers.  Build a list of
    routes. """
    global local_routes, needs_router
    local_routes = []
    list = lustre.lookup_class('node')
    for router in list:
        if router.get_val_int('router', 0):
            needs_router = 1
            for (local_type, local_cluster_id, local_nid) in local_clusters:
                gw = None
                for netuuid in router.get_networks():
                    db = router.lookup(netuuid)
                    if (local_type == db.get_val('nettype') and
                        local_cluster_id == db.get_val('clusterid')):
                        gw = db.get_val('nid')
                        break
                if gw:
                    debug("find_local_routes: gw is", gw)
                    for route in router.get_local_routes(local_type, gw):
                        local_routes.append(route)
    debug("find_local_routes:", local_routes)

def choose_local_server(srv_list):
    for srv in srv_list:
        if local_cluster(srv.net_type, srv.cluster_id):
            return srv

def local_cluster(net_type, cluster_id):
    for cluster in local_clusters:
        if net_type == cluster[0] and cluster_id == cluster[1]:
            return 1
    return 0

def local_interface(net_type, cluster_id, nid):
    for cluster in local_clusters:
        if (net_type == cluster[0] and cluster_id == cluster[1]
            and nid == cluster[2]):
            return 1
    return 0

def find_route(srv_list):
    result = []
    frm_type = local_clusters[0][0]
    for srv in srv_list:
        debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
        to_type = srv.net_type
        to = srv.nid
        cluster_id = srv.cluster_id
        debug ('looking for route to', to_type, to)
        for r in local_routes:
            debug("find_route: ", r)
            if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
                result.append((srv, r))
    return result

def get_active_target(db):
    target_uuid = db.getUUID()
    target_name = db.getName()
    node_name = get_select(target_name)
    if node_name:
        tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
    else:
        tgt_dev_uuid = db.get_first_ref('active')
    return tgt_dev_uuid

def get_server_by_nid_uuid(db, nid_uuid):
    for n in db.lookup_class("network"):
        net = Network(n)
        if net.nid_uuid == nid_uuid:
            return net

############################################################
# lconf level logic
# Start a service.
def newService(db):
    type = db.get_class()
    debug('Service:', type, db.getName(), db.getUUID())
    n = None
    if type == 'ldlm':
        n = LDLM(db)
    elif type == 'lov':
        n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
    elif type == 'network':
        n = Network(db)
    elif type == 'routetbl':
        n = RouteTable(db)
    elif type == 'osd':
        n = OSD(db)
    elif type == 'cobd':
        n = COBD(db)
    elif type == 'mdsdev':
        n = MDSDEV(db)
    elif type == 'mountpoint':
        n = Mountpoint(db)
    elif type == 'echoclient':
        n = ECHO_CLIENT(db)
    elif type == 'mgmt':
        n = Management(db)
    else:
        panic ("unknown service type:", type)
    return n

#
# Prepare the system to run lustre using a particular profile
# in the configuration.
#  * load the modules
#  * setup networking for the current node
#  * make sure partitions are in place and prepared
#  * initialize devices with lctl
# Levels are important, and need to be enforced.
def for_each_profile(db, prof_list, operation):
    for prof_uuid in prof_list:
        prof_db = db.lookup(prof_uuid)
        if not prof_db:
            panic("profile:", prof_uuid, "not found.")
        services = getServices(prof_db)
        operation(services)

def doWriteconf(services):
    for s in services:
        if s[1].get_class() == 'mdsdev':
            n = newService(s[1])
            n.write_conf()

def doSetup(services):
    if config.nosetup:
        return
    for s in services:
        n = newService(s[1])
        n.prepare()

def doModules(services):
    if config.nomod:
        return
    for s in services:
        n = newService(s[1])
        n.load_module()

def doCleanup(services):
    if config.nosetup:
        return
    services.reverse()
    for s in services:
        n = newService(s[1])
        if n.safe_to_clean():
            n.cleanup()

def doUnloadModules(services):
    if config.nomod:
        return
    services.reverse()
    for s in services:
        n = newService(s[1])
        if n.safe_to_clean_modules():
            n.cleanup_module()

def doHost(lustreDB, hosts):
    global is_router, local_node_name
    node_db = None
    for h in hosts:
        node_db = lustreDB.lookup_name(h, 'node')
        if node_db:
            break
    if not node_db:
        print 'No host entry found.'
        return

    local_node_name = node_db.get_val('name', 0)
    is_router = node_db.get_val_int('router', 0)
    lustre_upcall = node_db.get_val('lustreUpcall', '')
    portals_upcall = node_db.get_val('portalsUpcall', '')
    timeout = node_db.get_val_int('timeout', 0)

    find_local_clusters(node_db)
    if not is_router:
        find_local_routes(lustreDB)

    # Two step process: (1) load modules, (2) setup lustre
    # if not cleaning, load modules first.
    prof_list = node_db.get_refs('profile')

    if config.write_conf:
        for_each_profile(node_db, prof_list, doModules)
        sys_make_devices()
        for_each_profile(node_db, prof_list, doWriteconf)
        for_each_profile(node_db, prof_list, doUnloadModules)

    elif config.recover:
        if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
            raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
                                     "--client_uuid <UUID> --conn_uuid <UUID>")
        doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
                   config.conn_uuid)
    elif config.cleanup:
        if config.force:
            # the command line can override this value
            timeout = 5
        # ugly hack, only need to run lctl commands for --dump
        if config.lctl_dump or config.record:
            for_each_profile(node_db, prof_list, doCleanup)
            return

        sys_set_timeout(timeout)
        sys_set_ptldebug()
        sys_set_subsystem()
        sys_set_lustre_upcall(lustre_upcall)
        sys_set_portals_upcall(portals_upcall)

        for_each_profile(node_db, prof_list, doCleanup)
        for_each_profile(node_db, prof_list, doUnloadModules)

    else:
        # ugly hack, only need to run lctl commands for --dump
        if config.lctl_dump or config.record:
            sys_set_timeout(timeout)
            sys_set_lustre_upcall(lustre_upcall)
            for_each_profile(node_db, prof_list, doSetup)
            return

        sys_make_devices()
        sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
        sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)

        for_each_profile(node_db, prof_list, doModules)

        sys_set_debug_path()
        sys_set_ptldebug()
        sys_set_subsystem()
        script = config.gdb_script
        run(lctl.lctl, ' modules >', script)
        if config.gdb:
            log ("The GDB module script is in", script)
            # pause, so user has time to break and
            # load the script
            time.sleep(5)
        sys_set_timeout(timeout)
        sys_set_lustre_upcall(lustre_upcall)
        sys_set_portals_upcall(portals_upcall)

        for_each_profile(node_db, prof_list, doSetup)

def doRecovery(db, lctl, tgt_uuid, client_uuid, nid_uuid):
    tgt = db.lookup(tgt_uuid)
    if not tgt:
        raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
    new_uuid = get_active_target(tgt)
    if not new_uuid:
        raise Lustre.LconfError("doRecovery: no active target found for: " +
                                tgt_uuid)
    net = choose_local_server(get_ost_net(db, new_uuid))
    if not net:
        raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)

    log("Reconnecting", tgt_uuid, " to ", net.nid_uuid)
    try:
        oldnet = get_server_by_nid_uuid(db, nid_uuid)
        if oldnet:
            lctl.disconnect(oldnet)
    except CommandError, e:
        log("recover: disconnect", nid_uuid, "failed: ")
        e.dump()

    try:
        lctl.connect(net)
    except CommandError, e:
        log("recover: connect failed")
        e.dump()

    lctl.recover(client_uuid, net.nid_uuid)

def setupModulePath(cmd, portals_dir = PORTALS_DIR):
    base = os.path.dirname(cmd)
    if development_mode():
        if not config.lustre:
            config.lustre = (os.path.join(base, ".."))
        # normalize the portals dir, using command line arg if set
        if config.portals:
            portals_dir = config.portals
        dir = os.path.join(config.lustre, portals_dir)
        config.portals = dir
        debug('config.portals', config.portals)
    elif config.lustre and config.portals:
        # if --lustre and --portals, normalize portals
        # can ignore PORTALS_DIR here, since it is probably useless here
        config.portals = os.path.join(config.lustre, config.portals)
        debug('config.portals B', config.portals)

def sysctl(path, val):
    debug("+ sysctl", path, val)
    if config.noexec:
        return
    try:
        fp = open(os.path.join('/proc/sys', path), 'w')
        fp.write(str(val))
        fp.close()
    except IOError, e:
        panic(str(e))

def sys_set_debug_path():
    sysctl('portals/debug_path', config.debug_path)
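
# Illustrative sketch (value is hypothetical): sys_set_ptldebug() below
# ends up doing the equivalent of
#   echo 0x10400 > /proc/sys/portals/debug
# via sysctl('portals/debug', '0x10400').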

def sys_set_lustre_upcall(upcall):
    # the command line overrides the value in the node config
    if config.lustre_upcall:
        upcall = config.lustre_upcall
    elif config.upcall:
        upcall = config.upcall
    if upcall:
        lctl.set_lustre_upcall(upcall)

def sys_set_portals_upcall(upcall):
    # the command line overrides the value in the node config
    if config.portals_upcall:
        upcall = config.portals_upcall
    elif config.upcall:
        upcall = config.upcall
    if upcall:
        sysctl('portals/upcall', upcall)

def sys_set_timeout(timeout):
    # the command line overrides the value in the node config
    if config.timeout and config.timeout > 0:
        timeout = config.timeout
    if timeout != None and timeout > 0:
        lctl.set_timeout(timeout)

def sys_tweak_socknal():
    if config.single_socket:
        sysctl("socknal/typed", 0)

def sys_optimize_elan():
    run("echo 0 > /proc/elan/config/eventint_punt_loops")

def sys_set_ptldebug():
    if config.ptldebug != None:
        try:
            val = eval(config.ptldebug, ptldebug_names)
            val = "0x%x" % (val,)
            sysctl('portals/debug', val)
        except NameError, e:
            panic(str(e))

def sys_set_subsystem():
    if config.subsystem != None:
        try:
            val = eval(config.subsystem, subsystem_names)
            val = "0x%x" % (val,)
            sysctl('portals/subsystem_debug', val)
        except NameError, e:
            panic(str(e))

def sys_set_netmem_max(path, max):
    debug("setting", path, "to at least", max)
    if config.noexec:
        return
    fp = open(path)
    cur = int(fp.readline())
    fp.close()
    if max > cur:
        fp = open(path, 'w')
        fp.write('%d\n' %(max))
        fp.close()

def sys_make_devices():
    if not os.access('/dev/portals', os.R_OK):
        run('mknod /dev/portals c 10 240')
    if not os.access('/dev/obd', os.R_OK):
        run('mknod /dev/obd c 10 241')

# Add dir to the global PATH, if not already there.
def add_to_path(new_dir):
    syspath = string.split(os.environ['PATH'], ':')
    if new_dir in syspath:
        return
    os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir

def default_debug_path():
    path = '/tmp/lustre-log'
    if os.path.isdir('/r'):
        return '/r' + path
    else:
        return path

def default_gdb_script():
    script = '/tmp/ogdb'
    if os.path.isdir('/r'):
        return '/r' + script
    else:
        return script

DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
# ensure basic elements are in the system path
def sanitise_path():
    for dir in DEFAULT_PATH:
        add_to_path(dir)

# global hack for the --select handling
tgt_select = {}
def init_select(args):
    # args = [service=nodeA,service2=nodeB service3=nodeC]
    global tgt_select
    for arg in args:
        list = string.split(arg, ',')
        for entry in list:
            srv, node = string.split(entry, '=')
            tgt_select[srv] = node
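
# Illustrative sketch (node names are hypothetical):
#   lconf --select mds1=nodeA,ost1=nodeB ...
# leaves tgt_select == {'mds1': 'nodeA', 'ost1': 'nodeB'}, so
# get_select('mds1') returns 'nodeA'.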

def get_select(srv):
    if tgt_select.has_key(srv):
        return tgt_select[srv]
    return None

FLAG = Lustre.Options.FLAG
PARAM = Lustre.Options.PARAM
INTPARAM = Lustre.Options.INTPARAM
PARAMLIST = Lustre.Options.PARAMLIST
lconf_options = [
    ('verbose,v', "Print system commands as they are run"),
    ('ldapurl', "LDAP server URL, eg. ldap://localhost", PARAM),
    ('config', "Cluster config name used for LDAP query", PARAM),
    ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
    ('node', "Load config for <nodename>", PARAM),
    ('cleanup,d', "Cleans up config. (Shutdown)"),
    ('force,f', "Forced unmounting and/or obd detach during cleanup",
     FLAG, 0),
    ('single_socket', "socknal option: only use one socket instead of bundle",
     FLAG, 0),
    ('failover', """Used to shut down without saving state.
                    This will allow this node to "give up" a service to
                    another node for failover purposes. This will not
                    be a clean shutdown.""",
     FLAG, 0),
    ('gdb', """Prints message after creating gdb module script
               and sleeps for 5 seconds."""),
    ('noexec,n', """Prints the commands and steps that will be run for a
                    config without executing them. This can be used to check if a
                    config file is doing what it should be doing"""),
    ('nomod', "Skip load/unload module step."),
    ('nosetup', "Skip device setup/cleanup step."),
    ('reformat', "Reformat all devices (without question)"),
    ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
    ('dump', "Dump the kernel debug log to file before portals is unloaded",
     PARAM),
    ('write_conf', "Save all the client config information on mds."),
    ('record', "Write config information on mds."),
    ('record_log', "Name of config record log.", PARAM),
    ('record_device', "MDS device name that will record the config commands",
     PARAM),
    ('minlevel', "Minimum level of services to configure/cleanup",
     INTPARAM, 0),
    ('maxlevel', """Maximum level of services to configure/cleanup
                    Levels are approximately like:
                            70 - mountpoint, echo_client, osc, mdc, lov""",
     INTPARAM, 100),
    ('lustre', """Base directory of lustre sources. This parameter will
                  cause lconf to load modules from a source tree.""", PARAM),
    ('portals', """Portals source directory.  If this is a relative path,
                   then it is assumed to be relative to lustre. """, PARAM),
    ('timeout', "Set recovery timeout", INTPARAM),
    ('upcall', "Set both portals and lustre upcall script", PARAM),
    ('lustre_upcall', "Set lustre upcall script", PARAM),
    ('portals_upcall', "Set portals upcall script", PARAM),
    ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
    ('ptldebug', "Set the portals debug level", PARAM),
    ('subsystem', "Set the portals debug subsystem", PARAM),
    ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
    ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
    # Client recovery options
    ('recover', "Recover a device"),
    ('group', "The group of devices to configure or cleanup", PARAM),
    ('tgt_uuid', "The failed target (required for recovery)", PARAM),
    ('client_uuid', "The failed client (required for recovery)", PARAM),
    ('conn_uuid', "The failed connection (required for recovery)", PARAM),
    ('inactive', """The name of an inactive service, to be ignored during
                    mounting (currently OST-only). Can be repeated.""",
     PARAMLIST),
    ]

def main():
    global lctl, config, toplevel, CONFIG_FILE

    # in the upcall this is set to SIG_IGN
    signal.signal(signal.SIGCHLD, signal.SIG_DFL)

    cl = Lustre.Options("lconf", "config.xml", lconf_options)
    try:
        config, args = cl.parse(sys.argv[1:])
    except Lustre.OptionError, e:
        print e
        sys.exit(1)

    setupModulePath(sys.argv[0])

    host = socket.gethostname()

    # the PRNG is normally seeded with time(), which is not so good for starting
    # time-synchronized clusters
    input = open('/dev/urandom', 'r')
    if not input:
        print 'Unable to open /dev/urandom!'
        sys.exit(1)
    seed = input.read(32)
    input.close()
    random.seed(seed)

    sanitise_path()

    init_select(config.select)

    if len(args) > 0:
        if not os.access(args[0], os.R_OK):
            print 'File not found or readable:', args[0]
            sys.exit(1)
        try:
            dom = xml.dom.minidom.parse(args[0])
        except Exception:
            panic("%s does not appear to be a config file." % (args[0]))
            sys.exit(1) # make sure to die here, even in debug mode.
        CONFIG_FILE = args[0]
        db = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
        if not config.config:
            config.config = os.path.basename(args[0]) # use full path?
            if config.config[-4:] == '.xml':
                config.config = config.config[:-4]
    elif config.ldapurl:
        if not config.config:
            panic("--ldapurl requires --config name")
        dn = "config=%s,fs=lustre" % (config.config)
        db = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
    else:
        print 'Missing config file or ldap URL.'
        print 'see lconf --help for command summary'
        sys.exit(1)

    toplevel = db

    ver = db.get_version()
    if not ver:
        panic("No version found in config data, please recreate.")
    if ver != Lustre.CONFIG_VERSION:
        panic("Config version", ver, "does not match lconf version",
              Lustre.CONFIG_VERSION)

    node_list = []
    if config.node:
        node_list.append(config.node)
    else:
        if len(host) > 0:
            node_list.append(host)
        node_list.append('localhost')

    debug("configuring for host: ", node_list)

    if len(host) > 0:
        config.debug_path = config.debug_path + '-' + host
        config.gdb_script = config.gdb_script + '-' + host

    lctl = LCTLInterface('lctl')

    if config.lctl_dump:
        lctl.use_save_file(config.lctl_dump)

    if config.record:
        if not (config.record_device and config.record_log):
            panic("When recording, both --record_log and --record_device must be specified.")
        lctl.record(config.record_device, config.record_log)

    doHost(db, node_list)

    if config.record:
        lctl.end_record()

if __name__ == "__main__":
    try:
        main()
    except Lustre.LconfError, e:
        print e
#        traceback.print_exc(file=sys.stdout)
        sys.exit(1)
    except CommandError, e:
        e.dump()
        sys.exit(e.rc)

    if first_cleanup_error:
        sys.exit(first_cleanup_error)