3 # Copyright (C) 2002-2003 Cluster File Systems, Inc.
4 # Authors: Robert Read <rread@clusterfs.com>
5 # Mike Shaver <shaver@clusterfs.com>
6 # This file is part of Lustre, http://www.lustre.org.
8 # Lustre is free software; you can redistribute it and/or
9 # modify it under the terms of version 2 of the GNU General Public
10 # License as published by the Free Software Foundation.
12 # Lustre is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Lustre; if not, write to the Free Software
19 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 # lconf - lustre configuration tool
23 # lconf is the main driver script for starting and stopping
24 # lustre filesystem services.
26 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import sys, getopt, types
29 import string, os, stat, popen2, socket, time, random, fcntl, select
30 import re, exceptions, signal, traceback
31 import xml.dom.minidom
33 if sys.version[0] == '1':
34 from FCNTL import F_GETFL, F_SETFL
36 from fcntl import F_GETFL, F_SETFL
38 PYMOD_DIR = "/usr/lib/lustre/python"
40 def development_mode():
41 base = os.path.dirname(sys.argv[0])
42 if os.access(base+"/Makefile.am", os.R_OK):
46 if not development_mode():
47 sys.path.append(PYMOD_DIR)
53 DEFAULT_TCPBUF = 8388608
56 # Maximum number of devices to search for.
57 # (the /dev/loop* nodes need to be created beforehand)
58 MAX_LOOP_DEVICES = 256
59 PORTALS_DIR = 'portals'
61 # Needed to call lconf --record
64 # Please keep these in sync with the values in portals/kp30.h
76 "warning" : (1 << 10),
80 "portals" : (1 << 14),
82 "dlmtrace" : (1 << 16),
86 "rpctrace" : (1 << 20),
87 "vfstrace" : (1 << 21),
91 "undefined" : (0 << 24),
100 "ext2obd" : (9 << 24),
101 "portals" : (10 << 24),
102 "socknal" : (11 << 24),
103 "qswnal" : (12 << 24),
104 "pinger" : (13 << 24),
105 "filter" : (14 << 24),
106 "trace" : (15 << 24),
110 "gmnal" : (19 << 24),
111 "ptlrouter" : (20 << 24),
113 "ptlbd" : (22 << 24),
# Exit status bookkeeping: remember only the FIRST non-zero cleanup
# failure so the script's final return code reflects the earliest error.
first_cleanup_error = 0

def cleanup_error(rc):
    """Record *rc* as the run's first cleanup error, if none is set yet."""
    global first_cleanup_error
    if not first_cleanup_error:
        first_cleanup_error = rc
125 # ============================================================
126 # debugging and error funcs
def fixme(msg = "this feature"):
    """Abort with a LconfError reporting that *msg* is not implemented yet.

    Fixes the misspelled user-facing message ("implmemented") and uses the
    call form of raise, which is equivalent in Python 2 and also valid in
    Python 3.
    """
    raise Lustre.LconfError(msg + ' not implemented yet.')
132 msg = string.join(map(str,args))
133 if not config.noexec:
134 raise Lustre.LconfError(msg)
139 msg = string.join(map(str,args))
144 print string.strip(s)
148 msg = string.join(map(str,args))
151 # ack, python's builtin int() does not support '0x123' syntax.
152 # eval can do it, although what a hack!
156 return eval(s, {}, {})
159 except SyntaxError, e:
160 raise ValueError("not a number")
162 raise ValueError("not a number")
164 # ============================================================
165 # locally defined exceptions
166 class CommandError (exceptions.Exception):
167 def __init__(self, cmd_name, cmd_err, rc=None):
168 self.cmd_name = cmd_name
169 self.cmd_err = cmd_err
174 if type(self.cmd_err) == types.StringType:
176 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
178 print "! %s: %s" % (self.cmd_name, self.cmd_err)
179 elif type(self.cmd_err) == types.ListType:
181 print "! %s (error %d):" % (self.cmd_name, self.rc)
183 print "! %s:" % (self.cmd_name)
184 for s in self.cmd_err:
185 print "> %s" %(string.strip(s))
190 # ============================================================
191 # handle daemons, like the acceptor
193 """ Manage starting and stopping a daemon. Assumes daemon manages
194 it's own pid file. """
196 def __init__(self, cmd):
202 log(self.command, "already running.")
204 self.path = find_prog(self.command)
206 panic(self.command, "not found.")
207 ret, out = runcmd(self.path +' '+ self.command_line())
209 raise CommandError(self.path, out, ret)
213 pid = self.read_pidfile()
215 log ("killing process", pid)
217 #time.sleep(1) # let daemon die
219 log("unable to kill", self.command, e)
221 log("unable to kill", self.command)
224 pid = self.read_pidfile()
234 def read_pidfile(self):
236 fp = open(self.pidfile(), 'r')
243 def clean_pidfile(self):
244 """ Remove a stale pidfile """
245 log("removing stale pidfile:", self.pidfile())
247 os.unlink(self.pidfile())
249 log(self.pidfile(), e)
251 class AcceptorHandler(DaemonHandler):
252 def __init__(self, port, net_type, send_mem, recv_mem, irq_aff):
253 DaemonHandler.__init__(self, "acceptor")
256 self.send_mem = send_mem
257 self.recv_mem = recv_mem
260 self.flags = self.flags + ' -i'
263 return "/var/run/%s-%d.pid" % (self.command, self.port)
def command_line(self):
    """Build the argument string handed to the acceptor daemon binary."""
    args = ('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port)
    return string.join(map(str, args))
270 # start the acceptors
272 if config.lctl_dump or config.record:
274 for port in acceptors.keys():
275 daemon = acceptors[port]
276 if not daemon.running():
279 def run_one_acceptor(port):
280 if config.lctl_dump or config.record:
282 if acceptors.has_key(port):
283 daemon = acceptors[port]
284 if not daemon.running():
287 panic("run_one_acceptor: No acceptor defined for port:", port)
289 def stop_acceptor(port):
290 if acceptors.has_key(port):
291 daemon = acceptors[port]
296 # ============================================================
297 # handle lctl interface
300 Manage communication with lctl
303 def __init__(self, cmd):
305 Initialize close by finding the lctl binary.
307 self.lctl = find_prog(cmd)
309 self.record_device = ''
312 debug('! lctl not found')
315 raise CommandError('lctl', "unable to find lctl binary.")
def use_save_file(self, file):
    """Divert subsequent lctl command batches into *file* instead of running them."""
    self.save_file = file
def record(self, dev_name, logname):
    """Begin recording subsequent commands into config log *logname* on *dev_name*."""
    log("Recording log", logname, "on", dev_name)
    self.record_log = logname
    self.record_device = dev_name
def end_record(self):
    """Stop recording and clear the device/log names set by record()."""
    log("End recording log", self.record_log, "on", self.record_device)
    self.record_log = None
    self.record_device = None
def set_nonblock(self, fd):
    """Put *fd* into non-blocking mode so reads return immediately."""
    flags = fcntl.fcntl(fd, F_GETFL)
    fcntl.fcntl(fd, F_SETFL, flags | os.O_NDELAY)
337 the cmds are written to stdin of lctl
338 lctl doesn't return errors when run in script mode, so
340 should modify command line to accept multiple commands, or
341 create complex command line options
345 cmds = '\n dump ' + self.save_file + '\n' + cmds
346 elif self.record_device:
350 %s""" % (self.record_device, self.record_log, cmds)
352 debug("+", cmd_line, cmds)
353 if config.noexec: return (0, [])
355 child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
356 child.tochild.write(cmds + "\n")
357 child.tochild.close()
359 # From "Python Cookbook" from O'Reilly
360 outfile = child.fromchild
361 outfd = outfile.fileno()
362 self.set_nonblock(outfd)
363 errfile = child.childerr
364 errfd = errfile.fileno()
365 self.set_nonblock(errfd)
367 outdata = errdata = ''
370 ready = select.select([outfd,errfd],[],[]) # Wait for input
371 if outfd in ready[0]:
372 outchunk = outfile.read()
373 if outchunk == '': outeof = 1
374 outdata = outdata + outchunk
375 if errfd in ready[0]:
376 errchunk = errfile.read()
377 if errchunk == '': erreof = 1
378 errdata = errdata + errchunk
379 if outeof and erreof: break
380 # end of "borrowed" code
383 if os.WIFEXITED(ret):
384 rc = os.WEXITSTATUS(ret)
387 if rc or len(errdata):
388 raise CommandError(self.lctl, errdata, rc)
391 def runcmd(self, *args):
393 run lctl using the command line
395 cmd = string.join(map(str,args))
396 debug("+", self.lctl, cmd)
397 rc, out = run(self.lctl, cmd)
399 raise CommandError(self.lctl, out, rc)
403 def network(self, net, nid):
408 quit """ % (net, nid)
411 # create a new connection
412 def add_uuid(self, net_type, uuid, nid):
413 cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type)
416 def add_autoconn(self, net_type, send_mem, recv_mem, nid, hostaddr,
418 if net_type in ('tcp',) and not config.lctl_dump:
423 add_autoconn %s %s %d %s
427 nid, hostaddr, port, flags )
430 def connect(self, srv):
431 self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
432 if srv.net_type in ('tcp',) and not config.lctl_dump:
436 self.add_autoconn(srv.net_type, srv.send_mem, srv.recv_mem,
437 srv.nid, srv.hostaddr, srv.port, flags)
440 def recover(self, dev_name, new_conn):
443 recover %s""" %(dev_name, new_conn)
446 # add a route to a range
447 def add_route(self, net, gw, lo, hi):
455 except CommandError, e:
459 def del_route(self, net, gw, lo, hi):
464 quit """ % (net, gw, lo, hi)
467 # add a route to a host
468 def add_route_host(self, net, uuid, gw, tgt):
469 self.add_uuid(net, uuid, tgt)
477 except CommandError, e:
481 # add a route to a range
482 def del_route_host(self, net, uuid, gw, tgt):
488 quit """ % (net, gw, tgt)
492 def del_autoconn(self, net_type, nid, hostaddr):
493 if net_type in ('tcp',) and not config.lctl_dump:
502 # disconnect one connection
def disconnect(self, srv):
    """Tear down the connection to server *srv* and drop its nid uuid."""
    self.del_uuid(srv.nid_uuid)
    # autoconnections only exist for tcp nets, and never in dump mode
    tcp_net = srv.net_type in ('tcp',)
    if tcp_net and not config.lctl_dump:
        self.del_autoconn(srv.net_type, srv.nid, srv.hostaddr)
508 def del_uuid(self, uuid):
516 def disconnectAll(self, net):
524 def attach(self, type, name, uuid):
527 quit""" % (type, name, uuid)
530 def setup(self, name, setup = ""):
534 quit""" % (name, setup)
538 # create a new device with lctl
539 def newdev(self, type, name, uuid, setup = ""):
540 self.attach(type, name, uuid);
542 self.setup(name, setup)
543 except CommandError, e:
544 self.cleanup(name, uuid, 0)
549 def cleanup(self, name, uuid, force, failover = 0):
550 if failover: force = 1
556 quit""" % (name, ('', 'force')[force],
557 ('', 'failover')[failover])
561 def lov_setup(self, name, uuid, desc_uuid, mdsuuid, stripe_cnt,
562 stripe_sz, stripe_off,
566 lov_setup %s %d %d %d %s %s
567 quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off,
572 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off,
576 lov_setconfig %s %d %d %d %s %s
577 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
581 def dump(self, dump_file):
584 quit""" % (dump_file)
587 # get list of devices
588 def device_list(self):
589 devices = '/proc/fs/lustre/devices'
591 if os.access(devices, os.R_OK):
593 fp = open(devices, 'r')
601 def lustre_version(self):
602 rc, out = self.runcmd('version')
606 def mount_option(self, profile, osc, mdc):
608 mount_option %s %s %s
609 quit""" % (profile, osc, mdc)
612 # delete mount options
613 def del_mount_option(self, profile):
619 def set_timeout(self, timeout):
625 # delete mount options
626 def set_lustre_upcall(self, upcall):
631 # ============================================================
632 # Various system-level functions
633 # (ideally moved to their own module)
635 # Run a command and return the output and status.
636 # stderr is sent to /dev/null, could use popen3 to
637 # save it if necessary
640 if config.noexec: return (0, [])
641 f = os.popen(cmd + ' 2>&1')
651 cmd = string.join(map(str,args))
654 # Run a command in the background.
655 def run_daemon(*args):
656 cmd = string.join(map(str,args))
658 if config.noexec: return 0
659 f = os.popen(cmd + ' 2>&1')
667 # Determine full path to use for an external command
668 # searches dirname(argv[0]) first, then PATH
670 syspath = string.split(os.environ['PATH'], ':')
671 cmdpath = os.path.dirname(sys.argv[0])
672 syspath.insert(0, cmdpath);
674 syspath.insert(0, os.path.join(config.portals, 'utils/'))
676 prog = os.path.join(d,cmd)
677 if os.access(prog, os.X_OK):
681 # Recursively look for file starting at base dir
682 def do_find_file(base, mod):
683 fullname = os.path.join(base, mod)
684 if os.access(fullname, os.R_OK):
686 for d in os.listdir(base):
687 dir = os.path.join(base,d)
688 if os.path.isdir(dir):
689 module = do_find_file(dir, mod)
693 def find_module(src_dir, dev_dir, modname):
694 mod = '%s.o' % (modname)
695 module = src_dir +'/'+ dev_dir +'/'+ mod
697 if os.access(module, os.R_OK):
703 # is the path a block device?
710 return stat.S_ISBLK(s[stat.ST_MODE])
712 # build fs according to type
714 def mkfs(dev, devsize, fstype, jsize, mkfsoptions, isblock=1):
719 panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
721 # devsize is in 1k, and fs block count is in 4k
722 block_cnt = devsize/4
724 if fstype in ('ext3', 'extN'):
725 # ext3 journal size is in megabytes
726 if jsize: jopt = "-J size=%d" %(jsize,)
727 mkfs = 'mkfs.ext2 -j -b 4096 '
728 if not isblock or config.force:
730 elif fstype == 'reiserfs':
731 # reiserfs journal size is in blocks
732 if jsize: jopt = "--journal_size %d" %(jsize,)
733 mkfs = 'mkreiserfs -ff'
735 panic('unsupported fs type: ', fstype)
737 if config.mkfsoptions != None:
738 mkfs = mkfs + ' ' + config.mkfsoptions
739 if mkfsoptions != None:
740 mkfs = mkfs + ' ' + mkfsoptions
741 (ret, out) = run (mkfs, jopt, dev, block_cnt)
743 panic("Unable to build fs:", dev, string.join(out))
744 # enable hash tree indexing on fsswe
745 if fstype in ('ext3', 'extN'):
746 htree = 'echo "feature FEATURE_C5" | debugfs -w'
747 (ret, out) = run (htree, dev)
749 panic("Unable to enable htree:", dev)
751 # some systems use /dev/loopN, some /dev/loop/N
755 if not os.access(loop + str(0), os.R_OK):
757 if not os.access(loop + str(0), os.R_OK):
758 panic ("can't access loop devices")
761 # find loop device assigned to thefile
764 for n in xrange(0, MAX_LOOP_DEVICES):
766 if os.access(dev, os.R_OK):
767 (stat, out) = run('losetup', dev)
768 if out and stat == 0:
769 m = re.search(r'\((.*)\)', out[0])
770 if m and file == m.group(1):
776 # create file if necessary and assign the first free loop device
777 def init_loop(file, size, fstype, journal_size, mkfsoptions, reformat):
778 dev = find_loop(file)
780 print 'WARNING file:', file, 'already mapped to', dev
782 if reformat or not os.access(file, os.R_OK | os.W_OK):
784 panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (file,size))
785 (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size,
788 panic("Unable to create backing store:", file)
789 mkfs(file, size, fstype, journal_size, mkfsoptions, isblock=0)
792 # find next free loop
793 for n in xrange(0, MAX_LOOP_DEVICES):
795 if os.access(dev, os.R_OK):
796 (stat, out) = run('losetup', dev)
798 run('losetup', dev, file)
801 print "out of loop devices"
803 print "out of loop devices"
806 # undo loop assignment
807 def clean_loop(file):
808 dev = find_loop(file)
810 ret, out = run('losetup -d', dev)
812 log('unable to clean loop device:', dev, 'for file:', file)
815 # determine if dev is formatted as a <fstype> filesystem
816 def need_format(fstype, dev):
817 # FIXME don't know how to implement this
820 # initialize a block device if needed
821 def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
823 if config.noexec: return dev
824 if not is_block(dev):
825 dev = init_loop(dev, size, fstype, journal_size, mkfsoptions, reformat)
826 elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
827 mkfs(dev, size, fstype, journal_size, mkfsoptions, isblock=0)
830 # panic("device:", dev,
831 # "not prepared, and autoformat is not set.\n",
832 # "Rerun with --reformat option to format ALL filesystems")
837 """lookup IP address for an interface"""
838 rc, out = run("/sbin/ifconfig", iface)
841 addr = string.split(out[1])[1]
842 ip = string.split(addr, ':')[1]
845 def sys_get_local_nid(net_type, wildcard, cluster_id):
846 """Return the local nid."""
848 if os.access('/proc/elan/device0/position', os.R_OK):
849 local = sys_get_local_address('elan', '*', cluster_id)
851 local = sys_get_local_address(net_type, wildcard, cluster_id)
854 def sys_get_local_address(net_type, wildcard, cluster_id):
855 """Return the local address for the network type."""
857 if net_type in ('tcp',):
859 iface, star = string.split(wildcard, ':')
860 local = if2addr(iface)
862 panic ("unable to determine ip for:", wildcard)
864 host = socket.gethostname()
865 local = socket.gethostbyname(host)
866 elif net_type == 'elan':
867 # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
869 fp = open('/proc/elan/device0/position', 'r')
870 lines = fp.readlines()
878 nid = my_int(cluster_id) + my_int(elan_id)
880 except ValueError, e:
884 elif net_type == 'gm':
885 fixme("automatic local address for GM")
886 elif net_type == 'scimac':
887 scinode="/opt/scali/sbin/scinode"
888 if os.path.exists(scinode):
889 (rc,local) = run(scinode)
891 panic (scinode, " not found on node with scimac networking")
893 panic (scinode, " failed")
894 local=string.rstrip(local[0])
898 def mod_loaded(modname):
899 """Check if a module is already loaded. Look in /proc/modules for it."""
901 fp = open('/proc/modules')
902 lines = fp.readlines()
904 # please forgive my tired fingers for this one
905 ret = filter(lambda word, mod=modname: word == mod,
906 map(lambda line: string.split(line)[0], lines))
911 # XXX: instead of device_list, ask for $name and see what we get
912 def is_prepared(name):
913 """Return true if a device exists for the name"""
916 if (config.noexec or config.record) and config.cleanup:
919 # expect this format:
920 # 1 UP ldlm ldlm ldlm_UUID 2
921 out = lctl.device_list()
923 if name == string.split(s)[3]:
925 except CommandError, e:
929 def is_network_prepared():
930 """If the any device exists, then assume that all networking
931 has been configured"""
932 out = lctl.device_list()
935 def fs_is_mounted(path):
936 """Return true if path is a mounted lustre filesystem"""
938 fp = open('/proc/mounts')
939 lines = fp.readlines()
943 if a[1] == path and a[2] == 'lustre_lite':
951 """Manage kernel modules"""
def __init__(self, lustre_dir, portals_dir):
    """Track kernel modules to load from the lustre and portals source trees."""
    self.lustre_dir = lustre_dir
    self.portals_dir = portals_dir
    # ordered list of (src_dir, dev_dir, modname) tuples
    self.kmodule_list = list()
def add_portals_module(self, dev_dir, modname):
    """Queue a portals-tree module for loading."""
    entry = (self.portals_dir, dev_dir, modname)
    self.kmodule_list.append(entry)
def add_lustre_module(self, dev_dir, modname):
    """Queue a lustre-tree module for loading."""
    entry = (self.lustre_dir, dev_dir, modname)
    self.kmodule_list.append(entry)
965 def load_module(self):
966 """Load all the modules in the list in the order they appear."""
967 for src_dir, dev_dir, mod in self.kmodule_list:
968 if mod_loaded(mod) and not config.noexec:
970 log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
972 module = find_module(src_dir, dev_dir, mod)
974 panic('module not found:', mod)
975 (rc, out) = run('/sbin/insmod', module)
977 raise CommandError('insmod', out, rc)
979 (rc, out) = run('/sbin/modprobe', mod)
981 raise CommandError('modprobe', out, rc)
983 def cleanup_module(self):
984 """Unload the modules in the list in reverse order."""
985 rev = self.kmodule_list
987 for src_dir, dev_dir, mod in rev:
988 if not mod_loaded(mod) and not config.noexec:
991 if mod == 'portals' and config.dump:
992 lctl.dump(config.dump)
993 log('unloading module:', mod)
994 (rc, out) = run('/sbin/rmmod', mod)
996 log('! unable to unload module:', mod)
999 # ============================================================
1000 # Classes to prepare and cleanup the various objects
1003 """ Base class for the rest of the modules. The default cleanup method is
1004 defined here, as well as some utilitiy funcs.
1006 def __init__(self, module_name, db):
1008 self.module_name = module_name
1009 self.name = self.db.getName()
1010 self.uuid = self.db.getUUID()
1013 self.kmod = kmod(config.lustre, config.portals)
1015 def info(self, *args):
1016 msg = string.join(map(str,args))
1017 print self.module_name + ":", self.name, self.uuid, msg
1020 """ default cleanup, used for most modules """
1023 lctl.cleanup(self.name, self.uuid, config.force)
1024 except CommandError, e:
1025 log(self.module_name, "cleanup failed: ", self.name)
def add_portals_module(self, dev_dir, modname):
    """Forward a portals-module load request to this module's kmod tracker."""
    self.kmod.add_portals_module(dev_dir, modname)
def add_lustre_module(self, dev_dir, modname):
    """Forward a lustre-module load request to this module's kmod tracker."""
    self.kmod.add_lustre_module(dev_dir, modname)
def load_module(self):
    """Load every queued kernel module, in the order they were added."""
    self.kmod.load_module()
def cleanup_module(self):
    """Unload the queued modules (reverse order), but only when safe."""
    if not self.safe_to_clean():
        return
    self.kmod.cleanup_module()
1046 def safe_to_clean(self):
def safe_to_clean_modules(self):
    """By default, unloading modules is safe whenever device cleanup is."""
    return self.safe_to_clean()
1052 class Network(Module):
1053 def __init__(self,db):
1054 Module.__init__(self, 'NETWORK', db)
1055 self.net_type = self.db.get_val('nettype')
1056 self.nid = self.db.get_val('nid', '*')
1057 self.cluster_id = self.db.get_val('clusterid', "0")
1058 self.port = self.db.get_val_int('port', 0)
1059 self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
1060 self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
1061 self.irq_affinity = self.db.get_val_int('irqaffinity', 0)
1064 self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
1066 panic("unable to set nid for", self.net_type, self.nid, cluster_id)
1067 self.generic_nid = 1
1068 debug("nid:", self.nid)
1070 self.generic_nid = 0
1072 self.nid_uuid = self.nid_to_uuid(self.nid)
1074 self.hostaddr = self.db.get_val('hostaddr', self.nid)
1075 if '*' in self.hostaddr:
1076 self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id)
1077 if not self.hostaddr:
1078 panic("unable to set hostaddr for", self.net_type, self.hostaddr, self.cluster_id)
1079 debug("hostaddr:", self.hostaddr)
1081 self.add_portals_module("libcfs", 'portals')
1082 if node_needs_router():
1083 self.add_portals_module("router", 'kptlrouter')
1084 if self.net_type == 'tcp':
1085 self.add_portals_module("knals/socknal", 'ksocknal')
1086 if self.net_type == 'elan':
1087 self.add_portals_module("knals/qswnal", 'kqswnal')
1088 if self.net_type == 'gm':
1089 self.add_portals_module("knals/gmnal", 'kgmnal')
1090 if self.net_type == 'scimac':
1091 self.add_portals_module("knals/scimacnal", 'kscimacnal')
def nid_to_uuid(self, nid):
    """Derive the canonical uuid string used to register a network id."""
    return "NID_%s_UUID" % (nid,)
1097 if is_network_prepared():
1099 self.info(self.net_type, self.nid, self.port)
1100 if not (config.record and self.generic_nid):
1101 lctl.network(self.net_type, self.nid)
1102 if self.net_type == 'tcp':
1104 if self.net_type == 'elan':
1106 if self.port and node_is_router():
1107 run_one_acceptor(self.port)
1108 self.connect_peer_gateways()
1110 def connect_peer_gateways(self):
1111 for router in self.db.lookup_class('node'):
1112 if router.get_val_int('router', 0):
1113 for netuuid in router.get_networks():
1114 net = self.db.lookup(netuuid)
1116 if (gw.cluster_id == self.cluster_id and
1117 gw.net_type == self.net_type):
1118 if gw.nid != self.nid:
1121 def disconnect_peer_gateways(self):
1122 for router in self.db.lookup_class('node'):
1123 if router.get_val_int('router', 0):
1124 for netuuid in router.get_networks():
1125 net = self.db.lookup(netuuid)
1127 if (gw.cluster_id == self.cluster_id and
1128 gw.net_type == self.net_type):
1129 if gw.nid != self.nid:
1132 except CommandError, e:
1133 print "disconnect failed: ", self.name
1137 def safe_to_clean(self):
1138 return not is_network_prepared()
1141 self.info(self.net_type, self.nid, self.port)
1143 stop_acceptor(self.port)
1144 if node_is_router():
1145 self.disconnect_peer_gateways()
1147 class RouteTable(Module):
def __init__(self,db):
    """RouteTable manages this node's portals routing entries."""
    Module.__init__(self, 'ROUTES', db)
1151 def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
1153 # only setup connections for tcp NALs
1155 if not net_type in ('tcp',):
1158 # connect to target if route is to single node and this node is the gw
1159 if lo == hi and local_interface(net_type, gw_cluster_id, gw):
1160 if not local_cluster(net_type, tgt_cluster_id):
1161 panic("target", lo, " not on the local cluster")
1162 srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
1163 # connect to gateway if this node is not the gw
1164 elif (local_cluster(net_type, gw_cluster_id)
1165 and not local_interface(net_type, gw_cluster_id, gw)):
1166 srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
1171 panic("no server for nid", lo)
1174 return Network(srvdb)
1177 if is_network_prepared():
1180 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1181 lctl.add_route(net_type, gw, lo, hi)
1182 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1186 def safe_to_clean(self):
1187 return not is_network_prepared()
1190 if is_network_prepared():
1191 # the network is still being used, don't clean it up
1193 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1194 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1197 lctl.disconnect(srv)
1198 except CommandError, e:
1199 print "disconnect failed: ", self.name
1204 lctl.del_route(net_type, gw, lo, hi)
1205 except CommandError, e:
1206 print "del_route failed: ", self.name
1210 class Management(Module):
def __init__(self, db):
    """Management service: needs the core stack plus the mgmt_svc module."""
    Module.__init__(self, 'MGMT', db)
    # same modules, same order as before: core stack first, service last
    for dev_dir, mod in (('lvfs', 'lvfs'),
                         ('obdclass', 'obdclass'),
                         ('ptlrpc', 'ptlrpc'),
                         ('mgmt', 'mgmt_svc')):
        self.add_lustre_module(dev_dir, mod)
1219 if is_prepared(self.name):
1222 lctl.newdev("mgmt", self.name, self.uuid)
1224 def safe_to_clean(self):
1228 if is_prepared(self.name):
1229 Module.cleanup(self)
1231 # This is only needed to load the modules; the LDLM device
1232 # is now created automatically.
def __init__(self,db):
    """LDLM only pulls in the core modules; the device is created automatically."""
    Module.__init__(self, 'LDLM', db)
    # for these three, the source subdirectory and module name coincide
    for mod in ('lvfs', 'obdclass', 'ptlrpc'):
        self.add_lustre_module(mod, mod)
1247 def __init__(self, db, uuid, fs_name, name_override = None):
1248 Module.__init__(self, 'LOV', db)
1249 if name_override != None:
1250 self.name = "lov_%s" % name_override
1251 self.add_lustre_module('lov', 'lov')
1252 self.mds_uuid = self.db.get_first_ref('mds')
1253 mds= self.db.lookup(self.mds_uuid)
1254 self.mds_name = mds.getName()
1255 self.stripe_sz = self.db.get_val_int('stripesize', 65536)
1256 self.stripe_off = self.db.get_val_int('stripeoffset', 0)
1257 self.pattern = self.db.get_val_int('stripepattern', 0)
1258 self.devlist = self.db.get_refs('obd')
1259 self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
1261 self.desc_uuid = self.uuid
1262 self.uuid = generate_client_uuid(self.name)
1263 self.fs_name = fs_name
1264 for obd_uuid in self.devlist:
1265 obd = self.db.lookup(obd_uuid)
1266 osc = get_osc(obd, self.uuid, fs_name)
1268 self.osclist.append(osc)
1270 panic('osc not found:', obd_uuid)
1273 if is_prepared(self.name):
1275 for osc in self.osclist:
1277 # Only ignore connect failures with --force, which
1278 # isn't implemented here yet.
1279 osc.prepare(ignore_connect_failure=0)
1280 except CommandError, e:
1281 print "Error preparing OSC %s\n" % osc.uuid
1283 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
1284 self.stripe_off, self.pattern, self.devlist, self.mds_name)
1285 lctl.lov_setup(self.name, self.uuid,
1286 self.desc_uuid, self.mds_name, self.stripe_cnt,
1287 self.stripe_sz, self.stripe_off, self.pattern,
1288 string.join(self.devlist))
1291 if is_prepared(self.name):
1292 Module.cleanup(self)
1293 for osc in self.osclist:
1296 def load_module(self):
1297 for osc in self.osclist:
1300 Module.load_module(self)
1302 def cleanup_module(self):
1303 Module.cleanup_module(self)
1304 for osc in self.osclist:
1305 osc.cleanup_module()
1308 class MDSDEV(Module):
1309 def __init__(self,db):
1310 Module.__init__(self, 'MDSDEV', db)
1311 self.devpath = self.db.get_val('devpath','')
1312 self.size = self.db.get_val_int('devsize', 0)
1313 self.journal_size = self.db.get_val_int('journalsize', 0)
1314 self.fstype = self.db.get_val('fstype', '')
1315 self.nspath = self.db.get_val('nspath', '')
1316 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1317 # overwrite the orignal MDSDEV name and uuid with the MDS name and uuid
1318 target_uuid = self.db.get_first_ref('target')
1319 mds = self.db.lookup(target_uuid)
1320 self.name = mds.getName()
1321 self.filesystem_uuids = mds.get_refs('filesystem')
1322 # FIXME: if fstype not set, then determine based on kernel version
1323 self.format = self.db.get_val('autoformat', "no")
1324 if mds.get_val('failover', 0):
1325 self.failover_mds = 'f'
1327 self.failover_mds = 'n'
1328 active_uuid = get_active_target(mds)
1330 panic("No target device found:", target_uuid)
1331 if active_uuid == self.uuid:
1335 if self.active and config.group and config.group != ost.get_val('group'):
1338 self.target_dev_uuid = self.uuid
1339 self.uuid = target_uuid
1341 self.add_lustre_module('mdc', 'mdc')
1342 self.add_lustre_module('osc', 'osc')
1343 self.add_lustre_module('lov', 'lov')
1344 self.add_lustre_module('mds', 'mds')
1346 self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
1349 def load_module(self):
1351 Module.load_module(self)
1354 if is_prepared(self.name):
1357 debug(self.uuid, "not active")
1360 # run write_conf automatically, if --reformat used
1362 self.info(self.devpath, self.fstype, self.size, self.format)
1364 # never reformat here
1365 blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
1366 self.format, self.journal_size, self.mkfsoptions)
1367 if not is_prepared('MDT'):
1368 lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
1370 lctl.newdev("mds", self.name, self.uuid,
1371 setup ="%s %s %s" %(blkdev, self.fstype, self.name))
1372 except CommandError, e:
1374 panic("MDS is missing the config log. Need to run " +
1375 "lconf --write_conf.")
1379 def write_conf(self):
1380 if is_prepared(self.name):
1382 self.info(self.devpath, self.fstype, self.format)
1383 blkdev = block_dev(self.devpath, self.size, self.fstype,
1384 config.reformat, self.format, self.journal_size,
1386 lctl.newdev("mds", self.name, self.uuid,
1387 setup ="%s %s" %(blkdev, self.fstype))
1389 # record logs for the MDS lov
1390 for uuid in self.filesystem_uuids:
1391 log("recording clients for filesystem:", uuid)
1392 fs = self.db.lookup(uuid)
1393 obd_uuid = fs.get_first_ref('obd')
1394 client_uuid = generate_client_uuid(self.name)
1395 client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
1398 lctl.record(self.name, self.name)
1400 lctl.mount_option(self.name, client.get_name(), "")
1404 lctl.record(self.name, self.name + '-clean')
1406 lctl.del_mount_option(self.name)
1411 # record logs for each client
1413 config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
1415 config_options = CONFIG_FILE
1417 for node_db in self.db.lookup_class('node'):
1418 client_name = node_db.getName()
1419 for prof_uuid in node_db.get_refs('profile'):
1420 prof_db = node_db.lookup(prof_uuid)
1421 # refactor this into a funtion to test "clientness"
1423 for ref_class, ref_uuid in prof_db.get_all_refs():
1424 if ref_class in ('mountpoint','echoclient'):
1425 debug("recording", client_name)
1426 old_noexec = config.noexec
1428 noexec_opt = ('', '-n')
1429 ret, out = run (sys.argv[0],
1430 noexec_opt[old_noexec == 1],
1431 " -v --record --nomod",
1432 "--record_log", client_name,
1433 "--record_device", self.name,
1434 "--node", client_name,
1437 for s in out: log("record> ", string.strip(s))
1438 ret, out = run (sys.argv[0],
1439 noexec_opt[old_noexec == 1],
1440 "--cleanup -v --record --nomod",
1441 "--record_log", client_name + "-clean",
1442 "--record_device", self.name,
1443 "--node", client_name,
1446 for s in out: log("record> ", string.strip(s))
1447 config.noexec = old_noexec
1449 lctl.cleanup(self.name, self.uuid, 0, 0)
1450 except CommandError, e:
1451 log(self.module_name, "cleanup failed: ", self.name)
1454 Module.cleanup(self)
1455 clean_loop(self.devpath)
    def msd_remaining(self):
        # True iff lctl still lists any configured 'mds' device.
        # NOTE(review): the 'for s in out:' loop and return lines are not
        # visible in this extract -- confirm against the full file.
        out = lctl.device_list()
        if string.split(s)[2] in ('mds',):

    def safe_to_clean(self):
        # [body not visible in this extract]

    def safe_to_clean_modules(self):
        # Unloading modules is only safe once no MDS devices remain.
        return not self.msd_remaining()
1471 debug(self.uuid, "not active")
1474 if is_prepared(self.name):
1476 lctl.cleanup(self.name, self.uuid, config.force,
1478 except CommandError, e:
1479 log(self.module_name, "cleanup failed: ", self.name)
1482 Module.cleanup(self)
1483 if not self.msd_remaining() and is_prepared('MDT'):
1485 lctl.cleanup("MDT", "MDT_UUID", config.force,
1487 except CommandError, e:
1488 print "cleanup failed: ", self.name
1491 clean_loop(self.devpath)
    def __init__(self, db):
        # Object Storage Device (OST backend) service module: reads device,
        # size, journal and fs settings from the config database.
        Module.__init__(self, 'OSD', db)
        self.osdtype = self.db.get_val('osdtype')
        self.devpath = self.db.get_val('devpath', '')
        self.size = self.db.get_val_int('devsize', 0)
        self.journal_size = self.db.get_val_int('journalsize', 0)
        # NOTE(review): get_val_int with a string default looks wrong --
        # mkfsoptions is a command-line string; probably meant get_val.
        self.mkfsoptions = self.db.get_val_int('mkfsoptions', '')
        self.fstype = self.db.get_val('fstype', '')
        self.nspath = self.db.get_val('nspath', '')
        target_uuid = self.db.get_first_ref('target')
        ost = self.db.lookup(target_uuid)
        self.name = ost.getName()
        self.format = self.db.get_val('autoformat', 'yes')
        # 'f'/'n' failover flag is passed through to the obdfilter setup
        if ost.get_val('failover', 0):
            self.failover_ost = 'f'
        # [else: line not visible in extract]
            self.failover_ost = 'n'
        active_uuid = get_active_target(ost)
        # [guard line not visible in extract]
            panic("No target device found:", target_uuid)
        if active_uuid == self.uuid:
        # [active-flag assignment lines not visible in extract]
        if self.active and config.group and config.group != ost.get_val('group'):
        # [lines not visible in extract]
        self.target_dev_uuid = self.uuid
        self.uuid = target_uuid
        # [gap in extract]
        self.add_lustre_module('ost', 'ost')
        # FIXME: should we default to ext3 here?
        # [guard line not visible in extract]
        self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
        self.add_lustre_module(self.osdtype, self.osdtype)
    def load_module(self):
        # [gap: an 'if self.active:' style guard is not visible in extract]
        Module.load_module(self)
1535 # need to check /proc/mounts and /etc/mtab before
1536 # formatting anything.
1537 # FIXME: check if device is already formatted.
1539 if is_prepared(self.name):
1542 debug(self.uuid, "not active")
1544 self.info(self.osdtype, self.devpath, self.size, self.fstype,
1545 self.format, self.journal_size)
1547 if self.osdtype == 'obdecho':
1550 blkdev = block_dev(self.devpath, self.size, self.fstype,
1551 config.reformat, self.format, self.journal_size,
1553 lctl.newdev(self.osdtype, self.name, self.uuid,
1554 setup ="%s %s %s" %(blkdev, self.fstype,
1556 if not is_prepared('OSS'):
1557 lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
    def osd_remaining(self):
        # True iff lctl still lists any obdfilter/obdecho device.
        # NOTE(review): the 'for s in out:' loop and returns are not visible
        # in this extract.
        out = lctl.device_list()
        if string.split(s)[2] in ('obdfilter', 'obdecho'):

    def safe_to_clean(self):
        # [body not visible in this extract]

    def safe_to_clean_modules(self):
        # Unloading modules is only safe once no OSD devices remain.
        return not self.osd_remaining()
1573 debug(self.uuid, "not active")
1575 if is_prepared(self.name):
1578 lctl.cleanup(self.name, self.uuid, config.force,
1580 except CommandError, e:
1581 log(self.module_name, "cleanup failed: ", self.name)
1584 if not self.osd_remaining() and is_prepared('OSS'):
1586 lctl.cleanup("OSS", "OSS_UUID", config.force,
1588 except CommandError, e:
1589 print "cleanup failed: ", self.name
1592 if not self.osdtype == 'obdecho':
1593 clean_loop(self.devpath)
def mgmt_uuid_for_fs(mtpt_name):
    # Resolve the mgmt service uuid for the filesystem mounted at mtpt_name.
    # [guard lines not visible in this extract]
    mtpt_db = toplevel.lookup_name(mtpt_name)
    fs_uuid = mtpt_db.get_first_ref('filesystem')
    fs = toplevel.lookup(fs_uuid)
    # [guard line not visible in this extract]
    return fs.get_first_ref('mgmt')
1605 # Generic client module, used by OSC and MDC
class Client(Module):
    # Generic client module, used by OSC and MDC (and mgmt_cli).
    def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
    # [signature continuation ('module_dir=None):') not visible in extract]
        self.target_name = tgtdb.getName()
        self.target_uuid = tgtdb.getUUID()
        # [gap in extract]
        self.tgt_dev_uuid = get_active_target(tgtdb)
        if not self.tgt_dev_uuid:
            panic("No target device found for target:", self.target_name)
        # [gap in extract]
        self.kmod = kmod(config.lustre, config.portals)
        # [gap in extract]
        self.module = module
        self.module_name = string.upper(module)
        # default device name embeds module, hostname, target and fs
        self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
                                     self.target_name, fs_name)
        # [gap: 'if self_name:' guard not visible in extract]
            self.name = self_name
        # [gap in extract]
        self.lookup_server(self.tgt_dev_uuid)
        mgmt_uuid = mgmt_uuid_for_fs(fs_name)
        # [gap: branch lines not visible in extract]
        self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
        # [gap in extract]
        self.fs_name = fs_name
        # [gap in extract]
        self.add_lustre_module(module_dir, module)

    def lookup_server(self, srv_uuid):
        """ Lookup a server's network information """
        self._server_nets = get_ost_net(self.db, srv_uuid)
        if len(self._server_nets) == 0:
            panic ("Unable to find a server for:", srv_uuid)

    def get_servers(self):
        # List of Network objects for this client's server.
        return self._server_nets

    def prepare(self, ignore_connect_failure = 0):
        self.info(self.target_uuid)
        if is_prepared(self.name):
        # [gap in extract]
        srv = choose_local_server(self.get_servers())
        # [gap: non-local branch / try not visible in extract]
            routes = find_route(self.get_servers())
            if len(routes) == 0:
                panic ("no route to", self.target_uuid)
            for (srv, r) in routes:
                lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
        except CommandError, e:
            if not ignore_connect_failure:
        # [gap in extract]
        # --inactive services are configured but marked inactive
        if self.target_uuid in config.inactive and self.permits_inactive():
            debug("%s inactive" % self.target_uuid)
            inactive_p = "inactive"
        # [gap: else branch not visible in extract]
            debug("%s active" % self.target_uuid)
        # [gap in extract]
        lctl.newdev(self.module, self.name, self.uuid,
                    setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
                                            inactive_p, self.mgmt_name))

    # [the 'def cleanup(self):' line is not visible in this extract]
        if is_prepared(self.name):
            Module.cleanup(self)
        # [gap in extract]
            srv = choose_local_server(self.get_servers())
        # [gap in extract]
            lctl.disconnect(srv)
        # [gap in extract]
            for (srv, r) in find_route(self.get_servers()):
                lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
        except CommandError, e:
            log(self.module_name, "cleanup failed: ", self.name)
    def __init__(self, db, uuid, fs_name):
        # Metadata client for an MDS target.
        Client.__init__(self, db, uuid, 'mdc', fs_name)

    def permits_inactive(self):
        # [return value not visible in this extract; MDC and OSC differ here]
    def __init__(self, db, uuid, fs_name):
        # Object storage client for an OST target.
        Client.__init__(self, db, uuid, 'osc', fs_name)

    def permits_inactive(self):
        # [return value not visible in this extract; MDC and OSC differ here]
def mgmtcli_name_for_uuid(uuid):
    """Derive the canonical management-client device name for a mgmt UUID."""
    name = 'MGMTCLI_%s' % (uuid,)
    return name
class ManagementClient(Client):
    """Client for the management service.

    Unlike OSC/MDC it has no filesystem name, and its device name is
    derived from the management service's own config UUID.
    """
    def __init__(self, db, uuid):
        Client.__init__(self, db, uuid, 'mgmt_cli', '',
                        module_dir = 'mgmt',
                        self_name = mgmtcli_name_for_uuid(db.getUUID()))
1717 def __init__(self, db):
1718 Module.__init__(self, 'COBD', db)
1719 self.real_uuid = self.db.get_first_ref('realobd')
1720 self.cache_uuid = self.db.get_first_ref('cacheobd')
1721 self.add_lustre_module('cobd' , 'cobd')
1723 # need to check /proc/mounts and /etc/mtab before
1724 # formatting anything.
1725 # FIXME: check if device is already formatted.
1727 if is_prepared(self.name):
1729 self.info(self.real_uuid, self.cache_uuid)
1730 lctl.newdev("cobd", self.name, self.uuid,
1731 setup ="%s %s" %(self.real_uuid, self.cache_uuid))
1734 # virtual interface for OSC and LOV
    def __init__(self, db, uuid, fs_name, name_override = None):
        # Virtual OSC: wraps either a LOV or a plain OSC depending on the
        # class of the config object.
        Module.__init__(self, 'VOSC', db)
        if db.get_class() == 'lov':
            self.osc = LOV(db, uuid, fs_name, name_override)
        # [else: line not visible in this extract]
            self.osc = get_osc(db, uuid, fs_name)
1743 return self.osc.uuid
1745 return self.osc.name
    def load_module(self):
        # Delegate to the wrapped OSC/LOV.
        self.osc.load_module()
    def cleanup_module(self):
        # Delegate to the wrapped OSC/LOV.
        self.osc.cleanup_module()
class ECHO_CLIENT(Module):
    # Echo test client: drives an obdecho server through a VOSC.
    def __init__(self,db):
        Module.__init__(self, 'ECHO_CLIENT', db)
        self.add_lustre_module('obdecho', 'obdecho')
        self.obd_uuid = self.db.get_first_ref('obd')
        obd = self.db.lookup(self.obd_uuid)
        self.uuid = generate_client_uuid(self.name)
        self.osc = VOSC(obd, self.uuid, self.name)

    # [the 'def prepare(self):' line is not visible in this extract]
        if is_prepared(self.name):
        # [gap in extract]
        self.osc.prepare() # XXX This is so cheating. -p
        self.info(self.obd_uuid)
        # [gap in extract]
        lctl.newdev("echo_client", self.name, self.uuid,
                    setup = self.osc.get_name())

    # [the 'def cleanup(self):' line is not visible in this extract]
        if is_prepared(self.name):
            Module.cleanup(self)
        # [gap: osc cleanup not visible in extract]

    def load_module(self):
        # Wrapped OSC modules first, then this module's own.
        self.osc.load_module()
        Module.load_module(self)

    def cleanup_module(self):
        # Reverse order of load_module.
        Module.cleanup_module(self)
        self.osc.cleanup_module()
def generate_client_uuid(name):
    # Build a pseudo-random client uuid embedding (a truncated) name,
    # capped at the 36-character uuid length.
    # NOTE(review): the format has four conversions but only three arguments
    # are visible -- a continuation line (carrying `name`) is missing from
    # this extract.
    client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
        # [continuation line not visible in extract]
                                           int(random.random() * 1048576),
                                           int(random.random() * 1048576))
    return client_uuid[:36]
class Mountpoint(Module):
    # Client mountpoint: assembles VOSC + MDC (+ optional mgmt client) and
    # mounts the llite filesystem at the configured path.
    def __init__(self,db):
        Module.__init__(self, 'MTPT', db)
        self.path = self.db.get_val('path')
        self.fs_uuid = self.db.get_first_ref('filesystem')
        fs = self.db.lookup(self.fs_uuid)
        self.mds_uuid = fs.get_first_ref('mds')
        self.obd_uuid = fs.get_first_ref('obd')
        self.mgmt_uuid = fs.get_first_ref('mgmt')
        obd = self.db.lookup(self.obd_uuid)
        client_uuid = generate_client_uuid(self.name)
        self.vosc = VOSC(obd, client_uuid, self.name)
        self.mdc = get_mdc(db, client_uuid, self.name, self.mds_uuid)
        # [gap in extract]
        self.add_lustre_module('mdc', 'mdc')
        self.add_lustre_module('llite', 'llite')
        # [gap: mgmt_uuid guard not visible in extract]
        self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
        # [remaining args / else branch not visible in extract]

    # [the 'def prepare(self):' line is not visible in this extract]
        if fs_is_mounted(self.path):
            log(self.path, "already mounted.")
        # [gap in extract]
        self.mgmtcli.prepare()
        # [gap in extract]
        mdc_name = self.mdc.name
        # [gap in extract]
        self.info(self.path, self.mds_uuid, self.obd_uuid)
        if config.record or config.lctl_dump:
            lctl.mount_option(local_node_name, self.vosc.get_name(), mdc_name)
        # [gap: early return not visible in extract]
        cmd = "mount -t lustre_lite -o osc=%s,mdc=%s %s %s" % \
              (self.vosc.get_name(), mdc_name, config.config, self.path)
        run("mkdir", self.path)
        # [gap: run(cmd) / rc check not visible in extract]
        panic("mount failed:", self.path, ":", string.join(val))

    # [the 'def cleanup(self):' line is not visible in this extract]
        self.info(self.path, self.mds_uuid,self.obd_uuid)
        # [gap in extract]
        if config.record or config.lctl_dump:
            lctl.del_mount_option(local_node_name)
        # [gap in extract]
        if fs_is_mounted(self.path):
            # [force branch not visible in extract]
            (rc, out) = run("umount", "-f", self.path)
            # [else branch not visible in extract]
            (rc, out) = run("umount", self.path)
            # ['if rc:' guard not visible in extract]
            raise CommandError('umount', out, rc)
        # [gap in extract]
        if fs_is_mounted(self.path):
            panic("fs is still mounted:", self.path)
        # [gap in extract]
        self.mgmtcli.cleanup()

    def load_module(self):
        # [mgmtcli guard not visible in extract]
        self.mgmtcli.load_module()
        self.vosc.load_module()
        Module.load_module(self)

    def cleanup_module(self):
        # Reverse of load_module.
        Module.cleanup_module(self)
        self.vosc.cleanup_module()
        # [mgmtcli guard not visible in extract]
        self.mgmtcli.cleanup_module()
1878 # ============================================================
1879 # misc query functions
def get_ost_net(self, osd_uuid):
    # Return the list of Network objects for the node hosting osd_uuid.
    # [srv_list initialisation / guard lines not visible in extract]
    osd = self.lookup(osd_uuid)
    node_uuid = osd.get_first_ref('node')
    node = self.lookup(node_uuid)
    # ['if not node:' guard not visible in extract]
    panic("unable to find node for osd_uuid:", osd_uuid,
          " node_ref:", node_uuid)
    for net_uuid in node.get_networks():
        db = node.lookup(net_uuid)
        srv_list.append(Network(db))
    # [return not visible in extract]
1897 # the order of iniitailization is based on level.
def getServiceLevel(self):
    # Map a service class to its numeric startup level; initialization is
    # ordered by level (network first, mountpoints last).
    type = self.get_class()
    # [the per-branch 'ret = NN' assignments are not visible in this extract]
    if type in ('network',):
    elif type in ('routetbl',):
    elif type in ('ldlm',):
    elif type in ('mgmt',):
    elif type in ('osd', 'cobd'):
    elif type in ('mdsdev',):
    elif type in ('mountpoint', 'echoclient'):
    # [else branch not visible in extract]
        panic("Unknown type: ", type)
    # Services outside --minlevel/--maxlevel are filtered out.
    if ret < config.minlevel or ret > config.maxlevel:
    # [ret reset / return not visible in extract]
1923 # return list of services in a profile. list is a list of tuples
1924 # [(level, db_object),]
def getServices(self):
    # Return list of services in a profile: [(level, db_object), ...]
    # [list initialisation not visible in extract]
    for ref_class, ref_uuid in self.get_all_refs():
        servdb = self.lookup(ref_uuid)
        # ['if servdb:' guard not visible in extract]
        level = getServiceLevel(servdb)
        # [level filter not visible in extract]
        list.append((level, servdb))
        # [else branch not visible in extract]
        panic('service not found: ' + ref_uuid)
    # [sort / return not visible in extract]
1940 ############################################################
1942 # FIXME: clean this mess up!
1944 # OSC is no longer in the xml, so we have to fake it.
1945 # this is getting ugly and begging for another refactoring
def get_osc(ost_db, uuid, fs_name):
    # OSC is no longer in the xml, so it is synthesised from the OST db.
    osc = OSC(ost_db, uuid, fs_name)
    # [return not visible in extract]

def get_mdc(db, uuid, fs_name, mds_uuid):
    # Synthesise an MDC for the given MDS.
    mds_db = db.lookup(mds_uuid);
    # ['if not mds_db:' guard not visible in extract]
    panic("no mds:", mds_uuid)
    mdc = MDC(mds_db, uuid, fs_name)
    # [return not visible in extract]
1957 ############################################################
1958 # routing ("rooting")
1960 # list of (nettype, cluster_id, nid)
def find_local_clusters(node_db):
    # Populate the global local_clusters list (and the acceptors table)
    # from this node's configured networks.
    global local_clusters
    for netuuid in node_db.get_networks():
        net = node_db.lookup(netuuid)
        # [srv construction not visible in extract]
        debug("add_local", netuuid)
        local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
        # [port guard not visible in extract]
        if acceptors.has_key(srv.port):
            panic("duplicate port:", srv.port)
        acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
                                              srv.send_mem, srv.recv_mem,
        # [remaining constructor args not visible in extract]
1977 # This node is a gateway.
def node_is_router():
    # This node is a gateway. [body not visible in this extract]
1982 # If there are any routers found in the config, then this will be true
1983 # and all nodes will load kptlrouter.
def node_needs_router():
    # kptlrouter is needed either because some router exists anywhere in
    # the config (needs_router) or because this node itself routes.
    if needs_router:
        return needs_router
    return is_router
1988 # list of (nettype, gw, tgt_cluster_id, lo, hi)
1989 # Currently, these local routes are only added to kptlrouter route
1990 # table if they are needed to connect to a specific server. This
1991 # should be changed so all available routes are loaded, and the
1992 # ptlrouter can make all the decisions.
def find_local_routes(lustre):
    """ Scan the lustre config looking for routers . Build list of
    # [docstring continuation / close not visible in extract]
    global local_routes, needs_router
    # [initialisation not visible in extract]
    list = lustre.lookup_class('node')
    # [outer loop over nodes not visible in extract]
    if router.get_val_int('router', 0):
    # [needs_router assignment not visible in extract]
    for (local_type, local_cluster_id, local_nid) in local_clusters:
        # [gw initialisation not visible in extract]
        for netuuid in router.get_networks():
            db = router.lookup(netuuid)
            # A router net on one of our local clusters gives us a gateway.
            if (local_type == db.get_val('nettype') and
                local_cluster_id == db.get_val('clusterid')):
                gw = db.get_val('nid')
        # [gw guard not visible in extract]
        debug("find_local_routes: gw is", gw)
        for route in router.get_local_routes(local_type, gw):
            local_routes.append(route)
    debug("find_local_routes:", local_routes)
def choose_local_server(srv_list):
    # Prefer a server reachable on one of this node's local clusters.
    for srv in srv_list:
        if local_cluster(srv.net_type, srv.cluster_id):
        # [return not visible in extract]

def local_cluster(net_type, cluster_id):
    # True if (net_type, cluster_id) matches one of our local clusters.
    for cluster in local_clusters:
        if net_type == cluster[0] and cluster_id == cluster[1]:
        # [returns not visible in extract]

def local_interface(net_type, cluster_id, nid):
    # True if the exact (net_type, cluster_id, nid) triple is local.
    for cluster in local_clusters:
        if (net_type == cluster[0] and cluster_id == cluster[1]
            and nid == cluster[2]):
        # [returns not visible in extract]

def find_route(srv_list):
    # Build [(srv, route)] pairs for servers reachable via local_routes.
    # [result initialisation not visible in extract]
    frm_type = local_clusters[0][0]
    for srv in srv_list:
        debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
        to_type = srv.net_type
        # ['to = srv.nid' assignment not visible in extract -- confirm]
        cluster_id = srv.cluster_id
        debug ('looking for route to', to_type, to)
        for r in local_routes:
            debug("find_route: ", r)
            # route tuple: (nettype, gw, tgt_cluster_id, lo, hi)
            if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
                result.append((srv, r))
    # [return not visible in extract]
def get_active_target(db):
    # Resolve the currently-active target device uuid, honouring --select.
    target_uuid = db.getUUID()
    target_name = db.getName()
    node_name = get_select(target_name)
    # ['if node_name:' branch not visible in extract]
    tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
    # [else branch not visible in extract]
    tgt_dev_uuid = db.get_first_ref('active')
    # [return not visible in extract]

def get_server_by_nid_uuid(db, nid_uuid):
    # Find the Network object whose nid_uuid matches.
    for n in db.lookup_class("network"):
        # [Network construction not visible in extract]
        if net.nid_uuid == nid_uuid:
        # [returns not visible in extract]
2069 ############################################################
2073 type = db.get_class()
2074 debug('Service:', type, db.getName(), db.getUUID())
2079 n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
2080 elif type == 'network':
2082 elif type == 'routetbl':
2086 elif type == 'cobd':
2088 elif type == 'mdsdev':
2090 elif type == 'mountpoint':
2092 elif type == 'echoclient':
2094 elif type == 'mgmt':
2097 panic ("unknown service type:", type)
2101 # Prepare the system to run lustre using a particular profile
2102 # in a the configuration.
2103 # * load & the modules
2104 # * setup networking for the current node
2105 # * make sure partitions are in place and prepared
2106 # * initialize devices with lctl
2107 # Levels is important, and needs to be enforced.
def for_each_profile(db, prof_list, operation):
    # Apply `operation` (doSetup, doCleanup, doModules, ...) to the service
    # list of each profile referenced by this node.
    for prof_uuid in prof_list:
        prof_db = db.lookup(prof_uuid)
        # [guard not visible in extract]
        panic("profile:", profile, "not found.")
        services = getServices(prof_db)
        # [operation(services) call not visible in extract]
def doWriteconf(services):
    # --write_conf pass: only mdsdev services take part.
    # [loop header not visible in extract]
    if s[1].get_class() == 'mdsdev':
        n = newService(s[1])
        # [n.write_conf() call not visible in extract]

def doSetup(services):
    # [nosetup guard / loop header not visible in extract]
    n = newService(s[1])
    # [n.prepare() call not visible in extract]

def doModules(services):
    # [nomod guard / loop header not visible in extract]
    n = newService(s[1])
    # [n.load_module() call not visible in extract]

def doCleanup(services):
    # Cleanup runs over services in reverse order.
    # [guard / loop header not visible in extract]
    n = newService(s[1])
    if n.safe_to_clean():
    # [n.cleanup() call not visible in extract]

def doUnloadModules(services):
    # [guard / loop header not visible in extract]
    n = newService(s[1])
    if n.safe_to_clean_modules():
    # [n.cleanup_module() call not visible in extract]
def doHost(lustreDB, hosts):
    # Top-level driver: locate this host's node config and run the requested
    # operation (write_conf / recover / cleanup / setup) over its profiles.
    global is_router, local_node_name
    # [loop over candidate hostnames not visible in extract]
    node_db = lustreDB.lookup_name(h, 'node')
    # [break / not-found handling not visible in extract]
    print 'No host entry found.'
    # [gap in extract]
    local_node_name = node_db.get_val('name', 0)
    is_router = node_db.get_val_int('router', 0)
    lustre_upcall = node_db.get_val('lustreUpcall', '')
    portals_upcall = node_db.get_val('portalsUpcall', '')
    timeout = node_db.get_val_int('timeout', 0)
    # [gap in extract]
    find_local_clusters(node_db)
    # [router check not visible in extract]
    find_local_routes(lustreDB)
    # Two step process: (1) load modules, (2) setup lustre
    # if not cleaning, load modules first.
    prof_list = node_db.get_refs('profile')
    # [gap in extract]
    if config.write_conf:
        for_each_profile(node_db, prof_list, doModules)
        # [gap in extract]
        for_each_profile(node_db, prof_list, doWriteconf)
        for_each_profile(node_db, prof_list, doUnloadModules)
        # [gap in extract]
    elif config.recover:
        if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
            raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
                                     "--client_uuid <UUID> --conn_uuid <UUID>")
        doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
        # [final argument not visible in extract]
    elif config.cleanup:
        # [gap in extract]
        # the command line can override this value
        # [gap in extract]
        # ugly hack, only need to run lctl commands for --dump
        if config.lctl_dump or config.record:
            for_each_profile(node_db, prof_list, doCleanup)
            # [return not visible in extract]
        sys_set_timeout(timeout)
        # [gap in extract]
        sys_set_lustre_upcall(lustre_upcall)
        sys_set_portals_upcall(portals_upcall)
        # [gap in extract]
        for_each_profile(node_db, prof_list, doCleanup)
        for_each_profile(node_db, prof_list, doUnloadModules)
    # [else (setup) branch header not visible in extract]
        # ugly hack, only need to run lctl commands for --dump
        if config.lctl_dump or config.record:
            sys_set_timeout(timeout)
            sys_set_lustre_upcall(lustre_upcall)
            for_each_profile(node_db, prof_list, doSetup)
            # [return not visible in extract]
        sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
        sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
        # [gap in extract]
        for_each_profile(node_db, prof_list, doModules)
        # [gap in extract]
        sys_set_debug_path()
        # [gdb guard not visible in extract]
        script = config.gdb_script
        run(lctl.lctl, ' modules >', script)
        # [gap in extract]
        log ("The GDB module script is in", script)
        # pause, so user has time to break and
        # [attach-gdb sleep not visible in extract]
        sys_set_timeout(timeout)
        sys_set_lustre_upcall(lustre_upcall)
        sys_set_portals_upcall(portals_upcall)
        # [gap in extract]
        for_each_profile(node_db, prof_list, doSetup)
def doRecovery(db, lctl, tgt_uuid, client_uuid, nid_uuid):
    # Reconnect a client to the (possibly failed-over) active target:
    # disconnect the old nid, then recover onto the new one.
    tgt = db.lookup(tgt_uuid)
    # [guard not visible in extract]
    raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
    new_uuid = get_active_target(tgt)
    # [guard not visible in extract]
    raise Lustre.LconfError("doRecovery: no active target found for: " +
    # [continuation not visible in extract]
    net = choose_local_server(get_ost_net(db, new_uuid))
    # [guard not visible in extract]
    raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
    # [gap in extract]
    log("Reconnecting", tgt_uuid, " to ", net.nid_uuid);
    # [try / oldnet guard not visible in extract]
    oldnet = get_server_by_nid_uuid(db, nid_uuid)
    # [gap in extract]
    lctl.disconnect(oldnet)
    except CommandError, e:
        # best-effort: a failed disconnect does not abort recovery
        log("recover: disconnect", nid_uuid, "failed: ")
    # [try / connect not visible in extract]
    except CommandError, e:
        log("recover: connect failed")
    # [gap in extract]
    lctl.recover(client_uuid, net.nid_uuid)
def setupModulePath(cmd, portals_dir = PORTALS_DIR):
    # Derive config.lustre / config.portals module paths from the location
    # of the lconf binary (development trees) or from --lustre/--portals.
    base = os.path.dirname(cmd)
    if development_mode():
        if not config.lustre:
            config.lustre = (os.path.join(base, ".."))
        # normalize the portals dir, using command line arg if set
        # ['if config.portals:' guard not visible in extract]
        portals_dir = config.portals
        dir = os.path.join(config.lustre, portals_dir)
        config.portals = dir
        debug('config.portals', config.portals)
    elif config.lustre and config.portals:
        # [gap in extract]
        # if --lustre and --portals, normalize portals
        # can ignore PORTALS_DIR here, since it is probably useless here
        config.portals = os.path.join(config.lustre, config.portals)
        debug('config.portals B', config.portals)
def sysctl(path, val):
    # Write `val` to /proc/sys/<path>.
    debug("+ sysctl", path, val)
    # [noexec guard / try not visible in extract]
    fp = open(os.path.join('/proc/sys', path), 'w')
    # [write / close not visible in extract]
def sys_set_debug_path():
    """Point the portals debug-dump path at the configured location."""
    path = config.debug_path
    sysctl('portals/debug_path', path)
def sys_set_lustre_upcall(upcall):
    # the command overrides the value in the node config
    if config.lustre_upcall:
        upcall = config.lustre_upcall
    # ['elif config.upcall:' branch header not visible in extract]
    upcall = config.upcall
    # ['if upcall:' guard not visible in extract]
    lctl.set_lustre_upcall(upcall)

def sys_set_portals_upcall(upcall):
    # the command overrides the value in the node config
    if config.portals_upcall:
        upcall = config.portals_upcall
    # ['elif config.upcall:' branch header not visible in extract]
    upcall = config.upcall
    # ['if upcall:' guard not visible in extract]
    sysctl('portals/upcall', upcall)
def sys_set_timeout(timeout):
    """Set the lustre recovery timeout via lctl.

    The --timeout command-line option overrides the node-config value;
    missing or non-positive timeouts are ignored.
    """
    # the command overrides the value in the node config
    if config.timeout and config.timeout > 0:
        timeout = config.timeout
    # idiom fix: 'is not None' instead of '!= None' (same outcome here,
    # since the '> 0' test already excludes non-numeric values)
    if timeout is not None and timeout > 0:
        lctl.set_timeout(timeout)
def sys_tweak_socknal ():
    # --single_socket: tell socknal not to use typed socket bundles.
    if not config.single_socket:
        return
    sysctl("socknal/typed", 0)
def sys_optimize_elan ():
    # Disable event-interrupt punt loops on Elan interconnects.
    cmd = "echo 0 > /proc/elan/config/eventint_punt_loops"
    run (cmd)
def sys_set_ptldebug():
    # Evaluate the --ptldebug expression against ptldebug_names and push
    # the resulting mask into the portals debug sysctl.
    if config.ptldebug != None:
        # [try not visible in extract]
        val = eval(config.ptldebug, ptldebug_names)
        val = "0x%x" % (val,)
        sysctl('portals/debug', val)
        except NameError, e:
        # [error handling not visible in extract]

def sys_set_subsystem():
    # Same as sys_set_ptldebug, but for the subsystem debug mask.
    if config.subsystem != None:
        # [try not visible in extract]
        val = eval(config.subsystem, subsystem_names)
        val = "0x%x" % (val,)
        sysctl('portals/subsystem_debug', val)
        except NameError, e:
        # [error handling not visible in extract]

def sys_set_netmem_max(path, max):
    """Ensure a net-core sysctl (rmem_max/wmem_max) is at least `max`."""
    debug("setting", path, "to at least", max)
    # [current-value read / noexec guard not visible in extract]
    fp = open(path, 'w')
    fp.write('%d\n' %(max))
    # [close not visible in extract]
def sys_make_devices():
    """Create the portals/obd character device nodes if they are absent."""
    for node, cmd in (('/dev/portals', 'mknod /dev/portals c 10 240'),
                      ('/dev/obd', 'mknod /dev/obd c 10 241')):
        if not os.access(node, os.R_OK):
            run(cmd)
2377 # Add dir to the global PATH, if not already there.
def add_to_path(new_dir):
    # Append new_dir to $PATH unless it is already present.
    syspath = string.split(os.environ['PATH'], ':')
    if new_dir in syspath:
    # [early return not visible in extract]
    os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir

def default_debug_path():
    # Prefer a path under /r when that root exists.
    path = '/tmp/lustre-log'
    if os.path.isdir('/r'):
    # [returns not visible in extract]

def default_gdb_script():
    # Prefer a path under /r when that root exists.
    script = '/tmp/ogdb'
    if os.path.isdir('/r'):
        return '/r' + script
    # [else return not visible in extract]
# Directories that must always be searchable for external commands.
DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
# ensure basic elements are in the system path
def sanitise_path():
    for dir in DEFAULT_PATH:
    # [add_to_path(dir) call not visible in extract]

# global hack for the --select handling
def init_select(args):
    # args = [service=nodeA,service2=nodeB service3=nodeC]
    # Populates the global tgt_select map used by get_select().
    # [global declaration / outer loop not visible in extract]
    list = string.split(arg, ',')
    # [inner loop not visible in extract]
    srv, node = string.split(entry, '=')
    tgt_select[srv] = node

def get_select(srv):
    # Return the node selected for service `srv`, if any.
    if tgt_select.has_key(srv):
        return tgt_select[srv]
    # [default return not visible in extract]
# Shorthand aliases for the option-kind constants used in lconf_options.
FLAG = Lustre.Options.FLAG
PARAM = Lustre.Options.PARAM
INTPARAM = Lustre.Options.INTPARAM
PARAMLIST = Lustre.Options.PARAMLIST
2427 ('verbose,v', "Print system commands as they are run"),
2428 ('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM),
2429 ('config', "Cluster config name used for LDAP query", PARAM),
2430 ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
2431 ('node', "Load config for <nodename>", PARAM),
2432 ('cleanup,d', "Cleans up config. (Shutdown)"),
2433 ('force,f', "Forced unmounting and/or obd detach during cleanup",
2435 ('single_socket', "socknal option: only use one socket instead of bundle",
2437 ('failover',"""Used to shut down without saving state.
2438 This will allow this node to "give up" a service to a
2439 another node for failover purposes. This will not
2440 be a clean shutdown.""",
2442 ('gdb', """Prints message after creating gdb module script
2443 and sleeps for 5 seconds."""),
2444 ('noexec,n', """Prints the commands and steps that will be run for a
2445 config without executing them. This can used to check if a
2446 config file is doing what it should be doing"""),
2447 ('nomod', "Skip load/unload module step."),
2448 ('nosetup', "Skip device setup/cleanup step."),
2449 ('reformat', "Reformat all devices (without question)"),
2450 ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
2451 ('dump', "Dump the kernel debug log to file before portals is unloaded",
2453 ('write_conf', "Save all the client config information on mds."),
2454 ('record', "Write config information on mds."),
2455 ('record_log', "Name of config record log.", PARAM),
2456 ('record_device', "MDS device name that will record the config commands",
2458 ('minlevel', "Minimum level of services to configure/cleanup",
2460 ('maxlevel', """Maximum level of services to configure/cleanup
2461 Levels are aproximatly like:
2466 70 - mountpoint, echo_client, osc, mdc, lov""",
2468 ('lustre', """Base directory of lustre sources. This parameter will
2469 cause lconf to load modules from a source tree.""", PARAM),
2470 ('portals', """Portals source directory. If this is a relative path,
2471 then it is assumed to be relative to lustre. """, PARAM),
2472 ('timeout', "Set recovery timeout", INTPARAM),
2473 ('upcall', "Set both portals and lustre upcall script", PARAM),
2474 ('lustre_upcall', "Set lustre upcall script", PARAM),
2475 ('portals_upcall', "Set portals upcall script", PARAM),
2476 ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
2477 ('ptldebug', "Set the portals debug level", PARAM),
2478 ('subsystem', "Set the portals debug subsystem", PARAM),
2479 ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
2480 ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
2481 # Client recovery options
2482 ('recover', "Recover a device"),
2483 ('group', "The group of devices to configure or cleanup", PARAM),
2484 ('tgt_uuid', "The failed target (required for recovery)", PARAM),
2485 ('client_uuid', "The failed client (required for recovery)", PARAM),
2486 ('conn_uuid', "The failed connection (required for recovery)", PARAM),
2488 ('inactive', """The name of an inactive service, to be ignored during
2489 mounting (currently OST-only). Can be repeated.""",
2494 global lctl, config, toplevel, CONFIG_FILE
2496 # in the upcall this is set to SIG_IGN
2497 signal.signal(signal.SIGCHLD, signal.SIG_DFL)
2499 cl = Lustre.Options("lconf", "config.xml", lconf_options)
2501 config, args = cl.parse(sys.argv[1:])
2502 except Lustre.OptionError, e:
2506 setupModulePath(sys.argv[0])
2508 host = socket.gethostname()
2510 # the PRNG is normally seeded with time(), which is not so good for starting
2511 # time-synchronized clusters
2512 input = open('/dev/urandom', 'r')
2514 print 'Unable to open /dev/urandom!'
2516 seed = input.read(32)
2522 init_select(config.select)
2525 if not os.access(args[0], os.R_OK):
2526 print 'File not found or readable:', args[0]
2529 dom = xml.dom.minidom.parse(args[0])
2531 panic("%s does not appear to be a config file." % (args[0]))
2532 sys.exit(1) # make sure to die here, even in debug mode.
2533 CONFIG_FILE = args[0]
2534 db = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
2535 if not config.config:
2536 config.config = os.path.basename(args[0])# use full path?
2537 if config.config[-4:] == '.xml':
2538 config.config = config.config[:-4]
2539 elif config.ldapurl:
2540 if not config.config:
2541 panic("--ldapurl requires --config name")
2542 dn = "config=%s,fs=lustre" % (config.config)
2543 db = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
2545 print 'Missing config file or ldap URL.'
2546 print 'see lconf --help for command summary'
2551 ver = db.get_version()
2553 panic("No version found in config data, please recreate.")
2554 if ver != Lustre.CONFIG_VERSION:
2555 panic("Config version", ver, "does not match lconf version",
2556 Lustre.CONFIG_VERSION)
2560 node_list.append(config.node)
2563 node_list.append(host)
2564 node_list.append('localhost')
2566 debug("configuring for host: ", node_list)
2569 config.debug_path = config.debug_path + '-' + host
2570 config.gdb_script = config.gdb_script + '-' + host
2572 lctl = LCTLInterface('lctl')
2574 if config.lctl_dump:
2575 lctl.use_save_file(config.lctl_dump)
2578 if not (config.record_device and config.record_log):
2579 panic("When recording, both --record_log and --record_device must be specified.")
2580 lctl.record(config.record_device, config.record_log)
2582 doHost(db, node_list)
if __name__ == "__main__":
    # [try / main() invocation not visible in extract]
    except Lustre.LconfError, e:
    # [error reporting not visible in extract]
    # traceback.print_exc(file=sys.stdout)
    except CommandError, e:
    # [e.dump / exit-code handling not visible in extract]
        # propagate the first cleanup failure as the process exit status
        if first_cleanup_error:
            sys.exit(first_cleanup_error)