3 # Copyright (C) 2002 Cluster File Systems, Inc.
4 # Author: Robert Read <rread@clusterfs.com>
5 # This file is part of Lustre, http://www.lustre.org.
7 # Lustre is free software; you can redistribute it and/or
8 # modify it under the terms of version 2 of the GNU General Public
9 # License as published by the Free Software Foundation.
11 # Lustre is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with Lustre; if not, write to the Free Software
18 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 # lconf - lustre configuration tool
22 # lconf is the main driver script for starting and stopping
23 # lustre filesystem services.
25 # Based in part on the XML obdctl modifications done by Brian Behlendorf
27 import sys, getopt, types
28 import string, os, stat, popen2, socket, time, random, fcntl, select
30 import xml.dom.minidom
# Python 1.x kept the file-status flag constants in the separate FCNTL
# module; later interpreters provide them in fcntl itself.
if sys.version[0] == '1':
    from FCNTL import F_GETFL, F_SETFL
# NOTE(review): the "else:" line introducing this import is missing from
# this copy of the file.
from fcntl import F_GETFL, F_SETFL
# Default socket buffer size (bytes) handed to the acceptor for TCP
# send/receive memory (1 MB).
DEFAULT_TCPBUF = 1048576

# Maximum number of devices to search for.
# (the /dev/loop* nodes need to be created beforehand)
MAX_LOOP_DEVICES = 256
# Placeholder substituted at build/install time with the portals location.
PORTALS_DIR = '@PORTALSLOC@'
# Holds the exit status of the first cleanup step that failed, so the
# overall run can report the earliest problem; 0 means no failure yet.
first_cleanup_error = 0

def cleanup_error(rc):
    """Record rc as the first cleanup error unless one is already set."""
    global first_cleanup_error
    if first_cleanup_error == 0:
        first_cleanup_error = rc
# NOTE(review): body of the usage/help routine.  The enclosing "def" line,
# several option lines, and the closing triple-quote / exit call are missing
# from this copy.  The text below is runtime output and is left byte-identical.
print """usage: lconf config.xml
config.xml Lustre configuration in xml format.
--ldapurl LDAP server URL, eg. ldap://localhost
--config Cluster config name used for LDAP query
--node <nodename> Load config for <nodename>
--select service=nodeA,service2=nodeB U
-d | --cleanup Cleans up config. (Shutdown)
-f | --force Forced unmounting and/or obd detach during cleanup
-v | --verbose Print system commands as they are run
-h | --help Print this help
--gdb Prints message after creating gdb module script
and sleeps for 5 seconds.
-n | --noexec Prints the commands and steps that will be run for a
config without executing them. This can used to check if a
config file is doing what it should be doing. (Implies -v)
--nomod Skip load/unload module step.
--nosetup Skip device setup/cleanup step.
--reformat Reformat all devices (without question)
--dump <file> Dump the kernel debug log before portals is unloaded
--minlevel <num> Specify the minimum level of services to configure/cleanup (default 0)
--maxlevel <num> Specify the maximum level of services to configure/cleanup (default 100)
Levels are aproximatly like:
70 - mountpoint, echo_client
--lustre=src_dir Base directory of lustre sources. This parameter will cause lconf
to load modules from a source tree.
--portals=src_dir Portals source directory. If this is a relative path, then it is
assumed to be relative to lustre.
--ldap server LDAP server with lustre config database
--makeldiff Translate xml source to LDIFF
This are perhaps not needed:
# ============================================================
# Config parameters, encapsulated in a class
# NOTE(review): the "class Config" header, the __init__ signature, and many
# attribute initializers are missing from this copy; only a slice of the
# defaults survives below.  Indentation reconstructed.
self._gdb_script = '/tmp/ogdb'        # path of the generated gdb helper script
self._debug_path = '/tmp/lustre-log'  # where the kernel debug log is saved
self._dump_file = None                # optional --dump target
self._lustre_dir = ''                 # --lustre source tree
self._portals_dir = ''                # --portals source tree
self._recovery_upcall = ''
self._config_name = ''                # --config name for LDAP queries

# Combined getter/setters: a truthy argument stores the new value.
# NOTE(review): the trailing "return self._<attr>" lines of these two
# accessors are missing from this copy.
def verbose(self, flag = None):
    if flag: self._verbose = flag

def noexec(self, flag = None):
    if flag: self._noexec = flag
def reformat(self, flag = None):
    """Get/set the --reformat option; a truthy flag enables it."""
    if flag:
        self._reformat = flag
    return self._reformat
# Combined getter/setters for the remaining command-line flags.
# NOTE(review): every accessor below is missing its trailing
# "return self._<attr>" line in this copy; code left byte-identical.
def cleanup(self, flag = None):
    if flag: self._cleanup = flag

def gdb(self, flag = None):
    if flag: self._gdb = flag

def nomod(self, flag = None):
    if flag: self._nomod = flag

def nosetup(self, flag = None):
    if flag: self._nosetup = flag

def force(self, flag = None):
    if flag: self._force = flag

def node(self, val = None):
    if val: self._node = val
def gdb_script(self):
    """Return the gdb script path, rooted under /r when that chroot exists."""
    if os.path.isdir('/r'):
        return '/r' + self._gdb_script
    # NOTE(review): an "else:" line is missing here in this copy; the plain
    # return below is the fallback path.
    return self._gdb_script

def debug_path(self):
    """Return the debug-log path, rooted under /r when that chroot exists."""
    if os.path.isdir('/r'):
        return '/r' + self._debug_path
    # NOTE(review): same missing "else:" as above.
    return self._debug_path
def dump_file(self, val = None):
    """Get/set the --dump target; a truthy val replaces the stored path."""
    if val:
        self._dump_file = val
    return self._dump_file
def minlevel(self, val = None):
    """Get/set the minimum service level; a truthy val is coerced via int()."""
    if val:
        self._minlevel = int(val)
    return self._minlevel
def maxlevel(self, val = None):
    """Get/set the maximum service level; a truthy val is coerced via int()."""
    if val:
        self._maxlevel = int(val)
    return self._maxlevel
def portals_dir(self, val = None):
    """Get/set the portals source directory (--portals)."""
    if val:
        self._portals_dir = val
    return self._portals_dir
def lustre_dir(self, val = None):
    """Get/set the lustre source directory (--lustre)."""
    if val:
        self._lustre_dir = val
    return self._lustre_dir
# Get/set the timeout value.
# NOTE(review): the trailing "return self._timeout" line is missing from
# this copy; code left byte-identical.
def timeout(self, val = None):
    if val: self._timeout = val
def recovery_upcall(self, val = None):
    """Get/set the recovery upcall path."""
    if val:
        self._recovery_upcall = val
    return self._recovery_upcall
# Get/set the LDAP server URL (--ldapurl).
# NOTE(review): the trailing "return self._ldapurl" line is missing from
# this copy; code left byte-identical.
def ldapurl(self, val = None):
    if val: self._ldapurl = val
def config_name(self, val = None):
    """Get/set the cluster config name used for LDAP queries (--config)."""
    if val:
        self._config_name = val
    return self._config_name
def init_select(self, arg):
    # arg = "service=nodeA,service2=nodeB"
    # Parse the --select argument into the _select mapping.
    list = string.split(arg, ',')
    # NOTE(review): the "for entry in list:" line is missing from this copy;
    # the two statements below are that loop's body.
    srv, node = string.split(entry, '=')
    self._select[srv] = node

def select(self, srv):
    """Return the node selected for service srv, if any was given."""
    if self._select.has_key(srv):
        return self._select[srv]
    # NOTE(review): the fall-through (likely "return None") is missing here.
def lctl_dump(self, val = None):
    """Get/set the lctl dump-file mode flag."""
    if val:
        self._lctl_dump = val
    return self._lctl_dump
230 # ============================================================
231 # debugging and error funcs
def fixme(msg = "this feature"):
    """Abort by raising LconfError noting that *msg* is not implemented."""
    # Call form of raise: identical behavior to the old "raise E, arg" style.
    raise LconfError(msg + ' not implmemented yet.')
# NOTE(review): bodies of the panic/log/debug helpers; their "def" lines and
# several statements are missing from this copy.  Code left byte-identical.
# panic(): join the args into one message and raise unless --noexec is on.
msg = string.join(map(str,args))
if not config.noexec():
    raise LconfError(msg)
# log(): join args and print them.
msg = string.join(map(str,args))
# logall()-style body: print each line stripped.
print string.strip(s)
# debug(): join args, printed only when verbose.
msg = string.join(map(str,args))
# ============================================================
# locally defined exceptions
class CommandError (exceptions.Exception):
    """Raised when an external command (lctl, insmod, ...) fails.

    NOTE(review): several lines of this class are missing from this copy
    (e.g. the "self.rc = rc" assignment and the dump() method header);
    code left byte-identical."""
    def __init__(self, cmd_name, cmd_err, rc=None):
        self.cmd_name = cmd_name
        self.cmd_err = cmd_err

    # Body of the (missing) dump() method: pretty-print the failure.
    # cmd_err may be a plain string or a list of output lines.
    if type(self.cmd_err) == types.StringType:
        print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
        print "! %s: %s" % (self.cmd_name, self.cmd_err)
    elif type(self.cmd_err) == types.ListType:
        print "! %s (error %d):" % (self.cmd_name, self.rc)
        print "! %s:" % (self.cmd_name)
        for s in self.cmd_err:
            print "> %s" %(string.strip(s))

class LconfError (exceptions.Exception):
    """Generic lconf failure."""
    def __init__(self, args):
        """NOTE(review): the body (presumably "self.args = args") is
        missing from this copy."""
# ============================================================
# handle daemons, like the acceptor
# NOTE(review): the "class DaemonHandler" line and many statements of its
# methods are missing from this copy; code left byte-identical.
""" Manage starting and stopping a daemon. Assumes daemon manages
it's own pid file. """

def __init__(self, cmd):
    # start(): body fragment — refuse to start twice, locate the binary,
    # then run it with its command line.
    log(self.command, "already running.")
    self.path = find_prog(self.command)
    panic(self.command, "not found.")
    ret, out = runcmd(self.path +' '+ self.command_line())
    raise CommandError(self.path, out, ret)

# stop(): body fragment — kill the daemon recorded in the pidfile.
pid = self.read_pidfile()
log ("killing process", pid)
#time.sleep(1) # let daemon die
log("unable to kill", self.command, e)
log("unable to kill", self.command)

# running(): body fragment — check the pidfile for a live process.
pid = self.read_pidfile()

def read_pidfile(self):
    # Return the pid stored in the daemon's pidfile (remainder missing).
    fp = open(self.pidfile(), 'r')

def clean_pidfile(self):
    """ Remove a stale pidfile """
    log("removing stale pidfile:", self.pidfile())
    os.unlink(self.pidfile())
    log(self.pidfile(), e)
class AcceptorHandler(DaemonHandler):
    """Daemon handler for the portals TCP/TOE acceptor.

    NOTE(review): several __init__ assignments (port, flags initialization)
    and the pidfile() method header are missing from this copy."""
    def __init__(self, port, net_type, send_mem, recv_mem, irq_aff, nid_xchg):
        DaemonHandler.__init__(self, "acceptor")
        self.send_mem = send_mem
        self.recv_mem = recv_mem
        # TOE gets extra NAL threads; -i / -x flag lines presumably sit
        # under conditions whose lines are missing here.
        if net_type == 'toe':
            self.flags = self.flags + ' -N 4'
        self.flags = self.flags + ' -i'
        self.flags = self.flags + ' -x'

    # Body of the (missing) pidfile() method: one pidfile per port.
    return "/var/run/%s-%d.pid" % (self.command, self.port)
def command_line(self):
    """Build the acceptor argument string: -s <sendmem> -r <recvmem> <flags> <port>."""
    parts = ('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port)
    return ' '.join(map(str, parts))
# start the acceptors
# NOTE(review): the enclosing "def" line and the daemon.start()/stop()
# calls are missing from this copy; code left byte-identical.
for port in acceptors.keys():
    daemon = acceptors[port]
    if not daemon.running():

def stop_acceptor(port):
    # Stop the acceptor daemon registered for this port, if any.
    if acceptors.has_key(port):
        daemon = acceptors[port]
# ============================================================
# handle lctl interface
# NOTE(review): the class header and the docstring quote lines are missing
# from this copy; the bare text lines below are docstring interiors.
Manage communication with lctl

def __init__(self, cmd):
    Initialize close by finding the lctl binary.
    self.lctl = find_prog(cmd)
    # Fall back to a debug message under --noexec, otherwise hard error.
    debug('! lctl not found')
    raise CommandError('lctl', "unable to find lctl binary.")

def use_save_file(self, file):
    # Record a file path; commands are then dumped there instead of run.
    self.save_file = file
def set_nonblock(self, fd):
    """Put fd into non-blocking mode by OR-ing O_NDELAY into its flags."""
    flags = fcntl.fcntl(fd, F_GETFL)
    fcntl.fcntl(fd, F_SETFL, flags | os.O_NDELAY)
# NOTE(review): body of the lctl run() method plus runcmd().  The "def"
# line, the try/except scaffolding, the while loop header, and several
# other lines are missing from this copy; code left byte-identical.
the cmds are written to stdin of lctl
lctl doesn't return errors when run in script mode, so
should modify command line to accept multiple commands, or
create complex command line options
# When a save file is configured, prepend a dump command so lctl records
# the script instead of executing it.
cmds = '\n dump ' + self.save_file + cmds
debug("+", cmd_line, cmds)
if config.noexec(): return (0, [])
child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
child.tochild.write(cmds + "\n")
child.tochild.close()

# Read stdout and stderr concurrently with non-blocking fds so neither
# pipe can deadlock the child.
# From "Python Cookbook" from O'Reilly
outfile = child.fromchild
outfd = outfile.fileno()
self.set_nonblock(outfd)
errfile = child.childerr
errfd = errfile.fileno()
self.set_nonblock(errfd)
outdata = errdata = ''
# (loop header missing) — drain both pipes until EOF on each.
ready = select.select([outfd,errfd],[],[]) # Wait for input
if outfd in ready[0]:
    outchunk = outfile.read()
    if outchunk == '': outeof = 1
    outdata = outdata + outchunk
if errfd in ready[0]:
    errchunk = errfile.read()
    if errchunk == '': erreof = 1
    errdata = errdata + errchunk
if outeof and erreof: break
# end of "borrowed" code

# Convert the wait() status into an exit code; anything on stderr is
# treated as failure because lctl's script mode hides errors.
if os.WIFEXITED(ret):
    rc = os.WEXITSTATUS(ret)
if rc or len(errdata):
    raise CommandError(self.lctl, errdata, rc)

def runcmd(self, *args):
    # run lctl using the command line (docstring quotes missing in copy)
    run lctl using the command line
    cmd = string.join(map(str,args))
    debug("+", self.lctl, cmd)
    rc, out = run(self.lctl, cmd)
    raise CommandError(self.lctl, out, rc)
# NOTE(review): the lctl command wrappers below all build multi-line
# "network ... quit" scripts fed to self.run(); most of the triple-quoted
# command strings are truncated in this copy (only their tails survive).
# Code left byte-identical; comments added only outside string regions.
def network(self, net, nid):
    """ initialized network and add "self" """
    # Idea: "mynid" could be used for all network types to add "self," and then
    # this special case would be gone and the "self" hack would be hidden.
    if net in ('tcp', 'toe'):
quit """ % (net, nid)

# create a new connection
def connect(self, srv):
    cmds = "\n add_uuid %s %s %s" % (srv.uuid, srv.nid, srv.net_type)
    if srv.net_type in ('tcp', 'toe') and not config.lctl_dump():
connect %s %d %s""" % (cmds, srv.net_type,
srv.hostaddr, srv.port, flags )
    cmds = cmds + "\n quit"

# add a route to a range
def add_route(self, net, gw, lo, hi):

def del_route(self, net, gw, lo, hi):

# add a route to a host
def add_route_host(self, net, uuid, gw, tgt):

# add a route to a range
def del_route_host(self, net, uuid, gw, tgt):
quit """ % (net, uuid, tgt)

# disconnect one connection
def disconnect(self, net, nid, port, servuuid):
quit""" % (net, nid, servuuid)

def disconnectAll(self, net):

# create a new device with lctl
def newdev(self, attach, setup = ""):
quit""" % (attach, setup)

def cleanup(self, name, uuid):
quit""" % (name, ('', 'force')[config.force()])

# Configure LOV striping on the MDS.
# NOTE(review): the format string is truncated here; cannot confirm the
# placeholder count matches the seven arguments.
def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist):
lov_setconfig %s %d %d %d %s %s
quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)

# Dump the kernel debug buffer to dump_file.
def dump(self, dump_file):
quit""" % (dump_file)

# get list of devices
def device_list(self):
    rc, out = self.runcmd('device_list')

def lustre_version(self):
    rc, out = self.runcmd('version')
# ============================================================
# Various system-level functions
# (ideally moved to their own module)
# NOTE(review): the "def" lines for run()/the *args wrapper and several
# statements are missing from this copy; code left byte-identical.

# Run a command and return the output and status.
# stderr is sent to /dev/null, could use popen3 to
# save it if necessary
if config.noexec(): return (0, [])
f = os.popen(cmd + ' 2>&1')

# Wrapper body: join *args into one command string.
cmd = string.join(map(str,args))

# Run a command in the background.
def run_daemon(*args):
    cmd = string.join(map(str,args))
    if config.noexec(): return 0
    f = os.popen(cmd + ' 2>&1')

# Determine full path to use for an external command
# searches dirname(argv[0]) first, then PATH
# (find_prog def line missing in this copy)
syspath = string.split(os.environ['PATH'], ':')
cmdpath = os.path.dirname(sys.argv[0])
syspath.insert(0, cmdpath);
if config.portals_dir():
    syspath.insert(0, os.path.join(config.portals_dir()+'/linux/utils/'))
# (loop header over syspath missing)
prog = os.path.join(d,cmd)
if os.access(prog, os.X_OK):
# Recursively look for file starting at base dir
def do_find_file(base, mod):
    # NOTE(review): the "return" lines of this search are missing from
    # this copy; code left byte-identical.
    fullname = os.path.join(base, mod)
    if os.access(fullname, os.R_OK):
    for d in os.listdir(base):
        dir = os.path.join(base,d)
        if os.path.isdir(dir):
            module = do_find_file(dir, mod)

def find_module(src_dir, dev_dir, modname):
    # Look for modname.o first at the expected path, then recursively.
    mod = '%s.o' % (modname)
    module = src_dir +'/'+ dev_dir +'/'+ mod
    if os.access(module, os.R_OK):

# is the path a block device?
# (is_block def line and os.stat call missing in this copy)
return stat.S_ISBLK(s[stat.ST_MODE])
# build fs according to type
# precede with a check on size?
def mkfs(dev, devsize, fstype):
    # devsize is in 1k, and fs block count is in 4k
    block_cnt = devsize/4
    # Pick the mkfs command for the filesystem type; -F forces ext2/3
    # creation without prompting.
    if(fstype in ('ext3', 'extN')):
        mkfs = 'mkfs.ext2 -j -b 4096 -F '
    elif (fstype == 'reiserfs'):
        mkfs = 'mkreiserfs -ff'
    print 'unsupported fs type: ', fstype
    (ret, out) = run (mkfs, dev, block_cnt)
    panic("Unable to build fs:", dev)
    # enable hash tree indexing on fsswe
    # FIXME: this check can probably go away on 2.5
    htree = 'echo "feature FEATURE_C5" | debugfs -w'
    (ret, out) = run (htree, dev)
    panic("Unable to enable htree:", dev)

# some systems use /dev/loopN, some /dev/loop/N
# NOTE(review): the loop_base() def line and the assignments of "loop"
# are missing from this copy; both probes below test the candidate bases.
if not os.access(loop + str(0), os.R_OK):
if not os.access(loop + str(0), os.R_OK):
    panic ("can't access loop devices")
# find loop device assigned to thefile
# NOTE(review): the find_loop() def line is missing; also "re" is used
# below but no "import re" is visible in this copy of the file.
for n in xrange(0, MAX_LOOP_DEVICES):
    if os.access(dev, os.R_OK):
        # losetup with no args prints "dev: ... (backing-file)"; match
        # the parenthesized backing file against the requested one.
        (stat, out) = run('losetup', dev)
        if (out and stat == 0):
            m = re.search(r'\((.*)\)', out[0])
            if m and file == m.group(1):

# create file if necessary and assign the first free loop device
def init_loop(file, size, fstype):
    dev = find_loop(file)
    print 'WARNING file:', file, 'already mapped to', dev
    # Create/extend the sparse backing file with dd when reformatting or
    # when it does not exist yet.  NOTE(review): the dd command line below
    # is cut off mid-expression in this copy.
    if config.reformat() or not os.access(file, os.R_OK | os.W_OK):
        panic(file, "size must be larger than 8MB, currently set to:", size)
        (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size,
        panic("Unable to create backing store:", file)
    # find next free loop
    for n in xrange(0, MAX_LOOP_DEVICES):
        if os.access(dev, os.R_OK):
            (stat, out) = run('losetup', dev)
            run('losetup', dev, file)
    print "out of loop devices"
    print "out of loop devices"

# undo loop assignment
def clean_loop(file):
    dev = find_loop(file)
    ret, out = run('losetup -d', dev)
    log('unable to clean loop device:', dev, 'for file:', file)
# determine if dev is formatted as a <fstype> filesystem
def need_format(fstype, dev):
    # FIXME don't know how to implement this
    # NOTE(review): the return statement is missing from this copy.

# initialize a block device if needed
def block_dev(dev, size, fstype, format):
    # Map non-block paths onto a loop device, then (re)format when asked.
    # NOTE(review): the trailing "return dev" is missing from this copy.
    if config.noexec(): return dev
    if not is_block(dev):
        dev = init_loop(dev, size, fstype)
    if config.reformat() or (need_format(fstype, dev) and format == 'yes'):
        mkfs(dev, size, fstype)
#    panic("device:", dev,
#          "not prepared, and autoformat is not set.\n",
#          "Rerun with --reformat option to format ALL filesystems")
801 """lookup IP address for an interface"""
802 rc, out = run("/sbin/ifconfig", iface)
805 addr = string.split(out[1])[1]
806 ip = string.split(addr, ':')[1]
809 def get_local_nid(net_type, wildcard):
810 """Return the local nid. First look for an elan interface,
811 then use the local address. """
813 if os.access('/proc/elan/device0/position', os.R_OK):
814 local = get_local_address('elan', '*')
816 local = get_local_address(net_type, wildcard)
819 def get_local_address(net_type, wildcard):
820 """Return the local address for the network type."""
822 if net_type in ('tcp', 'toe'):
824 iface, star = string.split(wildcard, ':')
825 local = if2addr(iface)
827 panic ("unable to determine ip for:", wildcard)
829 host = socket.gethostname()
830 local = socket.gethostbyname(host)
831 elif net_type == 'elan':
832 # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
834 fp = open('/proc/elan/device0/position', 'r')
835 lines = fp.readlines()
844 elif net_type == 'gm':
845 fixme("automatic local address for GM")
def is_prepared(uuid):
    """Return true if a device exists for the uuid"""
    # expect this format:
    # 1 UP ldlm ldlm ldlm_UUID 2
    # NOTE(review): the try/for/return scaffolding is missing from this
    # copy; the uuid is compared against column 4 of each device line.
    if config.lctl_dump():
    out = lctl.device_list()
    if uuid == string.split(s)[4]:
    except CommandError, e:

def is_network_prepared():
    """If the PTLRPC device exists, then assumet that all networking
    has been configured"""
    if config.lctl_dump():
    out = lctl.device_list()
    if 'RPCDEV_UUID' == string.split(s)[4]:
    except CommandError, e:

def fs_is_mounted(path):
    """Return true if path is a mounted lustre filesystem"""
    # Scan /proc/mounts for an entry of type lustre_lite at this path.
    fp = open('/proc/mounts')
    lines = fp.readlines()
    if a[1] == path and a[2] == 'lustre_lite':
# ============================================================
# Classes to prepare and cleanup the various objects
# NOTE(review): the "class Module" header and many statements are missing
# from this copy; code left byte-identical, indentation reconstructed.
""" Base class for the rest of the modules. The default cleanup method is
defined here, as well as some utilitiy funcs.
def __init__(self, module_name, db):
    self.module_name = module_name
    self.name = self.db.getName()
    self.uuid = self.db.getUUID()
    self.kmodule_list = []

def info(self, *args):
    # Print a one-line status message tagged with module/name/uuid.
    msg = string.join(map(str,args))
    print self.module_name + ":", self.name, self.uuid, msg

""" default cleanup, used for most modules """
# (cleanup() def/try lines missing) — detach the device, log failures.
lctl.cleanup(self.name, self.uuid)
except CommandError, e:
log(self.module_name, "cleanup failed: ", self.name)

def add_portals_module(self, dev_dir, modname):
    """Append a module to list of modules to load."""
    self.kmodule_list.append((config.portals_dir(), dev_dir, modname))

def add_lustre_module(self, dev_dir, modname):
    """Append a module to list of modules to load."""
    self.kmodule_list.append((config.lustre_dir(), dev_dir, modname))

def mod_loaded(self, modname):
    """Check if a module is already loaded. Look in /proc/modules for it."""
    fp = open('/proc/modules')
    lines = fp.readlines()
    # please forgive my tired fingers for this one
    ret = filter(lambda word, mod=modname: word == mod,
        map(lambda line: string.split(line)[0], lines))

def load_module(self):
    """Load all the modules in the list in the order they appear."""
    for src_dir, dev_dir, mod in self.kmodule_list:
        # (rc, out) = run ('/sbin/lsmod | grep -s', mod)
        if self.mod_loaded(mod) and not config.noexec():
        log ('loading module:', mod)
        # Source tree given: insmod the located object; otherwise modprobe.
        module = find_module(src_dir, dev_dir, mod)
        panic('module not found:', mod)
        (rc, out) = run('/sbin/insmod', module)
        raise CommandError('insmod', out, rc)
        (rc, out) = run('/sbin/modprobe', mod)
        raise CommandError('modprobe', out, rc)

def cleanup_module(self):
    """Unload the modules in the list in reverse order."""
    # NOTE(review): the line reversing "rev" is missing here; if it was
    # "rev.reverse()", it mutates self.kmodule_list in place (aliasing) —
    # worth confirming against a complete copy.
    rev = self.kmodule_list
    for src_dir, dev_dir, mod in rev:
        if not self.mod_loaded(mod):
        # Dump the debug log just before portals itself is unloaded.
        if mod == 'portals' and config.dump_file():
            lctl.dump(config.dump_file())
        log('unloading module:', mod)
        (rc, out) = run('/sbin/rmmod', mod)
        log('! unable to unload module:', mod)
class Network(Module):
    """Configures the portals network for one node.

    NOTE(review): several lines are missing from this copy (wildcard
    checks, prepare/cleanup method headers); code left byte-identical."""
    def __init__(self,db):
        Module.__init__(self, 'NETWORK', db)
        self.net_type = self.db.get_val('nettype')
        self.nid = self.db.get_val('nid', '*')
        self.port = self.db.get_val_int('port', 0)
        self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
        self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
        self.irq_affinity = self.db.get_val_int('irqaffinity', 0)
        self.nid_exchange = self.db.get_val_int('nidexchange', 0)
        # Resolve wildcard nids/hostaddrs to this host's actual address.
        self.nid = get_local_nid(self.net_type, self.nid)
        panic("unable to set nid for", self.net_type, self.nid)
        debug("nid:", self.nid)
        self.hostaddr = self.db.get_val('hostaddr', self.nid)
        if '*' in self.hostaddr:
            self.hostaddr = get_local_address(self.net_type, self.hostaddr)
            panic("unable to set nid for", self.net_type, self.hostaddr)
            debug("hostaddr:", self.hostaddr)
        # debug ( "hostaddr ", self.hostaddr, "net_type", self.net_type)

        # Queue kernel modules for the selected network type.
        # NOTE(review): dir strings are inconsistent ("linux/socknal" vs
        # "/linux/toenal" with a leading slash) — confirm against source.
        self.add_portals_module("linux/oslib", 'portals')
        if node_needs_router():
            self.add_portals_module("linux/router", 'kptlrouter')
        if self.net_type == 'tcp':
            self.add_portals_module("linux/socknal", 'ksocknal')
        if self.net_type == 'toe':
            self.add_portals_module("/linux/toenal", 'ktoenal')
        if self.net_type == 'elan':
            self.add_portals_module("/linux/rqswnal", 'kqswnal')
        if self.net_type == 'gm':
            self.add_portals_module("/linux/gmnal", 'kgmnal')
        self.add_lustre_module('obdclass', 'obdclass')

    # prepare() fragment: bring up the network unless already configured.
    if is_network_prepared():
    self.info(self.net_type, self.nid, self.port)
    lctl.network(self.net_type, self.nid)

    # cleanup() fragment: stop the acceptor and drop all connections.
    self.info(self.net_type, self.nid, self.port)
    if self.net_type in ('tcp', 'toe'):
        stop_acceptor(self.port)
    lctl.disconnectAll(self.net_type)
    except CommandError, e:
    print "disconnectAll failed: ", self.name
class Router(Module):
    """Sets up portals routes from the config's route table.

    NOTE(review): the prepare/cleanup method headers, connect calls, and
    try scaffolding are missing from this copy; code left byte-identical."""
    def __init__(self,db):
        Module.__init__(self, 'ROUTER', db)

    # prepare() fragment: add each route; for local tcp/toe single-host
    # routes, also connect to the gateway's server.
    if is_network_prepared():
    for net_type, gw, lo, hi in self.db.get_route_tbl():
        lctl.add_route(net_type, gw, lo, hi)
        if net_type in ('tcp', 'toe') and local_net_type(net_type) and hi == '':
            srvdb = self.db.nid2server(lo, net_type)
            panic("no server for nid", lo)
            srv = Network(srvdb)

    # cleanup() fragment: mirror of prepare — disconnect then delete routes.
    for net_type, gw, lo, hi in self.db.get_route_tbl():
        if net_type in ('tcp', 'toe') and local_net_type(net_type) and hi == '':
            srvdb = self.db.nid2server(lo, net_type)
            panic("no server for nid", lo)
            srv = Network(srvdb)
            lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
            except CommandError, e:
            print "disconnect failed: ", self.name
        lctl.del_route(net_type, gw, lo, hi)
        except CommandError, e:
        print "del_route failed: ", self.name
# NOTE(review): the "class LDLM(Module):" header and the prepare/cleanup
# method headers are missing from this copy; code left byte-identical.
def __init__(self,db):
    Module.__init__(self, 'LDLM', db)
    self.add_lustre_module('ldlm', 'ldlm')

# prepare() fragment: skip when the device already exists, else attach.
if is_prepared(self.uuid):
lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid))

# cleanup() fragment.
if is_prepared(self.uuid):
    Module.cleanup(self)

class PTLRPC(Module):
    """RPC device module; same prepare/cleanup shape as LDLM above."""
    def __init__(self,db):
        Module.__init__(self, 'PTLRPC', db)
        self.add_lustre_module('ptlrpc', 'ptlrpc')

    # prepare() fragment (method header missing in this copy).
    if is_prepared(self.uuid):
    lctl.newdev(attach="ptlrpc %s %s" % (self.name, self.uuid))

    # cleanup() fragment.
    if is_prepared(self.uuid):
        Module.cleanup(self)
# NOTE(review): the "class LOV(Module):" header and several lines (osclist
# initialization, try scaffolding, method headers) are missing from this
# copy; code left byte-identical.
def __init__(self,db):
    Module.__init__(self, 'LOV', db)
    self.add_lustre_module('mdc', 'mdc')
    self.add_lustre_module('lov', 'lov')
    self.mds_uuid = self.db.get_first_ref('mds')
    mds= self.db.lookup(self.mds_uuid)
    self.mds_name = mds.getName()
    self.stripe_sz = self.db.get_val_int('stripesize', 65536)
    self.stripe_off = self.db.get_val_int('stripeoffset', 0)
    self.pattern = self.db.get_val_int('stripepattern', 0)
    self.devlist = self.db.get_refs('obd')
    # Default stripe count covers every referenced OBD.
    self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
    # Build the OSC list for each target OBD.
    for obd_uuid in self.devlist:
        obd = self.db.lookup(obd_uuid)
        osc = get_osc(obd, self.name)
        self.osclist.append(osc)
        panic('osc not found:', obd_uuid)

# prepare() fragment: bring up each OSC (tolerating failures), the MDC,
# then the LOV device itself.
if is_prepared(self.uuid):
for osc in self.osclist:
    # Ignore connection failures, because the LOV will DTRT with
    # an unconnected OSC.
    osc.prepare(ignore_connect_failure=1)
    except CommandError:
    print "Error preparing OSC %s (inactive)\n" % osc.uuid
self.mdc_uuid = prepare_mdc(self.db, self.name, self.mds_uuid)
self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
          self.stripe_off, self.pattern, self.devlist, self.mds_name)
lctl.newdev(attach="lov %s %s" % (self.name, self.uuid),
            setup ="%s" % (self.mdc_uuid))

# cleanup() fragment: tear down in reverse — device, OSCs, MDC.
if is_prepared(self.uuid):
    Module.cleanup(self)
for osc in self.osclist:
cleanup_mdc(self.db, self.name, self.mds_uuid)

def load_module(self):
    # Load OSC modules first, then the LOV's own modules.
    for osc in self.osclist:
    Module.load_module(self)

def cleanup_module(self):
    # Unload own modules first, then each OSC's.
    Module.cleanup_module(self)
    for osc in self.osclist:
        osc.cleanup_module()

class LOVConfig(Module):
    """Writes LOV striping configuration to the MDS via lov_setconfig."""
    def __init__(self,db):
        Module.__init__(self, 'LOVConfig', db)
        self.lov_uuid = self.db.get_first_ref('lov')
        l = self.db.lookup(self.lov_uuid)

    # prepare() fragment (method header and "lov = ..." binding missing).
    self.info(lov.mds_uuid, lov.stripe_cnt, lov.stripe_sz, lov.stripe_off,
              lov.pattern, lov.devlist, lov.mds_name)
    lctl.lov_setconfig(lov.uuid, lov.mds_name, lov.stripe_cnt,
                       lov.stripe_sz, lov.stripe_off, lov.pattern,
                       string.join(lov.devlist))
class MDSDEV(Module):
    """Prepares the MDS device: backing store, MDT/MDS attach, LOV configs.

    NOTE(review): method headers (prepare/cleanup) and several lines are
    missing from this copy; code left byte-identical."""
    def __init__(self,db):
        Module.__init__(self, 'MDSDEV', db)
        self.devpath = self.db.get_val('devpath','')
        self.size = self.db.get_val_int('devsize', 0)
        self.fstype = self.db.get_val('fstype', '')
        # overwrite the orignal MDSDEV name and uuid with the MDS name and uuid
        target_uuid = self.db.get_first_ref('target')
        mds = self.db.lookup(target_uuid)
        self.name = mds.getName()
        self.lovconfig_uuids = mds.get_refs('lovconfig')
        # FIXME: if fstype not set, then determine based on kernel version
        self.format = self.db.get_val('autoformat', "no")
        # Only the active target on this node is actually set up.
        active_uuid = mds.get_active_target()
        panic("No target device found:", target_uuid)
        if active_uuid == self.uuid:
        self.target_dev_uuid = self.uuid
        self.uuid = target_uuid
        if self.fstype == 'extN':
            self.add_lustre_module('extN', 'extN')
        self.add_lustre_module('mds', 'mds')
        self.add_lustre_module('obdclass', 'fsfilt_%s' % (self.fstype))

    def load_module(self):
        Module.load_module(self)

    # prepare() fragment: skip when inactive/already up, otherwise build
    # the backing device and attach the shared MDT plus this MDS.
    if is_prepared(self.uuid):
    debug(self.uuid, "not active")
    self.info(self.devpath, self.fstype, self.format)
    blkdev = block_dev(self.devpath, self.size, self.fstype, self.format)
    if not is_prepared('MDT_UUID'):
        lctl.newdev(attach="mdt %s %s" % ('MDT', 'MDT_UUID'),
    lctl.newdev(attach="mds %s %s" % (self.name, self.uuid),
                setup ="%s %s" %(blkdev, self.fstype))
    for uuid in self.lovconfig_uuids:
        db = self.db.lookup(uuid)
        lovconfig = LOVConfig(db)

    # cleanup() fragment: tear down the shared MDT, the MDS, and the loop
    # device backing it.
    if is_prepared('MDT_UUID'):
    lctl.cleanup("MDT", "MDT_UUID")
    except CommandError, e:
    print "cleanup failed: ", self.name
    if is_prepared(self.uuid):
        Module.cleanup(self)
    clean_loop(self.devpath)
# NOTE(review): the "class OSD(Module):" header and several method headers
# are missing from this copy; code left byte-identical.
def __init__(self, db):
    Module.__init__(self, 'OSD', db)
    self.osdtype = self.db.get_val('osdtype')
    self.devpath = self.db.get_val('devpath', '')
    self.size = self.db.get_val_int('devsize', 0)
    self.fstype = self.db.get_val('fstype', '')
    target_uuid = self.db.get_first_ref('target')
    ost = self.db.lookup(target_uuid)
    self.name = ost.getName()
    # FIXME: if fstype not set, then determine based on kernel version
    self.format = self.db.get_val('autoformat', 'yes')
    if self.fstype == 'extN':
        self.add_lustre_module('extN', 'extN')
    # Only the active target on this node is set up.
    active_uuid = ost.get_active_target()
    panic("No target device found:", target_uuid)
    if active_uuid == self.uuid:
    self.target_dev_uuid = self.uuid
    self.uuid = target_uuid
    self.add_lustre_module('ost', 'ost')
    self.add_lustre_module(self.osdtype, self.osdtype)
    self.add_lustre_module('obdclass' , 'fsfilt_%s' % (self.fstype))

def load_module(self):
    Module.load_module(self)

# prepare() fragment:
# need to check /proc/mounts and /etc/mtab before
# formatting anything.
# FIXME: check if device is already formatted.
if is_prepared(self.uuid):
debug(self.uuid, "not active")
self.info(self.osdtype, self.devpath, self.size, self.fstype, self.format)
# obdecho needs no backing store; everything else gets a block device.
if self.osdtype == 'obdecho':
blkdev = block_dev(self.devpath, self.size, self.fstype, self.format)
lctl.newdev(attach="%s %s %s" % (self.osdtype, self.name, self.uuid),
            setup ="%s %s" %(blkdev, self.fstype))
if not is_prepared('OSS_UUID'):
    lctl.newdev(attach="ost %s %s" % ('OSS', 'OSS_UUID'),

# cleanup() fragment: shared OSS first, then this OSD and its loop device.
if is_prepared('OSS_UUID'):
lctl.cleanup("OSS", "OSS_UUID")
except CommandError, e:
print "cleanup failed: ", self.name
if is_prepared(self.uuid):
    Module.cleanup(self)
if not self.osdtype == 'obdecho':
    clean_loop(self.devpath)
# Generic client module, used by OSC and MDC
class Client(Module):
    """Client-side device (osc/mdc) pointing at a server target.

    NOTE(review): this class does not call Module.__init__; several lines
    (self.db binding, try scaffolding, method headers) are missing from
    this copy; code left byte-identical."""
    def __init__(self, tgtdb, module, owner):
        self.target_name = tgtdb.getName()
        self.target_uuid = tgtdb.getUUID()
        self.tgt_dev_uuid = tgtdb.get_active_target()
        if not self.tgt_dev_uuid:
            panic("No target device found for target:", self.target_name)
        self.kmodule_list = []
        self.module = module
        self.module_name = string.upper(module)
        # Unique-ish name/uuid per owner+target; uuid built from four
        # random 20-bit hex fields around a truncated name, capped at 36.
        self.name = '%s_%s_%s' % (self.module_name, owner, self.target_name)
        self.uuid = '%05x%05x_%.14s_%05x%05x' % (int(random.random() * 1048576),
                                                 int(random.random() * 1048576),self.name,
                                                 int(random.random() * 1048576),
                                                 int(random.random() * 1048576))
        self.uuid = self.uuid[0:36]
        self.lookup_server(self.tgt_dev_uuid)
        self.add_lustre_module(module, module)

    def lookup_server(self, srv_uuid):
        """ Lookup a server's network information """
        self._server_nets = self.db.get_ost_net(srv_uuid)
        if len(self._server_nets) == 0:
            panic ("Unable to find a server for:", srv_uuid)

    def get_servers(self):
        return self._server_nets

    def prepare(self, ignore_connect_failure = 0):
        # Connect via a local network if possible, otherwise add a route;
        # then attach the client device pointed at the target.
        if is_prepared(self.uuid):
        self.info(self.target_uuid)
        srv = local_net(self.get_servers())
        srv, r = find_route(self.get_servers())
        lctl.add_route_host(r[0], srv.uuid, r[1], r[2])
        panic ("no route to", self.target_uuid)
        except CommandError:
        if (ignore_connect_failure == 0):
        lctl.newdev(attach="%s %s %s" % (self.module, self.name, self.uuid),
                    setup ="%s %s" %(self.target_uuid, srv.uuid))

    # cleanup() fragment: detach, then disconnect or remove the route.
    Module.cleanup(self)
    srv = local_net(self.get_servers())
    lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
    except CommandError, e:
    log(self.module_name, "disconnect failed: ", self.name)
    self.info(self.target_uuid)
    srv, r = find_route(self.get_servers())
    lctl.del_route_host(r[0], srv.uuid, r[1], r[2])
    except CommandError, e:
    print "del_route failed: ", self.name

# NOTE(review): "class MDC(Client):" / "class OSC(Client):" headers are
# missing; these are their constructors.
def __init__(self, db, owner):
    Client.__init__(self, db, 'mdc', owner)

def __init__(self, db, owner):
    Client.__init__(self, db, 'osc', owner)
# NOTE(review): the "class COBD(Module):" header and the prepare() method
# header are missing from this copy; code left byte-identical.
def __init__(self, db):
    Module.__init__(self, 'COBD', db)
    self.real_uuid = self.db.get_first_ref('realobd')
    self.cache_uuid = self.db.get_first_ref('cacheobd')
    self.add_lustre_module('cobd' , 'cobd')

# prepare() fragment: attach the caching OBD over its real/cache pair.
# need to check /proc/mounts and /etc/mtab before
# formatting anything.
# FIXME: check if device is already formatted.
if is_prepared(self.uuid):
self.info(self.real_uuid, self.cache_uuid)
lctl.newdev(attach="cobd %s %s" % (self.name, self.uuid),
            setup ="%s %s" %(self.real_uuid, self.cache_uuid))
# virtual interface for OSC and LOV
# NOTE(review): the "class VOSC(Module):" header and several method headers
# (get_uuid, need_mdc branches) are missing from this copy.
def __init__(self,db, owner):
    Module.__init__(self, 'VOSC', db)
    # Wrap either a LOV or a plain OSC depending on the config class.
    if db.get_class() == 'lov':
    self.osc = get_osc(db, owner)

# get_uuid() fragment: delegate to the wrapped osc.
return self.osc.uuid

def load_module(self):
    self.osc.load_module()

def cleanup_module(self):
    self.osc.cleanup_module()

# need_mdc() fragment: a LOV manages its own MDC; plain OSCs need one.
return self.db.get_class() != 'lov'

def get_mdc_uuid(self):
    if self.db.get_class() == 'lov':
        return self.osc.mdc_uuid
1441 class ECHO_CLIENT(Module):
1442 def __init__(self,db):
1443 Module.__init__(self, 'ECHO_CLIENT', db)
1444 self.add_lustre_module('obdecho', 'obdecho')
1445 self.obd_uuid = self.db.get_first_ref('obd')
1446 obd = self.db.lookup(self.obd_uuid)
1447 self.osc = VOSC(obd, self.name)
1450 if is_prepared(self.uuid):
1452 self.osc.prepare() # XXX This is so cheating. -p
1453 self.info(self.obd_uuid)
1455 lctl.newdev(attach="echo_client %s %s" % (self.name, self.uuid),
1456 setup = self.osc.get_uuid())
1459 if is_prepared(self.uuid):
1460 Module.cleanup(self)
1463 def load_module(self):
1464 self.osc.load_module()
1465 Module.load_module(self)
1466 def cleanup_module(self):
1467 Module.cleanup_module(self)
1468 self.osc.cleanup_module()
1471 class Mountpoint(Module):
1472 def __init__(self,db):
1473 Module.__init__(self, 'MTPT', db)
1474 self.path = self.db.get_val('path')
1475 self.mds_uuid = self.db.get_first_ref('mds')
1476 self.obd_uuid = self.db.get_first_ref('obd')
1477 obd = self.db.lookup(self.obd_uuid)
1478 self.vosc = VOSC(obd, self.name)
1479 if self.vosc.need_mdc():
1480 self.add_lustre_module('mdc', 'mdc')
1481 self.add_lustre_module('llite', 'llite')
1486 if self.vosc.need_mdc():
1487 mdc_uuid = prepare_mdc(self.db, self.name, self.mds_uuid)
1489 mdc_uuid = self.vosc.get_mdc_uuid()
1491 panic("Unable to determine MDC UUID. Probably need to cleanup before re-mounting.")
1492 self.info(self.path, self.mds_uuid, self.obd_uuid)
1493 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \
1494 (self.vosc.get_uuid(), mdc_uuid, self.path)
1495 run("mkdir", self.path)
1498 panic("mount failed:", self.path)
1501 self.info(self.path, self.mds_uuid,self.obd_uuid)
1502 if fs_is_mounted(self.path):
1504 (rc, out) = run("umount", "-f", self.path)
1506 (rc, out) = run("umount", self.path)
1508 raise CommandError('umount', out, rc)
1510 if fs_is_mounted(self.path):
1511 panic("fs is still mounted:", self.path)
1514 if self.vosc.need_mdc():
1515 cleanup_mdc(self.db, self.name, self.mds_uuid)
1517 def load_module(self):
1518 self.vosc.load_module()
1519 Module.load_module(self)
1520 def cleanup_module(self):
1521 Module.cleanup_module(self)
1522 self.vosc.cleanup_module()
1525 # ============================================================
1526 # XML processing and query
def lookup(self, uuid):
    """Return a new LustreDB instance for the config node with this uuid."""
    return self._lookup_by_uuid(uuid)
def lookup_name(self, name, class_name = ""):
    """Return a new LustreDB instance for the node matching name
    (optionally restricted to nodes of class_name)."""
    return self._lookup_by_name(name, class_name)
def lookup_class(self, class_name):
    """Return LustreDB instances for every node of the given class."""
    return self._lookup_by_class(class_name)
1541 def get_val(self, tag, default=None):
1542 v = self._get_val(tag)
1547 debug("LustreDB", self.getName(), " no value for:", tag)
1550 def get_class(self):
1551 return self._get_class()
1553 def get_val_int(self, tag, default=0):
1554 str = self._get_val(tag)
1560 panic("text value is not integer:", str)
1562 def get_first_ref(self, tag):
1563 """ Get the first uuidref of the type TAG. Only
1564 one is expected. Returns the uuid."""
1565 uuids = self._get_refs(tag)
1570 def get_refs(self, tag):
1571 """ Get all the refs of type TAG. Returns list of uuids. """
1572 uuids = self._get_refs(tag)
1575 def get_all_refs(self):
1576 """ Get all the refs. Returns list of uuids. """
1577 uuids = self._get_all_refs()
1580 def get_ost_net(self, osd_uuid):
1584 osd = self.lookup(osd_uuid)
1585 node_uuid = osd.get_first_ref('node')
1586 node = self.lookup(node_uuid)
1588 panic("unable to find node for osd_uuid:", osd_uuid,
1589 " node_ref:", node_uuid)
1590 for net_uuid in node.get_networks():
1591 db = node.lookup(net_uuid)
1592 srv_list.append(Network(db))
1595 def nid2server(self, nid, net_type):
1596 netlist = self.lookup_class('network')
1597 for net_db in netlist:
1598 if net_db.get_val('nid') == nid and net_db.get_val('nettype') == net_type:
1602 # the tag name is the service type
1603 # fixme: this should do some checks to make sure the dom_node is a service
1605 # determine what "level" a particular node is at.
1607 # the order of iniitailization is based on level.
1608 def getServiceLevel(self):
1609 type = self.get_class()
1611 if type in ('network',):
1613 elif type in ('routetbl',):
1615 elif type in ('ptlrpc',):
1617 elif type in ('device', 'ldlm'):
1619 elif type in ('osd', 'mdd', 'cobd'):
1621 elif type in ('mdsdev','ost'):
1623 elif type in ('mdc','osc'):
1625 elif type in ('lov',):
1627 elif type in ('mountpoint', 'echoclient'):
1630 if ret < config.minlevel() or ret > config.maxlevel():
1635 # return list of services in a profile. list is a list of tuples
1636 # [(level, db_object),]
1637 def getServices(self):
1639 for ref_class, ref_uuid in self.get_all_refs():
1640 servdb = self.lookup(ref_uuid)
1642 level = servdb.getServiceLevel()
1644 list.append((level, servdb))
1646 panic('service not found: ' + ref_uuid)
1651 # Find the target_device for target on a node
1652 # node->profiles->device_refs->target
1653 def get_target_device(self, target_uuid, node_name):
1654 node_db = self.lookup_name(node_name)
1657 prof_list = node_db.get_refs('profile')
1658 for prof_uuid in prof_list:
1659 prof_db = node_db.lookup(prof_uuid)
1660 ref_list = prof_db.get_all_refs()
1661 for ref in ref_list:
1662 dev = self.lookup(ref[1])
1663 if dev and dev.get_first_ref('target') == target_uuid:
1667 def get_active_target(self):
1668 target_uuid = self.getUUID()
1669 target_name = self.getName()
1670 node_name = config.select(target_name)
1672 tgt_dev_uuid = self.get_target_device(target_uuid, node_name)
1674 tgt_dev_uuid = self.get_first_ref('active')
1678 # get all network uuids for this node
1679 def get_networks(self):
1681 prof_list = self.get_refs('profile')
1682 for prof_uuid in prof_list:
1683 prof_db = self.lookup(prof_uuid)
1684 net_list = prof_db.get_refs('network')
1685 #debug("get_networks():", prof_uuid, net_list)
1686 for net_uuid in net_list:
1687 ret.append(net_uuid)
1690 class LustreDB_XML(LustreDB):
1691 def __init__(self, dom, root_node):
1694 self.root_node = root_node
1696 def xmltext(self, dom_node, tag):
1697 list = dom_node.getElementsByTagName(tag)
1700 dom_node.normalize()
1701 if dom_node.firstChild:
1702 txt = string.strip(dom_node.firstChild.data)
def xmlattr(self, dom_node, attr):
    # minidom returns "" (not None) when the attribute is absent.
    return dom_node.getAttribute(attr)
1709 def _get_val(self, tag):
1710 """a value could be an attribute of the current node
1711 or the text value in a child node"""
1712 ret = self.xmlattr(self.dom_node, tag)
1714 ret = self.xmltext(self.dom_node, tag)
def _get_class(self):
    # The XML element's tag name doubles as the config object's class.
    return self.dom_node.nodeName
1721 # [(ref_class, ref_uuid),]
1722 def _get_all_refs(self):
1724 for n in self.dom_node.childNodes:
1725 if n.nodeType == n.ELEMENT_NODE:
1726 ref_uuid = self.xml_get_ref(n)
1727 ref_class = n.nodeName
1728 list.append((ref_class, ref_uuid))
1733 def _get_refs(self, tag):
1734 """ Get all the refs of type TAG. Returns list of uuids. """
1736 refname = '%s_ref' % tag
1737 reflist = self.dom_node.getElementsByTagName(refname)
1739 uuids.append(self.xml_get_ref(r))
1742 def xmllookup_by_uuid(self, dom_node, uuid):
1743 for n in dom_node.childNodes:
1744 if n.nodeType == n.ELEMENT_NODE:
1745 if self.xml_get_uuid(n) == uuid:
1748 n = self.xmllookup_by_uuid(n, uuid)
1752 def _lookup_by_uuid(self, uuid):
1753 dom = self. xmllookup_by_uuid(self.root_node, uuid)
1755 return LustreDB_XML(dom, self.root_node)
1757 def xmllookup_by_name(self, dom_node, name):
1758 for n in dom_node.childNodes:
1759 if n.nodeType == n.ELEMENT_NODE:
1760 if self.xml_get_name(n) == name:
1763 n = self.xmllookup_by_name(n, name)
1767 def _lookup_by_name(self, name, class_name):
1768 dom = self.xmllookup_by_name(self.root_node, name)
1770 return LustreDB_XML(dom, self.root_node)
def xmllookup_by_class(self, dom_node, class_name):
    # All descendant elements whose tag matches class_name.
    return dom_node.getElementsByTagName(class_name)
1775 def _lookup_by_class(self, class_name):
1777 domlist = self.xmllookup_by_class(self.root_node, class_name)
1778 for node in domlist:
1779 ret.append(LustreDB_XML(node, self.root_node))
def xml_get_name(self, n):
    # The 'name' attribute of an arbitrary config element.
    return n.getAttribute('name')
1786 return self.xml_get_name(self.dom_node)
def xml_get_ref(self, n):
    # The 'uuidref' attribute: a reference to another config node's uuid.
    return n.getAttribute('uuidref')
def xml_get_uuid(self, dom_node):
    # The 'uuid' attribute identifying this config node.
    return dom_node.getAttribute('uuid')
1795 return self.xml_get_uuid(self.dom_node)
1797 def get_routes(self, type, gw):
1798 """ Return the routes as a list of tuples of the form:
1799 [(type, gw, lo, hi),]"""
1801 tbl = self.dom_node.getElementsByTagName('routetbl')
1803 routes = t.getElementsByTagName('route')
1805 net_type = self.xmlattr(r, 'type')
1806 if type != net_type:
1807 lo = self.xmlattr(r, 'lo')
1808 hi = self.xmlattr(r, 'hi')
1809 res.append((type, gw, lo, hi))
1812 def get_route_tbl(self):
1814 for r in self.dom_node.getElementsByTagName('route'):
1815 net_type = self.xmlattr(r, 'type')
1816 gw = self.xmlattr(r, 'gw')
1817 lo = self.xmlattr(r, 'lo')
1818 hi = self.xmlattr(r, 'hi')
1819 ret.append((net_type, gw, lo, hi))
1823 # ================================================================
1825 class LustreDB_LDAP(LustreDB):
1826 def __init__(self, name, attrs,
1829 url = "ldap://localhost",
1830 user = "cn=Manager, fs=lustre",
1836 self._parent = parent
1842 self._base = parent._base
1849 self.l = ldap.initialize(self._url)
1850 # Set LDAP protocol version used
1851 self.l.protocol_version=ldap.VERSION3
1852 # user and pw only needed if modifying db
1853 self.l.bind_s("", "", ldap.AUTH_SIMPLE);
1854 except ldap.LDAPError, e:
1856 # FIXME, do something useful here
1861 def ldap_search(self, filter):
1862 """Return list of uuids matching the filter."""
1868 for name, attrs in self.l.search_s(dn, ldap.SCOPE_ONELEVEL,
1870 for v in attrs['uuid']:
1872 except ldap.NO_SUCH_OBJECT, e:
1874 except ldap.LDAPError, e:
1875 print e # FIXME: die here?
1878 ret.append(self._lookup_by_uuid(uuid))
1881 def _lookup_by_name(self, name, class_name):
1882 list = self.ldap_search("lustreName=%s" %(name))
1887 def _lookup_by_class(self, class_name):
1888 return self.ldap_search("objectclass=%s" %(string.upper(class_name)))
1890 def _lookup_by_uuid(self, uuid):
1892 dn = "uuid=%s,%s" % (uuid, self._base)
1895 for name, attrs in self.l.search_s(dn, ldap.SCOPE_BASE,
1897 ret = LustreDB_LDAP(name, attrs, parent = self)
1899 except ldap.NO_SUCH_OBJECT, e:
1900 debug("NO_SUCH_OBJECT:", uuid)
1901 pass # just return empty list
1902 except ldap.LDAPError, e:
1903 print e # FIXME: die here?
1907 def _get_val(self, k):
1909 if self._attrs.has_key(k):
1911 if type(v) == types.ListType:
1917 def _get_class(self):
1918 return string.lower(self._attrs['objectClass'][0])
1921 # [(ref_class, ref_uuid),]
1922 def _get_all_refs(self):
1924 for k in self._attrs.keys():
1925 if re.search('.*Ref', k):
1926 for uuid in self._attrs[k]:
1927 list.append((k, uuid))
1930 def _get_refs(self, tag):
1931 """ Get all the refs of type TAG. Returns list of uuids. """
1933 refname = '%sRef' % tag
1934 if self._attrs.has_key(refname):
1935 return self._attrs[refname]
1939 return self._get_val('lustreName')
1942 return self._get_val('uuid')
1944 def get_route_tbl(self):
1947 ############################################################
1949 # FIXME: clean this mess up!
1951 # OSC is no longer in the xml, so we have to fake it.
1952 # this is getting ugly and begging for another refactoring
def get_osc(ost_db, owner):
    """Synthesize an OSC client object for the given OST config node.

    OSC entries are no longer present in the xml, so one is faked from
    the OST's own config db (see the FIXME above).
    """
    osc = OSC(ost_db, owner)
    # bug fix: the constructed osc was never returned, so callers got None
    return osc
1957 def get_mdc(db, owner, mds_uuid):
1958 mds_db = db.lookup(mds_uuid);
1960 panic("no mds:", mds_uuid)
1961 mdc = MDC(mds_db, owner)
1964 def prepare_mdc(db, owner, mds_uuid):
1965 mdc = get_mdc(db, owner, mds_uuid)
1969 def cleanup_mdc(db, owner, mds_uuid):
1970 mdc = get_mdc(db, owner, mds_uuid)
1974 ############################################################
1975 # routing ("rooting")
1981 def add_local_interfaces(node_db):
1983 for netuuid in node_db.get_networks():
1984 net = node_db.lookup(netuuid)
1986 debug("add_local", netuuid)
1987 local_node.append((srv.net_type, srv.nid))
1988 if acceptors.has_key(srv.port):
1989 panic("duplicate port:", srv.port)
1990 if srv.net_type in ('tcp', 'toe'):
1991 acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
1992 srv.send_mem, srv.recv_mem,
1996 def node_needs_router():
1999 def init_route_config(lustre):
2000 """ Scan the lustre config looking for routers. Build list of
2002 global routes, router_flag
2004 list = lustre.lookup_class('node')
2005 for node_db in list:
2006 if node_db.get_val_int('router', 0):
2008 #debug("init_route_config: found router", node_db.getName())
2009 for (local_type, local_nid) in local_node:
2010 #debug("init_route_config:", local_type, local_nid)
2012 for netuuid in node_db.get_networks():
2013 db = node_db.lookup(netuuid)
2014 if local_type == db.get_val('nettype'):
2015 gw = db.get_val('nid')
2017 #debug("init_route_config: gw is", gw)
2020 for route in node_db.get_routes(local_type, gw):
2021 routes.append(route)
2022 debug("init_route_config routes:", routes)
2025 def local_net(srv_list):
2027 for iface in local_node:
2028 for srv in srv_list:
2029 #debug("local_net a:", srv.net_type, "b:", iface[0])
2030 if srv.net_type == iface[0]:
2034 def local_net_type(net_type):
2036 for iface in local_node:
2037 if net_type == iface[0]:
2041 def find_route(srv_list):
2042 global local_node, routes
2043 frm_type = local_node[0][0]
2044 for srv in srv_list:
2045 #debug("find_route: srv:", srv.hostaddr, "type: ", srv.net_type)
2046 to_type = srv.net_type
2048 #debug ('looking for route to', to_type, to)
2050 #debug("find_route: ", r)
2056 ############################################################
2060 type = db.get_class()
2061 debug('Service:', type, db.getName(), db.getUUID())
2065 elif type == 'ptlrpc':
2069 elif type == 'network':
2071 elif type == 'routetbl':
2075 elif type == 'cobd':
2077 elif type == 'mdsdev':
2079 elif type == 'mountpoint':
2081 elif type == 'echoclient':
2084 panic ("unknown service type:", type)
2088 # Prepare the system to run lustre using a particular profile
2089 # in a the configuration.
2090 # * load & the modules
2091 # * setup networking for the current node
2092 # * make sure partitions are in place and prepared
2093 # * initialize devices with lctl
2094 # Levels is important, and needs to be enforced.
2095 def for_each_profile(db, prof_list, operation):
2096 for prof_uuid in prof_list:
2097 prof_db = db.lookup(prof_uuid)
2099 panic("profile:", profile, "not found.")
2100 services = prof_db.getServices()
2103 def doSetup(services):
2104 if config.nosetup():
2107 n = newService(s[1])
2110 def doModules(services):
2114 n = newService(s[1])
2117 def doCleanup(services):
2118 if config.nosetup():
2122 n = newService(s[1])
2125 def doUnloadModules(services):
2130 n = newService(s[1])
2135 def doHost(lustreDB, hosts):
2140 node_db = lustreDB.lookup_name(h, 'node')
2144 print 'No host entry found.'
2147 router_flag = node_db.get_val_int('router', 0)
2148 recovery_upcall = node_db.get_val('recovery_upcall', '')
2149 timeout = node_db.get_val_int('timeout', 0)
2151 add_local_interfaces(node_db)
2153 init_route_config(lustreDB)
2155 # Two step process: (1) load modules, (2) setup lustre
2156 # if not cleaning, load modules first.
2157 prof_list = node_db.get_refs('profile')
2159 if config.cleanup():
2161 # the command line can override this value
2163 # ugly hack, only need to run lctl commands for --dump
2164 if config.lctl_dump():
2165 for_each_profile(node_db, prof_list, doCleanup)
2168 sys_set_timeout(timeout)
2169 sys_set_recovery_upcall(recovery_upcall)
2171 for_each_profile(node_db, prof_list, doCleanup)
2172 for_each_profile(node_db, prof_list, doUnloadModules)
2175 # ugly hack, only need to run lctl commands for --dump
2176 if config.lctl_dump():
2177 for_each_profile(node_db, prof_list, doSetup)
2180 for_each_profile(node_db, prof_list, doModules)
2182 sys_set_debug_path()
2183 script = config.gdb_script()
2184 run(lctl.lctl, ' modules >', script)
2186 log ("The GDB module script is in", script)
2187 # pause, so user has time to break and
2190 sys_set_timeout(timeout)
2191 sys_set_recovery_upcall(recovery_upcall)
2193 for_each_profile(node_db, prof_list, doSetup)
2195 ############################################################
2196 # Command line processing
2198 def parse_cmdline(argv):
2199 short_opts = "hdnvf"
2200 long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb",
2201 "portals=", "makeldiff", "cleanup", "noexec",
2202 "help", "node=", "nomod", "nosetup",
2203 "dump=", "force", "minlevel=", "maxlevel=",
2204 "timeout=", "recovery_upcall=",
2205 "ldapurl=", "config=", "select=", "lctl_dump="]
2210 opts, args = getopt.getopt(argv, short_opts, long_opts)
2211 except getopt.error:
2216 if o in ("-h", "--help"):
2218 if o in ("-d","--cleanup"):
2220 if o in ("-v", "--verbose"):
2222 if o in ("-n", "--noexec"):
2224 if o == "--portals":
2225 config.portals_dir(a)
2227 config.lustre_dir(a)
2228 if o == "--reformat":
2236 if o == "--nosetup":
2240 if o in ("-f", "--force"):
2242 if o == "--minlevel":
2244 if o == "--maxlevel":
2246 if o == "--timeout":
2248 if o == "--recovery_upcall":
2249 config.recovery_upcall(a)
2250 if o == "--ldapurl":
2253 config.config_name(a)
2255 config.init_select(a)
2256 if o == "--lctl_dump":
2265 s = urllib.urlopen(url)
2271 def setupModulePath(cmd, portals_dir = PORTALS_DIR):
2272 base = os.path.dirname(cmd)
2273 if os.access(base+"/Makefile", os.R_OK):
2274 if not config.lustre_dir():
2275 config.lustre_dir(os.path.join(base, ".."))
2276 # normalize the portals dir, using command line arg if set
2277 if config.portals_dir():
2278 portals_dir = config.portals_dir()
2279 dir = os.path.join(config.lustre_dir(), portals_dir)
2280 config.portals_dir(dir)
2281 elif config.lustre_dir() and config.portals_dir():
2283 # if --lustre and --portals, normalize portals
2284 # can ignore POTRALS_DIR here, since it is probly useless here
2285 dir = config.portals_dir()
2286 dir = os.path.join(config.lustre_dir(), dir)
2287 config.portals_dir(dir)
2289 def sysctl(path, val):
2293 fp = open(os.path.join('/proc/sys', path), 'w')
2300 def sys_set_debug_path():
2301 debug("debug path: ", config.debug_path())
2302 sysctl('portals/debug_path', config.debug_path())
def sys_set_recovery_upcall(upcall):
    """Set /proc/sys/lustre/recovery_upcall.

    The --recovery_upcall command line option overrides the value from
    the node config.  Nothing is written when no upcall is configured.
    """
    # the command overrides the value in the node config
    if config.recovery_upcall():
        upcall = config.recovery_upcall()
    if upcall:
        # guard: avoid writing an empty value into the sysctl
        debug("setting recovery_upcall:", upcall)
        sysctl('lustre/recovery_upcall', upcall)
def sys_set_timeout(timeout):
    """Set /proc/sys/lustre/timeout.

    The --timeout command line option overrides the value from the node
    config.  Nothing is written unless a positive timeout is configured.
    """
    # the command overrides the value in the node config
    if config.timeout() > 0:
        timeout = config.timeout()
    if timeout > 0:
        # guard: avoid writing a zero/unset timeout into the sysctl
        debug("setting timeout:", timeout)
        sysctl('lustre/timeout', timeout)
def sys_set_ptldebug(ptldebug):
    """Write the portals debug mask, letting the command line value
    override the one taken from the node config."""
    override = config.ptldebug()
    if override:
        ptldebug = override
    sysctl('portals/debug', ptldebug)
2326 def sys_set_netmem_max(path, max):
2327 debug("setting", path, "to at least", max)
2335 fp = open(path, 'w')
2336 fp.write('%d\n' %(max))
def sys_make_devices():
    """Create the portals and obd character device nodes if absent."""
    for dev, mknod_cmd in (('/dev/portals', 'mknod /dev/portals c 10 240'),
                           ('/dev/obd', 'mknod /dev/obd c 10 241')):
        if not os.access(dev, os.R_OK):
            run(mknod_cmd)
2347 # Add dir to the global PATH, if not already there.
def add_to_path(new_dir):
    """Append new_dir to the process $PATH unless it is already there."""
    syspath = os.environ['PATH'].split(':')
    if new_dir in syspath:
        # bug fix: the membership test previously had no effect, so the
        # directory was appended again on every call
        return
    os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
2355 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
2356 # ensure basic elements are in the system path
2357 def sanitise_path():
2358 for dir in DEFAULT_PATH:
2361 # Initialize or shutdown lustre according to a configuration file
2362 # * prepare the system for lustre
2363 # * configure devices with lctl
2364 # Shutdown does steps in reverse
2367 global lctl, MAXTCPBUF
2369 host = socket.gethostname()
2371 # the PRNG is normally seeded with time(), which is not so good for starting
2372 # time-synchronized clusters
2373 input = open('/dev/urandom', 'r')
2375 print 'Unable to open /dev/urandom!'
2377 seed = input.read(32)
2383 args = parse_cmdline(sys.argv[1:])
2385 if not os.access(args[0], os.R_OK):
2386 print 'File not found or readable:', args[0]
2389 dom = xml.dom.minidom.parse(args[0])
2391 panic("%s does not appear to be a config file." % (args[0]))
2392 sys.exit(1) # make sure to die here, even in debug mode.
2393 db = LustreDB_XML(dom.documentElement, dom.documentElement)
2394 elif config.ldapurl():
2395 if not config.config_name():
2396 panic("--ldapurl requires --config name")
2397 dn = "config=%s,fs=lustre" % (config.config_name())
2398 db = LustreDB_LDAP('', {}, base=dn, url = config.ldapurl())
2404 node_list.append(config.node())
2407 node_list.append(host)
2408 node_list.append('localhost')
2409 debug("configuring for host: ", node_list)
2412 config._debug_path = config._debug_path + '-' + host
2413 config._gdb_script = config._gdb_script + '-' + host
2415 setupModulePath(sys.argv[0])
2417 lctl = LCTLInterface('lctl')
2418 if config.lctl_dump():
2419 lctl.use_save_file(config.lctl_dump())
2422 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
2423 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
2425 doHost(db, node_list)
2427 if __name__ == "__main__":
2430 except LconfError, e:
2432 except CommandError, e:
2436 if first_cleanup_error:
2437 sys.exit(first_cleanup_error)