3 # Copyright (C) 2002 Cluster File Systems, Inc.
4 # Author: Robert Read <rread@clusterfs.com>
5 # This file is part of Lustre, http://www.lustre.org.
7 # Lustre is free software; you can redistribute it and/or
8 # modify it under the terms of version 2 of the GNU General Public
9 # License as published by the Free Software Foundation.
11 # Lustre is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with Lustre; if not, write to the Free Software
18 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 # lconf - lustre configuration tool
22 # lconf is the main driver script for starting and stopping
23 # lustre filesystem services.
25 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import string, os, stat, popen2, socket, time, random
30 import xml.dom.minidom
35 DEFAULT_TCPBUF = 1048576
37 # Maximum number of devices to search for.
38 # (the /dev/loop* nodes need to be created beforehand)
39 MAX_LOOP_DEVICES = 256
# Remember the first non-zero cleanup failure so the overall
# exit status can report it.
first_cleanup_error = 0

def cleanup_error(rc):
    """Record rc as the cleanup status, keeping only the first error seen."""
    global first_cleanup_error
    if first_cleanup_error:
        return
    first_cleanup_error = rc
49 print """usage: lconf config.xml
51 config.xml Lustre configuration in xml format.
52 --get <url> URL to fetch a config file
53 --node <nodename> Load config for <nodename>
54 -d | --cleanup Cleans up config. (Shutdown)
55 -f | --force Forced unmounting and/or obd detach during cleanup
56 -v | --verbose Print system commands as they are run
57 -h | --help Print this help
58 --gdb Prints message after creating gdb module script
59 and sleeps for 5 seconds.
60 -n | --noexec Prints the commands and steps that will be run for a
61 config without executing them. This can used to check if a
62 config file is doing what it should be doing. (Implies -v)
63 --nomod Skip load/unload module step.
64 --nosetup Skip device setup/cleanup step.
65 --reformat Reformat all devices (without question)
66 --dump <file> Dump the kernel debug log before portals is unloaded
67 --minlevel <num> Specify the minimum level of services to configure/cleanup (default 0)
68 --maxlevel <num> Specify the maximum level of services to configure/cleanup (default 100)
69 Levels are aproximatly like:
79 --ldap server LDAP server with lustre config database
80 --makeldiff Translate xml source to LDIFF
81 This are perhaps not needed:
82 --lustre="src dir" Base directory of lustre sources. Used to search
84 --portals=src Portals source
88 # ============================================================
89 # Config parameters, encapsulated in a class
105 self._gdb_script = '/tmp/ogdb'
106 self._debug_path = '/tmp/lustre-log'
107 self._dump_file = None
112 def verbose(self, flag = None):
113 if flag: self._verbose = flag
116 def noexec(self, flag = None):
117 if flag: self._noexec = flag
120 def reformat(self, flag = None):
121 if flag: self._reformat = flag
122 return self._reformat
124 def cleanup(self, flag = None):
125 if flag: self._cleanup = flag
128 def gdb(self, flag = None):
129 if flag: self._gdb = flag
132 def nomod(self, flag = None):
133 if flag: self._nomod = flag
136 def nosetup(self, flag = None):
137 if flag: self._nosetup = flag
140 def force(self, flag = None):
141 if flag: self._force = flag
144 def node(self, val = None):
145 if val: self._node = val
148 def url(self, val = None):
149 if val: self._url = val
152 def gdb_script(self):
153 if os.path.isdir('/r'):
154 return '/r' + self._gdb_script
156 return self._gdb_script
158 def debug_path(self):
159 if os.path.isdir('/r'):
160 return '/r' + self._debug_path
162 return self._debug_path
164 def src_dir(self, val = None):
165 if val: self._src_dir = val
168 def dump_file(self, val = None):
169 if val: self._dump_file = val
170 return self._dump_file
172 def minlevel(self, val = None):
173 if val: self._minlevel = int(val)
174 return self._minlevel
176 def maxlevel(self, val = None):
177 if val: self._maxlevel = int(val)
178 return self._maxlevel
184 # ============================================================
185 # debugging and error funcs
def fixme(msg = "this feature"):
    """Abort with a LconfError reporting that *msg* is not implemented.

    Uses the call form of raise (valid in both Python 2 and 3) and fixes
    the 'implmemented' typo in the message.
    """
    raise LconfError(msg + ' not implemented yet.')
191 msg = string.join(map(str,args))
192 if not config.noexec():
193 raise LconfError(msg)
198 msg = string.join(map(str,args))
203 print string.strip(s)
207 msg = string.join(map(str,args))
210 # ============================================================
211 # locally defined exceptions
212 class CommandError (exceptions.Exception):
213 def __init__(self, cmd_name, cmd_err, rc=None):
214 self.cmd_name = cmd_name
215 self.cmd_err = cmd_err
220 if type(self.cmd_err) == types.StringType:
222 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
224 print "! %s: %s" % (self.cmd_name, self.cmd_err)
225 elif type(self.cmd_err) == types.ListType:
227 print "! %s (error %d):" % (self.cmd_name, self.rc)
229 print "! %s:" % (self.cmd_name)
230 for s in self.cmd_err:
231 print "> %s" %(string.strip(s))
235 class LconfError (exceptions.Exception):
236 def __init__(self, args):
240 # ============================================================
241 # handle lctl interface
244 Manage communication with lctl
247 def __init__(self, cmd):
        Initialize the class by finding the lctl binary.
251 self.lctl = find_prog(cmd)
254 debug('! lctl not found')
257 raise CommandError('lctl', "unable to find lctl binary.")
262 the cmds are written to stdin of lctl
263 lctl doesn't return errors when run in script mode, so
265 should modify command line to accept multiple commands, or
266 create complex command line options
268 debug("+", self.lctl, cmds)
269 if config.noexec(): return (0, [])
270 p = popen2.Popen3(self.lctl, 1)
271 p.tochild.write(cmds + "\n")
273 out = p.fromchild.readlines()
274 err = p.childerr.readlines()
276 if os.WIFEXITED(ret):
277 rc = os.WEXITSTATUS(ret)
281 raise CommandError(self.lctl, err, rc)
284 def runcmd(self, *args):
286 run lctl using the command line
288 cmd = string.join(map(str,args))
289 debug("+", self.lctl, cmd)
290 rc, out = run(self.lctl, cmd)
292 raise CommandError(self.lctl, out, rc)
296 def network(self, net, nid):
297 """ initialized network and add "self" """
298 # Idea: "mynid" could be used for all network types to add "self," and then
299 # this special case would be gone and the "self" hack would be hidden.
300 if net in ('tcp', 'toe'):
305 quit""" % (net, nid, nid)
314 # create a new connection
315 def connect(self, net, nid, port, servuuid, send_mem, recv_mem):
316 if net in ('tcp', 'toe'):
323 quit""" % (net, servuuid, nid, send_mem, recv_mem, nid, port, )
329 quit""" % (net, servuuid, nid, nid, port, )
333 # add a route to a range
334 def add_route(self, net, gw, lo, hi):
338 quit """ % (net, gw, lo, hi)
342 def del_route(self, net, gw, lo, hi):
350 # add a route to a host
351 def add_route_host(self, net, uuid, gw, tgt):
356 quit """ % (net, uuid, tgt, gw, tgt)
359 # add a route to a range
360 def del_route_host(self, net, uuid, gw, tgt):
366 quit """ % (net, uuid, tgt)
369 # disconnect one connection
370 def disconnect(self, net, nid, port, servuuid):
376 quit""" % (net, nid, servuuid)
380 def disconnectAll(self, net):
389 # create a new device with lctl
390 def newdev(self, attach, setup = ""):
395 quit""" % (attach, setup)
399 def cleanup(self, name, uuid):
405 quit""" % (name, ('', 'force')[config.force()])
409 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist):
413 lov_setconfig %s %d %d %d %s %s
414 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
418 def dump(self, dump_file):
421 quit""" % (dump_file)
424 # get list of devices
425 def device_list(self):
426 rc, out = self.runcmd('device_list')
429 # ============================================================
430 # Various system-level functions
431 # (ideally moved to their own module)
433 # Run a command and return the output and status.
434 # stderr is sent to /dev/null, could use popen3 to
435 # save it if necessary
437 cmd = string.join(map(str,args))
439 if config.noexec(): return (0, [])
440 f = os.popen(cmd + ' 2>&1')
449 # Run a command in the background.
450 def run_daemon(*args):
451 cmd = string.join(map(str,args))
453 if config.noexec(): return 0
454 f = os.popen(cmd + ' 2>&1')
462 # Determine full path to use for an external command
463 # searches dirname(argv[0]) first, then PATH
465 syspath = string.split(os.environ['PATH'], ':')
466 cmdpath = os.path.dirname(sys.argv[0])
467 syspath.insert(0, cmdpath);
468 syspath.insert(0, os.path.join(cmdpath, '../../portals/linux/utils/'))
470 prog = os.path.join(d,cmd)
471 if os.access(prog, os.X_OK):
475 # Recursively look for file starting at base dir
476 def do_find_file(base, mod):
477 fullname = os.path.join(base, mod)
478 if os.access(fullname, os.R_OK):
480 for d in os.listdir(base):
481 dir = os.path.join(base,d)
482 if os.path.isdir(dir):
483 module = do_find_file(dir, mod)
487 def find_module(src_dir, dev_dir, modname):
488 mod = '%s.o' % (modname)
489 module = src_dir +'/'+ dev_dir +'/'+ mod
491 if os.access(module, os.R_OK):
497 # is the path a block device?
504 return stat.S_ISBLK(s[stat.ST_MODE])
506 # build fs according to type
508 def mkfs(fstype, dev):
509 if(fstype in ('ext3', 'extN')):
510 mkfs = 'mkfs.ext2 -j -b 4096'
512 print 'unsupported fs type: ', fstype
513 if not is_block(dev):
517 (ret, out) = run (mkfs, force, dev)
519 panic("Unable to build fs:", dev)
    # enable hash tree indexing on the fs
521 # FIXME: this check can probably go away on 2.5
523 htree = 'echo "feature FEATURE_C5" | debugfs -w'
524 (ret, out) = run (htree, dev)
526 panic("Unable to enable htree:", dev)
528 # some systems use /dev/loopN, some /dev/loop/N
532 if not os.access(loop + str(0), os.R_OK):
534 if not os.access(loop + str(0), os.R_OK):
535 panic ("can't access loop devices")
538 # find loop device assigned to thefile
541 for n in xrange(0, MAX_LOOP_DEVICES):
543 if os.access(dev, os.R_OK):
544 (stat, out) = run('losetup', dev)
545 if (out and stat == 0):
546 m = re.search(r'\((.*)\)', out[0])
547 if m and file == m.group(1):
553 # create file if necessary and assign the first free loop device
554 def init_loop(file, size, fstype):
555 dev = find_loop(file)
557 print 'WARNING file:', file, 'already mapped to', dev
559 if config.reformat() or not os.access(file, os.R_OK | os.W_OK):
560 run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, file))
562 # find next free loop
563 for n in xrange(0, MAX_LOOP_DEVICES):
565 if os.access(dev, os.R_OK):
566 (stat, out) = run('losetup', dev)
568 run('losetup', dev, file)
571 print "out of loop devices"
573 print "out of loop devices"
576 # undo loop assignment
577 def clean_loop(file):
578 dev = find_loop(file)
580 ret, out = run('losetup -d', dev)
582 log('unable to clean loop device:', dev, 'for file:', file)
585 # determine if dev is formatted as a <fstype> filesystem
586 def need_format(fstype, dev):
587 # FIXME don't know how to implement this
590 # initialize a block device if needed
591 def block_dev(dev, size, fstype, format):
592 if config.noexec(): return dev
593 if not is_block(dev):
594 dev = init_loop(dev, size, fstype)
595 if config.reformat() or (need_format(fstype, dev) and format == 'yes'):
599 # panic("device:", dev,
600 # "not prepared, and autoformat is not set.\n",
601 # "Rerun with --reformat option to format ALL filesystems")
606 """lookup IP address for an interface"""
607 rc, out = run("/sbin/ifconfig", iface)
610 addr = string.split(out[1])[1]
611 ip = string.split(addr, ':')[1]
614 def get_local_address(net_type, wildcard):
615 """Return the local address for the network type."""
617 if net_type in ('tcp', 'toe'):
619 iface, star = string.split(wildcard, ':')
620 local = if2addr(iface)
622 panic ("unable to determine ip for:", wildcard)
624 host = socket.gethostname()
625 local = socket.gethostbyname(host)
626 elif net_type == 'elan':
627 # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
629 fp = open('/proc/elan/device0/position', 'r')
630 lines = fp.readlines()
639 elif net_type == 'gm':
640 fixme("automatic local address for GM")
644 def is_prepared(uuid):
645 """Return true if a device exists for the uuid"""
646 # expect this format:
647 # 1 UP ldlm ldlm ldlm_UUID 2
649 out = lctl.device_list()
651 if uuid == string.split(s)[4]:
653 except CommandError, e:
658 # ============================================================
659 # Classes to prepare and cleanup the various objects
662 """ Base class for the rest of the modules. The default cleanup method is
    defined here, as well as some utility funcs.
665 def __init__(self, module_name, dom_node):
666 self.dom_node = dom_node
667 self.module_name = module_name
668 self.name = get_attr(dom_node, 'name')
669 self.uuid = get_attr(dom_node, 'uuid')
670 self.kmodule_list = []
    def info(self, *args):
        """Print a status line identifying this service: module type, name, uuid, then any extra args."""
        msg = string.join(map(str,args))
        print self.module_name + ":", self.name, self.uuid, msg
679 def lookup_server(self, srv_uuid):
680 """ Lookup a server's network information """
681 net = get_ost_net(self.dom_node.parentNode, srv_uuid)
683 panic ("Unable to find a server for:", srv_uuid)
684 self._server = Network(net)
686 def get_server(self):
690 """ default cleanup, used for most modules """
692 srv = self.get_server()
693 if srv and local_net(srv):
695 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
696 except CommandError, e:
697 log(self.module_name, "disconnect failed: ", self.name)
701 lctl.cleanup(self.name, self.uuid)
702 except CommandError, e:
703 log(self.module_name, "cleanup failed: ", self.name)
707 def add_module(self, dev_dir, modname):
708 """Append a module to list of modules to load."""
709 self.kmodule_list.append((dev_dir, modname))
711 def mod_loaded(self, modname):
712 """Check if a module is already loaded. Look in /proc/modules for it."""
713 fp = open('/proc/modules')
714 lines = fp.readlines()
716 # please forgive my tired fingers for this one
717 ret = filter(lambda word, mod=modname: word == mod,
718 map(lambda line: string.split(line)[0], lines))
721 def load_module(self):
722 """Load all the modules in the list in the order they appear."""
723 for dev_dir, mod in self.kmodule_list:
724 # (rc, out) = run ('/sbin/lsmod | grep -s', mod)
725 if self.mod_loaded(mod) and not config.noexec():
727 log ('loading module:', mod)
729 module = find_module(config.src_dir(),dev_dir, mod)
731 panic('module not found:', mod)
732 (rc, out) = run('/sbin/insmod', module)
734 raise CommandError('insmod', out, rc)
736 (rc, out) = run('/sbin/modprobe', mod)
738 raise CommandError('modprobe', out, rc)
740 def cleanup_module(self):
741 """Unload the modules in the list in reverse order."""
742 rev = self.kmodule_list
744 for dev_dir, mod in rev:
745 if not self.mod_loaded(mod):
748 if mod == 'portals' and config.dump_file():
749 lctl.dump(config.dump_file())
750 log('unloading module:', mod)
753 (rc, out) = run('/sbin/rmmod', mod)
755 log('! unable to unload module:', mod)
759 class Network(Module):
760 def __init__(self,dom_node):
761 Module.__init__(self, 'NETWORK', dom_node)
762 self.net_type = get_attr(dom_node,'type')
763 self.nid = get_text(dom_node, 'server', '*')
764 self.port = get_text_int(dom_node, 'port', 0)
765 self.send_mem = get_text_int(dom_node, 'send_mem', DEFAULT_TCPBUF)
766 self.recv_mem = get_text_int(dom_node, 'recv_mem', DEFAULT_TCPBUF)
768 self.nid = get_local_address(self.net_type, self.nid)
770 panic("unable to set nid for", self.net_type, self.nid)
771 debug("nid:", self.nid)
773 self.add_module('portals/linux/oslib/', 'portals')
774 if node_needs_router():
775 self.add_module('portals/linux/router', 'kptlrouter')
776 if self.net_type == 'tcp':
777 self.add_module('portals/linux/socknal', 'ksocknal')
778 if self.net_type == 'toe':
779 self.add_module('portals/linux/toenal', 'ktoenal')
780 if self.net_type == 'elan':
781 self.add_module('portals/linux/rqswnal', 'kqswnal')
782 if self.net_type == 'gm':
783 self.add_module('portals/linux/gmnal', 'kgmnal')
784 self.add_module('lustre/obdclass', 'obdclass')
785 self.add_module('lustre/ptlrpc', 'ptlrpc')
788 self.info(self.net_type, self.nid, self.port)
789 if self.net_type in ('tcp', 'toe'):
790 nal_id = '' # default is socknal
791 if self.net_type == 'toe':
793 ret, out = run(TCP_ACCEPTOR, '-s', self.send_mem, '-r', self.recv_mem, nal_id, self.port)
795 raise CommandError(TCP_ACCEPTOR, out, ret)
796 ret = self.dom_node.getElementsByTagName('route_tbl')
798 for r in a.getElementsByTagName('route'):
799 net_type = get_attr(r, 'type')
800 gw = get_attr(r, 'gw')
801 lo = get_attr(r, 'lo')
802 hi = get_attr(r,'hi', '')
803 lctl.add_route(net_type, gw, lo, hi)
804 if net_type in ('tcp', 'toe') and net_type == self.net_type and hi == '':
805 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
807 panic("no server for nid", lo)
809 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
812 lctl.network(self.net_type, self.nid)
813 lctl.newdev(attach = "ptlrpc RPCDEV RPCDEV_UUID")
816 self.info(self.net_type, self.nid, self.port)
817 ret = self.dom_node.getElementsByTagName('route_tbl')
819 for r in a.getElementsByTagName('route'):
820 lo = get_attr(r, 'lo')
821 hi = get_attr(r,'hi', '')
822 if self.net_type in ('tcp', 'toe') and hi == '':
823 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
825 panic("no server for nid", lo)
828 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
829 except CommandError, e:
830 print "disconnect failed: ", self.name
834 lctl.del_route(self.net_type, self.nid, lo, hi)
835 except CommandError, e:
836 print "del_route failed: ", self.name
841 lctl.cleanup("RPCDEV", "RPCDEV_UUID")
842 except CommandError, e:
843 print "cleanup failed: ", self.name
847 lctl.disconnectAll(self.net_type)
848 except CommandError, e:
849 print "disconnectAll failed: ", self.name
852 if self.net_type in ('tcp', 'toe'):
853 # yikes, this ugly! need to save pid in /var/something
854 run("killall acceptor")
    def __init__(self,dom_node):
        """Set up the LDLM (lock manager) service and queue its kernel module."""
        Module.__init__(self, 'LDLM', dom_node)
        self.add_module('lustre/ldlm', 'ldlm')
861 if is_prepared(self.uuid):
864 lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid),
868 def __init__(self,dom_node):
869 Module.__init__(self, 'LOV', dom_node)
870 self.mds_uuid = get_first_ref(dom_node, 'mds')
871 mds= lookup(dom_node.parentNode, self.mds_uuid)
872 self.mds_name = getName(mds)
873 devs = dom_node.getElementsByTagName('devices')
876 self.stripe_sz = get_attr_int(dev_node, 'stripesize', 65536)
877 self.stripe_off = get_attr_int(dev_node, 'stripeoffset', 0)
878 self.pattern = get_attr_int(dev_node, 'pattern', 0)
879 self.devlist = get_all_refs(dev_node, 'osc')
880 self.stripe_cnt = get_attr_int(dev_node, 'stripecount', len(self.devlist))
881 self.add_module('lustre/mdc', 'mdc')
882 self.add_module('lustre/lov', 'lov')
885 if is_prepared(self.uuid):
887 for osc_uuid in self.devlist:
888 osc = lookup(self.dom_node.parentNode, osc_uuid)
893 panic('osc not found:', osc_uuid)
894 mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
895 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
896 self.stripe_off, self.pattern, self.devlist, self.mds_name)
897 lctl.newdev(attach="lov %s %s" % (self.name, self.uuid),
898 setup ="%s" % (mdc_uuid))
901 if not is_prepared(self.uuid):
903 for osc_uuid in self.devlist:
904 osc = lookup(self.dom_node.parentNode, osc_uuid)
909 panic('osc not found:', osc_uuid)
911 cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
914 def load_module(self):
915 for osc_uuid in self.devlist:
916 osc = lookup(self.dom_node.parentNode, osc_uuid)
922 panic('osc not found:', osc_uuid)
923 Module.load_module(self)
926 def cleanup_module(self):
927 Module.cleanup_module(self)
928 for osc_uuid in self.devlist:
929 osc = lookup(self.dom_node.parentNode, osc_uuid)
935 panic('osc not found:', osc_uuid)
937 class LOVConfig(Module):
938 def __init__(self,dom_node):
939 Module.__init__(self, 'LOVConfig', dom_node)
940 self.lov_uuid = get_first_ref(dom_node, 'lov')
941 l = lookup(dom_node.parentNode, self.lov_uuid)
946 self.info(lov.mds_uuid, lov.stripe_cnt, lov.stripe_sz, lov.stripe_off,
947 lov.pattern, lov.devlist, lov.mds_name)
948 lctl.lov_setconfig(lov.uuid, lov.mds_name, lov.stripe_cnt,
949 lov.stripe_sz, lov.stripe_off, lov.pattern,
950 string.join(lov.devlist))
958 def __init__(self,dom_node):
959 Module.__init__(self, 'MDS', dom_node)
960 self.devname, self.size = get_device(dom_node)
961 self.fstype = get_text(dom_node, 'fstype')
962 # FIXME: if fstype not set, then determine based on kernel version
963 self.format = get_text(dom_node, 'autoformat', "no")
964 if self.fstype == 'extN':
965 self.add_module('lustre/extN', 'extN')
966 self.add_module('lustre/mds', 'mds')
967 self.add_module('lustre/mds', 'mds_%s' % (self.fstype))
970 if is_prepared(self.uuid):
972 self.info(self.devname, self.fstype, self.format)
973 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
974 if not is_prepared('MDT_UUID'):
975 lctl.newdev(attach="mdt %s %s" % ('MDT', 'MDT_UUID'),
977 lctl.newdev(attach="mds %s %s" % (self.name, self.uuid),
978 setup ="%s %s" %(blkdev, self.fstype))
980 if is_prepared('MDT_UUID'):
982 lctl.cleanup("MDT", "MDT_UUID")
983 except CommandError, e:
984 print "cleanup failed: ", self.name
987 if not is_prepared(self.uuid):
990 clean_loop(self.devname)
992 # Very unusual case, as there is no MDC element in the XML anymore
993 # Builds itself from an MDS node
995 def __init__(self,dom_node):
996 self.mds = MDS(dom_node)
997 self.dom_node = dom_node
998 self.module_name = 'MDC'
999 self.kmodule_list = []
1003 host = socket.gethostname()
1004 self.name = 'MDC_%s' % (self.mds.name)
1005 self.uuid = '%s_%05x_UUID' % (self.name , int(random.random() * 100000))
1007 self.lookup_server(self.mds.uuid)
1008 self.add_module('lustre/mdc', 'mdc')
1011 if is_prepared(self.uuid):
1013 self.info(self.mds.uuid)
1014 srv = self.get_server()
1015 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
1016 lctl.newdev(attach="mdc %s %s" % (self.name, self.uuid),
1017 setup ="%s %s" %(self.mds.uuid, srv.uuid))
1020 def __init__(self, dom_node):
1021 Module.__init__(self, 'OBD', dom_node)
1022 self.obdtype = get_attr(dom_node, 'type')
1023 self.devname, self.size = get_device(dom_node)
1024 self.fstype = get_text(dom_node, 'fstype')
1025 # FIXME: if fstype not set, then determine based on kernel version
1026 self.format = get_text(dom_node, 'autoformat', 'yes')
1027 if self.fstype == 'extN':
1028 self.add_module('lustre/extN', 'extN')
1029 self.add_module('lustre/' + self.obdtype, self.obdtype)
1031 # need to check /proc/mounts and /etc/mtab before
1032 # formatting anything.
1033 # FIXME: check if device is already formatted.
1035 if is_prepared(self.uuid):
1037 self.info(self.obdtype, self.devname, self.size, self.fstype, self.format)
1038 if self.obdtype == 'obdecho':
1041 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
1042 lctl.newdev(attach="%s %s %s" % (self.obdtype, self.name, self.uuid),
1043 setup ="%s %s" %(blkdev, self.fstype))
1045 if not is_prepared(self.uuid):
1047 Module.cleanup(self)
1048 if not self.obdtype == 'obdecho':
1049 clean_loop(self.devname)
    def __init__(self,dom_node):
        """Set up the OST service; records the uuid of its underlying OBD."""
        Module.__init__(self, 'OST', dom_node)
        self.obd_uuid = get_first_ref(dom_node, 'obd')
        self.add_module('lustre/ost', 'ost')
1058 if is_prepared(self.uuid):
1060 self.info(self.obd_uuid)
1061 lctl.newdev(attach="ost %s %s" % (self.name, self.uuid),
1062 setup ="%s" % (self.obd_uuid))
1065 # virtual interface for OSC and LOV
1067 def __init__(self,dom_node):
1068 Module.__init__(self, 'VOSC', dom_node)
1069 if dom_node.nodeName == 'lov':
1070 self.osc = LOV(dom_node)
1072 self.osc = OSC(dom_node)
    def load_module(self):
        """Delegate module loading to the wrapped OSC/LOV object."""
        self.osc.load_module()
    def cleanup_module(self):
        """Delegate module unloading to the wrapped OSC/LOV object."""
        self.osc.cleanup_module()
    def __init__(self,dom_node):
        """Set up an OSC client module from its config node.

        Resolves the obd and ost uuid references, then looks up the
        server network information for the target OST (panics via
        lookup_server if no server entry exists).
        """
        Module.__init__(self, 'OSC', dom_node)
        self.obd_uuid = get_first_ref(dom_node, 'obd')
        self.ost_uuid = get_first_ref(dom_node, 'ost')
        self.lookup_server(self.ost_uuid)
        self.add_module('lustre/osc', 'osc')
1092 if is_prepared(self.uuid):
1094 self.info(self.obd_uuid, self.ost_uuid)
1095 srv = self.get_server()
1097 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
1101 lctl.add_route_host(r[0], srv.uuid, r[1], r[2])
1103 panic ("no route to", srv.nid)
1105 lctl.newdev(attach="osc %s %s" % (self.name, self.uuid),
1106 setup ="%s %s" %(self.obd_uuid, srv.uuid))
1109 if not is_prepared(self.uuid):
1111 srv = self.get_server()
1113 Module.cleanup(self)
1115 self.info(self.obd_uuid, self.ost_uuid)
1119 lctl.del_route_host(r[0], srv.uuid, r[1], r[2])
1120 except CommandError, e:
1121 print "del_route failed: ", self.name
1124 Module.cleanup(self)
1127 class Mountpoint(Module):
1128 def __init__(self,dom_node):
1129 Module.__init__(self, 'MTPT', dom_node)
1130 self.path = get_text(dom_node, 'path')
1131 self.mds_uuid = get_first_ref(dom_node, 'mds')
1132 self.lov_uuid = get_first_ref(dom_node, 'osc')
1133 self.add_module('lustre/mdc', 'mdc')
1134 self.add_module('lustre/llite', 'llite')
1135 l = lookup(self.dom_node.parentNode, self.lov_uuid)
1140 mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
1141 self.info(self.path, self.mds_uuid, self.lov_uuid)
1142 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \
1143 (self.lov_uuid, mdc_uuid, self.path)
1144 run("mkdir", self.path)
1147 panic("mount failed:", self.path)
1150 self.info(self.path, self.mds_uuid,self.lov_uuid)
1152 (rc, out) = run("umount -f", self.path)
1154 (rc, out) = run("umount", self.path)
1156 log("umount failed, cleanup will most likely not work.")
1157 l = lookup(self.dom_node.parentNode, self.lov_uuid)
1159 cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
    def load_module(self):
        """Load the osc/lov modules first, then this mountpoint's own list."""
        self.osc.load_module()
        Module.load_module(self)
    def cleanup_module(self):
        """Unload in reverse of load_module: own modules first, then the osc's."""
        Module.cleanup_module(self)
        self.osc.cleanup_module()
1169 # ============================================================
1170 # XML processing and query
# TODO: Change query funcs to use XPath, which is much cleaner
1173 def get_device(obd):
1174 list = obd.getElementsByTagName('device')
1178 size = get_attr_int(dev, 'size', 0)
1179 return dev.firstChild.data, size
1182 # Get the text content from the first matching child
1183 # If there is no content (or it is all whitespace), return
1185 def get_text(dom_node, tag, default=""):
1186 list = dom_node.getElementsByTagName(tag)
1189 dom_node.normalize()
1190 if dom_node.firstChild:
1191 txt = string.strip(dom_node.firstChild.data)
1196 def get_text_int(dom_node, tag, default=0):
1197 list = dom_node.getElementsByTagName(tag)
1201 dom_node.normalize()
1202 if dom_node.firstChild:
1203 txt = string.strip(dom_node.firstChild.data)
1208 panic("text value is not integer:", txt)
1211 def get_attr(dom_node, attr, default=""):
1212 v = dom_node.getAttribute(attr)
1217 def get_attr_int(dom_node, attr, default=0):
1219 v = dom_node.getAttribute(attr)
1224 panic("attr value is not integer", v)
1227 def get_first_ref(dom_node, tag):
    """ Get the first uuidref of the type TAG. Use when only
    one is expected. Returns the uuid."""
1231 refname = '%s_ref' % tag
1232 list = dom_node.getElementsByTagName(refname)
1234 uuid = getRef(list[0])
1237 def get_all_refs(dom_node, tag):
1238 """ Get all the refs of type TAG. Returns list of uuids. """
1240 refname = '%s_ref' % tag
1241 list = dom_node.getElementsByTagName(refname)
1244 uuids.append(getRef(i))
1247 def get_ost_net(dom_node, uuid):
1248 ost = lookup(dom_node, uuid)
1249 uuid = get_first_ref(ost, 'network')
1252 return lookup(dom_node, uuid)
1254 def nid2server(dom_node, nid):
1255 netlist = dom_node.getElementsByTagName('network')
1256 for net_node in netlist:
1257 if get_text(net_node, 'server') == nid:
1258 return Network(net_node)
1261 def lookup(dom_node, uuid):
1262 for n in dom_node.childNodes:
1263 if n.nodeType == n.ELEMENT_NODE:
1264 if getUUID(n) == uuid:
1271 # Get name attribute of dom_node
def getName(dom_node):
    """Return the 'name' attribute of dom_node ('' if unset)."""
    name = dom_node.getAttribute('name')
    return name
def getRef(dom_node):
    """Return the 'uuidref' attribute of dom_node ('' if unset)."""
    ref = dom_node.getAttribute('uuidref')
    return ref
# Get uuid attribute of dom_node
def getUUID(dom_node):
    """Return the 'uuid' attribute of dom_node ('' if unset)."""
    uuid = dom_node.getAttribute('uuid')
    return uuid
1282 # the tag name is the service type
1283 # fixme: this should do some checks to make sure the dom_node is a service
# the tag name is the service type
# fixme: this should do some checks to make sure the dom_node is a service
def getServiceType(dom_node):
    """Return the element's tag name, which names the service type (e.g. 'mds')."""
    service_type = dom_node.nodeName
    return service_type
1288 # determine what "level" a particular node is at.
# the order of initialization is based on level.
1290 def getServiceLevel(dom_node):
1291 type = getServiceType(dom_node)
1293 if type in ('network',):
1295 elif type in ('device', 'ldlm'):
1297 elif type in ('obd', 'mdd'):
1299 elif type in ('mds','ost'):
1301 elif type in ('mdc','osc'):
1303 elif type in ('lov', 'lovconfig'):
1305 elif type in ('mountpoint',):
1308 if ret < config.minlevel() or ret > config.maxlevel():
1313 # return list of services in a profile. list is a list of tuples
1314 # [(level, dom_node),]
1315 def getServices(lustreNode, profileNode):
1317 for n in profileNode.childNodes:
1318 if n.nodeType == n.ELEMENT_NODE:
1319 servNode = lookup(lustreNode, getRef(n))
1322 panic('service not found: ' + getRef(n))
1323 level = getServiceLevel(servNode)
1325 list.append((level, servNode))
1329 def getByName(lustreNode, name, tag):
1330 ndList = lustreNode.getElementsByTagName(tag)
1332 if getName(nd) == name:
1337 ############################################################
1339 # FIXME: clean this mess up!
1342 def prepare_mdc(dom_node, mds_uuid):
1344 mds_node = lookup(dom_node, mds_uuid);
1346 panic("no mds:", mds_uuid)
1347 if saved_mdc.has_key(mds_uuid):
1348 return saved_mdc[mds_uuid]
1351 saved_mdc[mds_uuid] = mdc.uuid
1354 def cleanup_mdc(dom_node, mds_uuid):
1356 mds_node = lookup(dom_node, mds_uuid);
1358 panic("no mds:", mds_uuid)
1359 if not saved_mdc.has_key(mds_uuid):
1362 saved_mdc[mds_uuid] = mdc.uuid
1365 ############################################################
1366 # routing ("rooting")
def init_node(dom_node):
    """Record this node's network interfaces in the global local_node list.

    Appends a (net_type, server) tuple for every <network> element found
    under dom_node.
    """
    global local_node, router_flag
    netlist = dom_node.getElementsByTagName('network')
    for dom_net in netlist:
        # renamed from 'type' to avoid shadowing the builtin
        net_type = get_attr(dom_net, 'type')
        gw = get_text(dom_net, 'server')
        local_node.append((net_type, gw))
1380 def node_needs_router():
1383 def get_routes(type, gw, dom_net):
1384 """ Return the routes as a list of tuples of the form:
1385 [(type, gw, lo, hi),]"""
1387 tbl = dom_net.getElementsByTagName('route_tbl')
1389 routes = t.getElementsByTagName('route')
1391 lo = get_attr(r, 'lo')
1392 hi = get_attr(r, 'hi', '')
1393 res.append((type, gw, lo, hi))
1397 def init_route_config(lustre):
1398 """ Scan the lustre config looking for routers. Build list of
1400 global routes, router_flag
1402 list = lustre.getElementsByTagName('node')
1404 if get_attr(node, 'router'):
1406 for (local_type, local_nid) in local_node:
1408 netlist = node.getElementsByTagName('network')
1409 for dom_net in netlist:
1410 if local_type == get_attr(dom_net, 'type'):
1411 gw = get_text(dom_net, 'server')
1415 for dom_net in netlist:
1416 if local_type != get_attr(dom_net, 'type'):
1417 for route in get_routes(local_type, gw, dom_net):
1418 routes.append(route)
1423 for iface in local_node:
1424 if net.net_type == iface[0]:
1428 def find_route(net):
1429 global local_node, routes
1430 frm_type = local_node[0][0]
1431 to_type = net.net_type
1433 debug ('looking for route to', to_type,to)
1442 ############################################################
1445 def startService(dom_node, module_flag):
# Instantiate the service object matching the element's type and run its
# prepare/cleanup step.  NOTE(review): several dispatch arms (mds, obd,
# ost, ...) and the action code are missing from this excerpt.
1446 type = getServiceType(dom_node)
1447 debug('Service:', type, getName(dom_node), getUUID(dom_node))
1448 # there must be a more dynamic way of doing this...
1454 elif type == 'lovconfig':
1455 n = LOVConfig(dom_node)
1456 elif type == 'network':
1457 n = Network(dom_node)
1468 elif type == 'mountpoint':
1469 n = Mountpoint(dom_node)
# Unknown element types are fatal: the config is malformed.
1471 panic ("unknown service type:", type)
# NOTE(review): orphaned fragment -- the enclosing function's def and most
# of its body are missing from this excerpt; these guards branch on the
# cleanup/nosetup command-line flags.
1476 if config.cleanup():
1481 if config.nosetup():
1483 if config.cleanup():
1489 # Prepare the system to run lustre using a particular profile
1490 # in the configuration.
1491 # * load the modules
1492 # * setup networking for the current node
1493 # * make sure partitions are in place and prepared
1494 # * initialize devices with lctl
1495 # Levels are important, and need to be enforced.
1496 def startProfile(lustreNode, profileNode, module_flag):
# Missing profile is fatal.  NOTE(review): the guard line preceding this
# panic is missing from this excerpt.
1498 panic("profile:", profile, "not found.")
1499 services = getServices(lustreNode, profileNode)
# On cleanup the service list is processed in reverse order (teardown
# mirrors setup) -- the reversal line is missing from this excerpt.
1500 if config.cleanup():
1503 startService(s[1], module_flag)
1508 def doHost(lustreNode, hosts):
# Find the first <node> entry matching one of the candidate hostnames.
# NOTE(review): the loop over 'hosts' is missing from this excerpt.
1512 dom_node = getByName(lustreNode, h, 'node')
1517 print 'No host entry found.'
# Non-router nodes get local networking and the route table initialized.
1520 if not get_attr(dom_node, 'router'):
1522 init_route_config(lustreNode)
1527 # Two step process: (1) load modules, (2) setup lustre
1528 # if not cleaning, load modules first.
1529 module_flag = not config.cleanup()
1530 reflist = dom_node.getElementsByTagName('profile')
1531 for profile in reflist:
1532 startProfile(lustreNode, profile, module_flag)
1534 if not config.cleanup():
1535 sys_set_debug_path()
# Emit a gdb helper script listing loaded modules, then pause so a
# debugger can attach (see --gdb option in the usage text).
1536 script = config.gdb_script()
1537 run(lctl.lctl, ' modules >', script)
1539 # dump /tmp/ogdb and sleep/pause here
1540 log ("The GDB module script is in", script)
# Second pass with the flag flipped: setup after modules (or module
# unload after service cleanup).
1543 module_flag = not module_flag
1544 for profile in reflist:
1545 startProfile(lustreNode, profile, module_flag)
1547 ############################################################
1548 # Command line processing
1550 def parse_cmdline(argv):
# Parse command-line flags into the global config object and return the
# positional arguments.  NOTE(review): the option-handling bodies and
# the return statement are missing from this excerpt.
1551 short_opts = "hdnvf"
1552 long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb",
1553 "portals=", "makeldiff", "cleanup", "noexec",
1554 "help", "node=", "nomod", "nosetup",
1555 "dump=", "force", "minlevel=", "maxlevel="]
1559 opts, args = getopt.getopt(argv, short_opts, long_opts)
# Bad options fall through to the usage message (handler line missing).
1560 except getopt.error:
1565 if o in ("-h", "--help"):
1567 if o in ("-d","--cleanup"):
1569 if o in ("-v", "--verbose"):
1571 if o in ("-n", "--noexec"):
1574 if o == "--portals":
1578 if o == "--reformat":
1586 if o == "--nosetup":
1590 if o in ("-f", "--force"):
1592 if o in ("--minlevel",):
1594 if o in ("--maxlevel",):
# NOTE(review): orphaned fragment of a URL-fetch helper; its def line and
# the read/return code are missing from this excerpt.
1603 s = urllib.urlopen(url)
def setupModulePath(cmd):
    """If *cmd* was run out of a build tree (a Makefile sits next to
    it), point config.src_dir at the tree root two levels up so modules
    are loaded from the source checkout instead of the installed set."""
    base = os.path.dirname(cmd)
    makefile = base + "/Makefile"
    if os.access(makefile, os.R_OK):
        src_root = base + "/../../"
        config.src_dir(src_root)
1614 def sys_set_debug_path():
1615 debug("debug path: ", config.debug_path())
# Push the configured debug path into the portals proc tunable.
# NOTE(review): the noexec guard and any error handling around the open
# are missing from this excerpt.
1619 fp = open('/proc/sys/portals/debug_path', 'w')
1620 fp.write(config.debug_path())
1625 #/proc/sys/net/core/rmem_max
1626 #/proc/sys/net/core/wmem_max
# Raise a kernel socket-buffer limit to at least 'max' bytes.
# NOTE(review): the read of the current value and the comparison guard
# are missing from this excerpt; 'max' shadows the builtin.
1627 def sys_set_netmem_max(path, max):
1628 debug("setting", path, "to at least", max)
1636 fp = open(path, 'w')
1637 fp.write('%d\n' %(max))
def sys_make_devices():
    """Ensure the portals and obd control device nodes exist.

    Both live on the misc character major (10); each mknod runs only
    when the node is not already readable.
    """
    device_table = (
        ('/dev/portals', 'mknod /dev/portals c 10 240'),
        ('/dev/obd', 'mknod /dev/obd c 10 241'),
    )
    for dev_path, mknod_cmd in device_table:
        if not os.access(dev_path, os.R_OK):
            run(mknod_cmd)
1648 # Add dir to the global PATH, if not already there.
1649 def add_to_path(new_dir):
1650 syspath = string.split(os.environ['PATH'], ':')
# Already present: nothing to do.  NOTE(review): the early-return line
# is missing from this excerpt.
1651 if new_dir in syspath:
1653 os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
# Directories every sane PATH should contain; appended if absent.
1656 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
1657 # ensure basic elements are in the system path
1658 def sanitise_path():
# NOTE(review): the loop body is missing from this excerpt; 'dir'
# shadows the builtin.
1659 for dir in DEFAULT_PATH:
1662 # Initialize or shutdown lustre according to a configuration file
1663 # * prepare the system for lustre
1664 # * configure devices with lctl
1665 # Shutdown does steps in reverse
# NOTE(review): the 'def main():' line itself is missing from this
# excerpt, along with many body lines; what follows is its interior.
1668 global TCP_ACCEPTOR, lctl, MAXTCPBUF
1669 host = socket.gethostname()
1671 # the PRNG is normally seeded with time(), which is not so good for starting
1672 # time-synchronized clusters
# Seed the PRNG from /dev/urandom so simultaneously-booted nodes do not
# all draw the same sequence.
1673 input = open('/dev/urandom', 'r')
1675 print 'Unable to open /dev/urandom!'
1677 seed = input.read(32)
1683 args = parse_cmdline(sys.argv[1:])
# Config source: positional XML file, or --get URL fetched over HTTP.
1685 if not os.access(args[0], os.R_OK):
1686 print 'File not found or readable:', args[0]
1688 dom = xml.dom.minidom.parse(args[0])
1690 xmldata = fetch(config.url())
1691 dom = xml.dom.minidom.parseString(xmldata)
# Candidate node names: explicit --node, then hostname, then localhost.
1697 node_list.append(config.node())
1700 node_list.append(host)
1701 node_list.append('localhost')
1702 debug("configuring for host: ", node_list)
# Per-host suffixes keep debug/gdb files distinct on shared filesystems.
1705 config._debug_path = config._debug_path + '-' + host
1706 config._gdb_script = config._gdb_script + '-' + host
# Locate the tcp acceptor binary; under noexec a missing binary is only
# warned about, otherwise it is fatal (guard lines missing here).
1708 TCP_ACCEPTOR = find_prog('acceptor')
1709 if not TCP_ACCEPTOR:
1711 TCP_ACCEPTOR = 'acceptor'
1712 debug('! acceptor not found')
1714 panic('acceptor not found')
1716 lctl = LCTLInterface('lctl')
1718 setupModulePath(sys.argv[0])
# Bump kernel socket buffer ceilings before configuring networking.
1720 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
1721 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
1722 doHost(dom.documentElement, node_list)
1724 if __name__ == "__main__":
# Driver: run main() and turn known exception types into diagnostics.
# NOTE(review): the try/main() lines and the exception-handler bodies
# are missing from this excerpt.
1727 except LconfError, e:
1729 except CommandError, e:
# Propagate the first cleanup failure as the process exit status.
1733 if first_cleanup_error:
1734 sys.exit(first_cleanup_error)