3 # Copyright (C) 2002 Cluster File Systems, Inc.
4 # Author: Robert Read <rread@clusterfs.com>
5 # This file is part of Lustre, http://www.lustre.org.
7 # Lustre is free software; you can redistribute it and/or
8 # modify it under the terms of version 2 of the GNU General Public
9 # License as published by the Free Software Foundation.
11 # Lustre is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with Lustre; if not, write to the Free Software
18 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 # lconf - lustre configuration tool
22 # lconf is the main driver script for starting and stopping
23 # lustre filesystem services.
25 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import string, os, stat, popen2, socket, time
30 import xml.dom.minidom
35 DEFAULT_TCPBUF = 1048576
37 # Maximum number of devices to search for.
38 # (the /dev/loop* nodes need to be created beforehand)
39 MAX_LOOP_DEVICES = 256
# Exit status of the first cleanup step that failed (0 = no failures yet).
first_cleanup_error = 0

def cleanup_error(rc):
    """Record rc as the overall cleanup status, keeping only the first error."""
    global first_cleanup_error
    if first_cleanup_error == 0:
        first_cleanup_error = rc
49 print """usage: lconf config.xml
51 config.xml Lustre configuration in xml format.
52 --get <url> URL to fetch a config file
53 --node <nodename> Load config for <nodename>
54 -d | --cleanup Cleans up config. (Shutdown)
55 -f | --force Forced unmounting and/or obd detach during cleanup
56 -v | --verbose Print system commands as they are run
57 -h | --help Print this help
58 --gdb Prints message after creating gdb module script
59 and sleeps for 5 seconds.
60 -n | --noexec Prints the commands and steps that will be run for a
61 config without executing them. This can used to check if a
62 config file is doing what it should be doing. (Implies -v)
63 --nomod Skip load/unload module step.
64 --nosetup Skip device setup/cleanup step.
65 --reformat Reformat all devices (without question)
66 --dump <file> Dump the kernel debug log before portals is unloaded
67 --startlevel <num> Specify the level of services to start with (default 0)
68 --endlevel <num> Specify the level of services to end with (default 100)
69 Levels are aproximatly like:
79 --ldap server LDAP server with lustre config database
80 --makeldiff Translate xml source to LDIFF
81 This are perhaps not needed:
82 --lustre="src dir" Base directory of lustre sources. Used to search
84 --portals=src Portals source
88 # ============================================================
89 # Config parameters, encapsulated in a class
105 self._gdb_script = '/tmp/ogdb'
106 self._debug_path = '/tmp/lustre-log'
107 self._dump_file = None
109 self._start_level = 0
110 self._end_level = 100
112 def verbose(self, flag = None):
113 if flag: self._verbose = flag
116 def noexec(self, flag = None):
117 if flag: self._noexec = flag
def reformat(self, flag = None):
    """Combined getter/setter: a truthy flag updates the reformat setting,
    and the current value is always returned."""
    if flag:
        self._reformat = flag
    return self._reformat
124 def cleanup(self, flag = None):
125 if flag: self._cleanup = flag
128 def gdb(self, flag = None):
129 if flag: self._gdb = flag
132 def nomod(self, flag = None):
133 if flag: self._nomod = flag
136 def nosetup(self, flag = None):
137 if flag: self._nosetup = flag
140 def force(self, flag = None):
141 if flag: self._force = flag
144 def node(self, val = None):
145 if val: self._node = val
148 def url(self, val = None):
149 if val: self._url = val
152 def gdb_script(self):
153 if os.path.isdir('/r'):
154 return '/r' + self._gdb_script
156 return self._gdb_script
158 def debug_path(self):
159 if os.path.isdir('/r'):
160 return '/r' + self._debug_path
162 return self._debug_path
164 def src_dir(self, val = None):
165 if val: self._src_dir = val
def dump_file(self, val = None):
    """Combined getter/setter for the kernel-debug dump file path.
    A truthy val updates the stored path; the current value is returned."""
    if val:
        self._dump_file = val
    return self._dump_file
def startlevel(self, val = None):
    """Combined getter/setter for the first service level to act on.
    A truthy val is coerced to int and stored; the current value is returned."""
    if val:
        self._start_level = int(val)
    return self._start_level
def endlevel(self, val = None):
    """Combined getter/setter for the last service level to act on.
    A truthy val is coerced to int and stored; the current value is returned."""
    if val:
        self._end_level = int(val)
    return self._end_level
184 # ============================================================
185 # debugging and error funcs
def fixme(msg = "this feature"):
    """Abort by raising LconfError reporting that msg is not implemented yet."""
    # Call-style raise for consistency with the other LconfError raises in
    # this file; also fixes the "implmemented" typo in the message.
    raise LconfError(msg + ' not implemented yet.')
191 msg = string.join(map(str,args))
192 if not config.noexec():
193 raise LconfError(msg)
198 msg = string.join(map(str,args))
203 print string.strip(s)
207 msg = string.join(map(str,args))
210 # ============================================================
211 # locally defined exceptions
212 class CommandError (exceptions.Exception):
213 def __init__(self, cmd_name, cmd_err, rc=None):
214 self.cmd_name = cmd_name
215 self.cmd_err = cmd_err
220 if type(self.cmd_err) == types.StringType:
222 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
224 print "! %s: %s" % (self.cmd_name, self.cmd_err)
225 elif type(self.cmd_err) == types.ListType:
227 print "! %s (error %d):" % (self.cmd_name, self.rc)
229 print "! %s:" % (self.cmd_name)
230 for s in self.cmd_err:
231 print "> %s" %(string.strip(s))
235 class LconfError (exceptions.Exception):
236 def __init__(self, args):
240 # ============================================================
241 # handle lctl interface
244 Manage communication with lctl
247 def __init__(self, cmd):
249 Initialize close by finding the lctl binary.
251 self.lctl = find_prog(cmd)
254 debug('! lctl not found')
257 raise CommandError('lctl', "unable to find lctl binary.")
262 the cmds are written to stdin of lctl
263 lctl doesn't return errors when run in script mode, so
265 should modify command line to accept multiple commands, or
266 create complex command line options
268 debug("+", self.lctl, cmds)
269 if config.noexec(): return (0, [])
270 p = popen2.Popen3(self.lctl, 1)
271 p.tochild.write(cmds + "\n")
273 out = p.fromchild.readlines()
274 err = p.childerr.readlines()
276 if os.WIFEXITED(ret):
277 rc = os.WEXITSTATUS(ret)
281 raise CommandError(self.lctl, err, rc)
284 def runcmd(self, *args):
286 run lctl using the command line
288 cmd = string.join(map(str,args))
289 rc, out = run(self.lctl, cmd)
291 raise CommandError(self.lctl, out, rc)
295 def network(self, net, nid):
296 """ initialized network and add "self" """
297 # Idea: "mynid" could be used for all network types to add "self," and then
298 # this special case would be gone and the "self" hack would be hidden.
304 quit""" % (net, nid, nid)
313 # create a new connection
314 def connect(self, net, nid, port, servuuid, send_mem, recv_mem):
322 quit""" % (net, servuuid, nid, send_mem, recv_mem, nid, port, )
328 quit""" % (net, servuuid, nid, nid, port, )
332 # add a route to a range
333 def add_route(self, net, gw, lo, hi):
337 quit """ % (net, gw, lo, hi)
341 def del_route(self, net, gw, lo, hi):
349 # add a route to a host
350 def add_route_host(self, net, uuid, gw, tgt):
355 quit """ % (net, uuid, tgt, gw, tgt)
358 # add a route to a range
359 def del_route_host(self, net, uuid, gw, tgt):
365 quit """ % (net, uuid, tgt)
368 # disconnect one connection
369 def disconnect(self, net, nid, port, servuuid):
375 quit""" % (net, nid, servuuid)
379 def disconnectAll(self, net):
388 # create a new device with lctl
389 def newdev(self, attach, setup = ""):
394 quit""" % (attach, setup)
398 def cleanup(self, name, uuid):
404 quit""" % (name, ('', 'force')[config.force()])
408 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist):
412 lov_setconfig %s %d %d %d %s %s
413 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
417 def dump(self, dump_file):
420 quit""" % (dump_file)
423 # get list of devices
424 def device_list(self):
425 rc, out = self.runcmd('device_list')
428 # ============================================================
429 # Various system-level functions
430 # (ideally moved to their own module)
432 # Run a command and return the output and status.
433 # stderr is sent to /dev/null, could use popen3 to
434 # save it if necessary
436 cmd = string.join(map(str,args))
438 if config.noexec(): return (0, [])
439 f = os.popen(cmd + ' 2>&1')
448 # Run a command in the background.
449 def run_daemon(*args):
450 cmd = string.join(map(str,args))
452 if config.noexec(): return 0
453 f = os.popen(cmd + ' 2>&1')
461 # Determine full path to use for an external command
462 # searches dirname(argv[0]) first, then PATH
464 syspath = string.split(os.environ['PATH'], ':')
465 cmdpath = os.path.dirname(sys.argv[0])
466 syspath.insert(0, cmdpath);
467 syspath.insert(0, os.path.join(cmdpath, '../../portals/linux/utils/'))
469 prog = os.path.join(d,cmd)
470 if os.access(prog, os.X_OK):
474 # Recursively look for file starting at base dir
475 def do_find_file(base, mod):
476 fullname = os.path.join(base, mod)
477 if os.access(fullname, os.R_OK):
479 for d in os.listdir(base):
480 dir = os.path.join(base,d)
481 if os.path.isdir(dir):
482 module = do_find_file(dir, mod)
486 def find_module(src_dir, dev_dir, modname):
487 mod = '%s.o' % (modname)
488 module = src_dir +'/'+ dev_dir +'/'+ mod
490 if os.access(module, os.R_OK):
496 # is the path a block device?
503 return stat.S_ISBLK(s[stat.ST_MODE])
505 # build fs according to type
507 def mkfs(fstype, dev):
508 if(fstype in ('ext3', 'extN')):
509 mkfs = 'mkfs.ext2 -j -b 4096'
511 print 'unsupported fs type: ', fstype
512 if not is_block(dev):
516 (ret, out) = run (mkfs, force, dev)
518 panic("Unable to build fs:", dev)
519 # enable hash tree indexing on fsswe
520 # FIXME: this check can probably go away on 2.5
522 htree = 'echo "feature FEATURE_C5" | debugfs -w'
523 (ret, out) = run (htree, dev)
525 panic("Unable to enable htree:", dev)
527 # some systems use /dev/loopN, some /dev/loop/N
531 if not os.access(loop + str(0), os.R_OK):
533 if not os.access(loop + str(0), os.R_OK):
534 panic ("can't access loop devices")
537 # find loop device assigned to thefile
540 for n in xrange(0, MAX_LOOP_DEVICES):
542 if os.access(dev, os.R_OK):
543 (stat, out) = run('losetup', dev)
544 if (out and stat == 0):
545 m = re.search(r'\((.*)\)', out[0])
546 if m and file == m.group(1):
552 # create file if necessary and assign the first free loop device
553 def init_loop(file, size, fstype):
554 dev = find_loop(file)
556 print 'WARNING file:', file, 'already mapped to', dev
558 if config.reformat() or not os.access(file, os.R_OK | os.W_OK):
559 run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, file))
561 # find next free loop
562 for n in xrange(0, MAX_LOOP_DEVICES):
564 if os.access(dev, os.R_OK):
565 (stat, out) = run('losetup', dev)
567 run('losetup', dev, file)
570 print "out of loop devices"
572 print "out of loop devices"
575 # undo loop assignment
576 def clean_loop(file):
577 dev = find_loop(file)
579 ret, out = run('losetup -d', dev)
581 log('unable to clean loop device:', dev, 'for file:', file)
584 # determine if dev is formatted as a <fstype> filesystem
585 def need_format(fstype, dev):
586 # FIXME don't know how to implement this
589 # initialize a block device if needed
590 def block_dev(dev, size, fstype, format):
591 if config.noexec(): return dev
592 if not is_block(dev):
593 dev = init_loop(dev, size, fstype)
594 if config.reformat() or (need_format(fstype, dev) and format == 'yes'):
598 # panic("device:", dev,
599 # "not prepared, and autoformat is not set.\n",
600 # "Rerun with --reformat option to format ALL filesystems")
605 """lookup IP address for an interface"""
606 rc, out = run("/sbin/ifconfig", iface)
609 addr = string.split(out[1])[1]
610 ip = string.split(addr, ':')[1]
613 def get_local_address(net_type, wildcard):
614 """Return the local address for the network type."""
616 if net_type == 'tcp':
618 iface, star = string.split(wildcard, ':')
619 local = if2addr(iface)
621 panic ("unable to determine ip for:", wildcard)
623 host = socket.gethostname()
624 local = socket.gethostbyname(host)
625 elif net_type == 'elan':
626 # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
628 fp = open('/proc/elan/device0/position', 'r')
629 lines = fp.readlines()
638 elif net_type == 'gm':
639 fixme("automatic local address for GM")
643 def is_prepared(uuid):
644 """Return true if a device exists for the uuid"""
645 # expect this format:
646 # 1 UP ldlm ldlm ldlm_UUID 2
647 out = lctl.device_list()
649 if uuid == string.split(s)[4]:
654 # ============================================================
655 # Classes to prepare and cleanup the various objects
658 """ Base class for the rest of the modules. The default cleanup method is
659 defined here, as well as some utilitiy funcs.
661 def __init__(self, module_name, dom_node):
662 self.dom_node = dom_node
663 self.module_name = module_name
664 self.name = get_attr(dom_node, 'name')
665 self.uuid = get_attr(dom_node, 'uuid')
666 self.kmodule_list = []
def info(self, *args):
    """Print a status line identifying this module (type, name, uuid)
    followed by the stringified args joined with spaces."""
    msg = string.join(map(str,args))
    print self.module_name + ":", self.name, self.uuid, msg
675 def lookup_server(self, srv_uuid):
676 """ Lookup a server's network information """
677 net = get_ost_net(self.dom_node.parentNode, srv_uuid)
679 panic ("Unable to find a server for:", srv_uuid)
680 self._server = Network(net)
682 def get_server(self):
686 """ default cleanup, used for most modules """
688 srv = self.get_server()
689 if srv and local_net(srv):
691 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
692 except CommandError, e:
693 log(self.module_name, "disconnect failed: ", self.name)
697 lctl.cleanup(self.name, self.uuid)
698 except CommandError, e:
699 log(self.module_name, "cleanup failed: ", self.name)
703 def add_module(self, dev_dir, modname):
704 """Append a module to list of modules to load."""
705 self.kmodule_list.append((dev_dir, modname))
707 def mod_loaded(self, modname):
708 """Check if a module is already loaded. Look in /proc/modules for it."""
709 fp = open('/proc/modules')
710 lines = fp.readlines()
712 # please forgive my tired fingers for this one
713 ret = filter(lambda word, mod=modname: word == mod,
714 map(lambda line: string.split(line)[0], lines))
717 def load_module(self):
718 """Load all the modules in the list in the order they appear."""
719 for dev_dir, mod in self.kmodule_list:
720 # (rc, out) = run ('/sbin/lsmod | grep -s', mod)
721 if self.mod_loaded(mod) and not config.noexec():
723 log ('loading module:', mod)
725 module = find_module(config.src_dir(),dev_dir, mod)
727 panic('module not found:', mod)
728 (rc, out) = run('/sbin/insmod', module)
730 raise CommandError('insmod', out, rc)
732 (rc, out) = run('/sbin/modprobe', mod)
734 raise CommandError('modprobe', out, rc)
736 def cleanup_module(self):
737 """Unload the modules in the list in reverse order."""
738 rev = self.kmodule_list
740 for dev_dir, mod in rev:
741 if not self.mod_loaded(mod):
744 if mod == 'portals' and config.dump_file():
745 lctl.dump(config.dump_file())
746 log('unloading module:', mod)
749 (rc, out) = run('/sbin/rmmod', mod)
751 log('! unable to unload module:', mod)
755 class Network(Module):
756 def __init__(self,dom_node):
757 Module.__init__(self, 'NETWORK', dom_node)
758 self.net_type = get_attr(dom_node,'type')
759 self.nid = get_text(dom_node, 'server', '*')
760 self.port = get_text_int(dom_node, 'port', 0)
761 self.send_mem = get_text_int(dom_node, 'send_mem', DEFAULT_TCPBUF)
762 self.recv_mem = get_text_int(dom_node, 'recv_mem', DEFAULT_TCPBUF)
764 self.nid = get_local_address(self.net_type, self.nid)
766 panic("unable to set nid for", self.net_type, self.nid)
767 debug("nid:", self.nid)
769 self.add_module('portals/linux/oslib/', 'portals')
770 if node_needs_router():
771 self.add_module('portals/linux/router', 'kptlrouter')
772 if self.net_type == 'tcp':
773 self.add_module('portals/linux/socknal', 'ksocknal')
774 if self.net_type == 'elan':
775 self.add_module('portals/linux/rqswnal', 'kqswnal')
776 if self.net_type == 'gm':
777 self.add_module('portals/linux/gmnal', 'kgmnal')
778 self.add_module('lustre/obdclass', 'obdclass')
779 self.add_module('lustre/ptlrpc', 'ptlrpc')
782 self.info(self.net_type, self.nid, self.port)
783 if self.net_type in ('tcp', 'toe'):
784 nal_id = '' # default is socknal
785 if self.net_type == 'toe':
787 ret, out = run(TCP_ACCEPTOR, '-s', self.send_mem, '-r', self.recv_mem, nal_id, self.port)
789 raise CommandError(TCP_ACCEPTOR, out, ret)
790 ret = self.dom_node.getElementsByTagName('route_tbl')
792 for r in a.getElementsByTagName('route'):
793 net_type = get_attr(r, 'type')
794 gw = get_attr(r, 'gw')
795 lo = get_attr(r, 'lo')
796 hi = get_attr(r,'hi', '')
797 lctl.add_route(net_type, gw, lo, hi)
798 if net_type == 'tcp' and net_type == self.net_type and hi == '':
799 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
801 panic("no server for nid", lo)
803 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
806 lctl.network(self.net_type, self.nid)
807 lctl.newdev(attach = "ptlrpc RPCDEV RPCDEV_UUID")
810 self.info(self.net_type, self.nid, self.port)
811 ret = self.dom_node.getElementsByTagName('route_tbl')
813 for r in a.getElementsByTagName('route'):
814 lo = get_attr(r, 'lo')
815 hi = get_attr(r,'hi', '')
816 if self.net_type == 'tcp' and hi == '':
817 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
819 panic("no server for nid", lo)
822 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
823 except CommandError, e:
824 print "disconnect failed: ", self.name
828 lctl.del_route(self.net_type, self.nid, lo, hi)
829 except CommandError, e:
830 print "del_route failed: ", self.name
835 lctl.cleanup("RPCDEV", "RPCDEV_UUID")
836 except CommandError, e:
837 print "cleanup failed: ", self.name
841 lctl.disconnectAll(self.net_type)
842 except CommandError, e:
843 print "disconnectAll failed: ", self.name
846 if self.net_type == 'tcp':
847 # yikes, this ugly! need to save pid in /var/something
848 run("killall acceptor")
851 def __init__(self,dom_node):
852 Module.__init__(self, 'LDLM', dom_node)
853 self.add_module('lustre/ldlm', 'ldlm')
856 lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid),
860 def __init__(self,dom_node):
861 Module.__init__(self, 'LOV', dom_node)
862 self.mds_uuid = get_first_ref(dom_node, 'mds')
863 mds= lookup(dom_node.parentNode, self.mds_uuid)
864 self.mds_name = getName(mds)
865 devs = dom_node.getElementsByTagName('devices')
868 self.stripe_sz = get_attr_int(dev_node, 'stripesize', 65536)
869 self.stripe_off = get_attr_int(dev_node, 'stripeoffset', 0)
870 self.pattern = get_attr_int(dev_node, 'pattern', 0)
871 self.devlist = get_all_refs(dev_node, 'osc')
872 self.stripe_cnt = get_attr_int(dev_node, 'stripecount', len(self.devlist))
873 self.add_module('lustre/mdc', 'mdc')
874 self.add_module('lustre/lov', 'lov')
877 for osc_uuid in self.devlist:
878 osc = lookup(self.dom_node.parentNode, osc_uuid)
883 panic('osc not found:', osc_uuid)
884 mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
885 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
886 self.stripe_off, self.pattern, self.devlist, self.mds_name)
887 lctl.newdev(attach="lov %s %s" % (self.name, self.uuid),
888 setup ="%s" % (mdc_uuid))
891 for osc_uuid in self.devlist:
892 osc = lookup(self.dom_node.parentNode, osc_uuid)
897 panic('osc not found:', osc_uuid)
899 cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
902 def load_module(self):
903 for osc_uuid in self.devlist:
904 osc = lookup(self.dom_node.parentNode, osc_uuid)
910 panic('osc not found:', osc_uuid)
911 Module.load_module(self)
914 def cleanup_module(self):
915 Module.cleanup_module(self)
916 for osc_uuid in self.devlist:
917 osc = lookup(self.dom_node.parentNode, osc_uuid)
923 panic('osc not found:', osc_uuid)
925 class LOVConfig(Module):
926 def __init__(self,dom_node):
927 Module.__init__(self, 'LOVConfig', dom_node)
928 self.lov_uuid = get_first_ref(dom_node, 'lov')
929 l = lookup(dom_node.parentNode, self.lov_uuid)
934 self.info(lov.mds_uuid, lov.stripe_cnt, lov.stripe_sz, lov.stripe_off,
935 lov.pattern, lov.devlist, lov.mds_name)
936 lctl.lov_setconfig(lov.uuid, lov.mds_name, lov.stripe_cnt,
937 lov.stripe_sz, lov.stripe_off, lov.pattern,
938 string.join(lov.devlist))
946 def __init__(self,dom_node):
947 Module.__init__(self, 'MDS', dom_node)
948 self.devname, self.size = get_device(dom_node)
949 self.fstype = get_text(dom_node, 'fstype')
950 # FIXME: if fstype not set, then determine based on kernel version
951 self.format = get_text(dom_node, 'autoformat', "no")
952 if self.fstype == 'extN':
953 self.add_module('lustre/extN', 'extN')
954 self.add_module('lustre/mds', 'mds')
955 self.add_module('lustre/mds', 'mds_%s' % (self.fstype))
958 self.info(self.devname, self.fstype, self.format)
959 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
960 if not is_prepared('MDT_UUID'):
961 lctl.newdev(attach="mdt %s %s" % ('MDT', 'MDT_UUID'),
963 lctl.newdev(attach="mds %s %s" % (self.name, self.uuid),
964 setup ="%s %s" %(blkdev, self.fstype))
966 if is_prepared('MDT_UUID'):
968 lctl.cleanup("MDT", "MDT_UUID")
969 except CommandError, e:
970 print "cleanup failed: ", self.name
974 clean_loop(self.devname)
976 # Very unusual case, as there is no MDC element in the XML anymore
977 # Builds itself from an MDS node
979 def __init__(self,dom_node):
980 self.mds = MDS(dom_node)
981 self.dom_node = dom_node
982 self.module_name = 'MDC'
983 self.kmodule_list = []
987 host = socket.gethostname()
988 self.name = 'MDC_%s_%s' % ( host, self.mds.name )
989 self.uuid = self.name + '_UUID'
991 self.lookup_server(self.mds.uuid)
992 self.add_module('lustre/mdc', 'mdc')
995 self.info(self.mds.uuid)
996 srv = self.get_server()
997 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
998 lctl.newdev(attach="mdc %s %s" % (self.name, self.uuid),
999 setup ="%s %s" %(self.mds.uuid, srv.uuid))
1002 def __init__(self, dom_node):
1003 Module.__init__(self, 'OBD', dom_node)
1004 self.obdtype = get_attr(dom_node, 'type')
1005 self.devname, self.size = get_device(dom_node)
1006 self.fstype = get_text(dom_node, 'fstype')
1007 # FIXME: if fstype not set, then determine based on kernel version
1008 self.format = get_text(dom_node, 'autoformat', 'yes')
1009 if self.fstype == 'extN':
1010 self.add_module('lustre/extN', 'extN')
1011 self.add_module('lustre/' + self.obdtype, self.obdtype)
1013 # need to check /proc/mounts and /etc/mtab before
1014 # formatting anything.
1015 # FIXME: check if device is already formatted.
1017 self.info(self.obdtype, self.devname, self.size, self.fstype, self.format)
1018 if self.obdtype == 'obdecho':
1021 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
1022 lctl.newdev(attach="%s %s %s" % (self.obdtype, self.name, self.uuid),
1023 setup ="%s %s" %(blkdev, self.fstype))
1025 Module.cleanup(self)
1026 if not self.obdtype == 'obdecho':
1027 clean_loop(self.devname)
1030 def __init__(self,dom_node):
1031 Module.__init__(self, 'OST', dom_node)
1032 self.obd_uuid = get_first_ref(dom_node, 'obd')
1033 self.add_module('lustre/ost', 'ost')
1036 self.info(self.obd_uuid)
1037 lctl.newdev(attach="ost %s %s" % (self.name, self.uuid),
1038 setup ="%s" % (self.obd_uuid))
1041 # virtual interface for OSC and LOV
1043 def __init__(self,dom_node):
1044 Module.__init__(self, 'VOSC', dom_node)
1045 if dom_node.nodeName == 'lov':
1046 self.osc = LOV(dom_node)
1048 self.osc = OSC(dom_node)
1053 def load_module(self):
1054 self.osc.load_module()
1055 def cleanup_module(self):
1056 self.osc.cleanup_module()
1060 def __init__(self,dom_node):
1061 Module.__init__(self, 'OSC', dom_node)
1062 self.obd_uuid = get_first_ref(dom_node, 'obd')
1063 self.ost_uuid = get_first_ref(dom_node, 'ost')
1064 self.lookup_server(self.ost_uuid)
1065 self.add_module('lustre/osc', 'osc')
1068 self.info(self.obd_uuid, self.ost_uuid)
1069 srv = self.get_server()
1071 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
1075 lctl.add_route_host(r[0], srv.uuid, r[1], r[2])
1077 panic ("no route to", srv.nid)
1079 lctl.newdev(attach="osc %s %s" % (self.name, self.uuid),
1080 setup ="%s %s" %(self.obd_uuid, srv.uuid))
1083 srv = self.get_server()
1085 Module.cleanup(self)
1087 self.info(self.obd_uuid, self.ost_uuid)
1091 lctl.del_route_host(r[0], srv.uuid, r[1], r[2])
1092 except CommandError, e:
1093 print "del_route failed: ", self.name
1096 Module.cleanup(self)
1099 class Mountpoint(Module):
1100 def __init__(self,dom_node):
1101 Module.__init__(self, 'MTPT', dom_node)
1102 self.path = get_text(dom_node, 'path')
1103 self.mds_uuid = get_first_ref(dom_node, 'mds')
1104 self.lov_uuid = get_first_ref(dom_node, 'osc')
1105 self.add_module('lustre/mdc', 'mdc')
1106 self.add_module('lustre/llite', 'llite')
1107 l = lookup(self.dom_node.parentNode, self.lov_uuid)
1112 mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
1113 self.info(self.path, self.mds_uuid, self.lov_uuid)
1114 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \
1115 (self.lov_uuid, mdc_uuid, self.path)
1116 run("mkdir", self.path)
1119 panic("mount failed:", self.path)
1122 self.info(self.path, self.mds_uuid,self.lov_uuid)
1124 (rc, out) = run("umount -f", self.path)
1126 (rc, out) = run("umount", self.path)
1128 log("umount failed, cleanup will most likely not work.")
1129 l = lookup(self.dom_node.parentNode, self.lov_uuid)
1131 cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)
1133 def load_module(self):
1134 self.osc.load_module()
1135 Module.load_module(self)
1136 def cleanup_module(self):
1137 Module.cleanup_module(self)
1138 self.osc.cleanup_module()
1141 # ============================================================
1142 # XML processing and query
# TODO: Change query funcs to use XPath, which is much cleaner
1145 def get_device(obd):
1146 list = obd.getElementsByTagName('device')
1150 size = get_attr_int(dev, 'size', 0)
1151 return dev.firstChild.data, size
1154 # Get the text content from the first matching child
1155 # If there is no content (or it is all whitespace), return
1157 def get_text(dom_node, tag, default=""):
1158 list = dom_node.getElementsByTagName(tag)
1161 dom_node.normalize()
1162 if dom_node.firstChild:
1163 txt = string.strip(dom_node.firstChild.data)
1168 def get_text_int(dom_node, tag, default=0):
1169 list = dom_node.getElementsByTagName(tag)
1173 dom_node.normalize()
1174 if dom_node.firstChild:
1175 txt = string.strip(dom_node.firstChild.data)
1180 panic("text value is not integer:", txt)
1183 def get_attr(dom_node, attr, default=""):
1184 v = dom_node.getAttribute(attr)
1189 def get_attr_int(dom_node, attr, default=0):
1191 v = dom_node.getAttribute(attr)
1196 panic("attr value is not integer", v)
1199 def get_first_ref(dom_node, tag):
1200 """ Get the first uuidref of the type TAG. Used one only
1201 one is expected. Returns the uuid."""
1203 refname = '%s_ref' % tag
1204 list = dom_node.getElementsByTagName(refname)
1206 uuid = getRef(list[0])
1209 def get_all_refs(dom_node, tag):
1210 """ Get all the refs of type TAG. Returns list of uuids. """
1212 refname = '%s_ref' % tag
1213 list = dom_node.getElementsByTagName(refname)
1216 uuids.append(getRef(i))
1219 def get_ost_net(dom_node, uuid):
1220 ost = lookup(dom_node, uuid)
1221 uuid = get_first_ref(ost, 'network')
1224 return lookup(dom_node, uuid)
1226 def nid2server(dom_node, nid):
1227 netlist = dom_node.getElementsByTagName('network')
1228 for net_node in netlist:
1229 if get_text(net_node, 'server') == nid:
1230 return Network(net_node)
1233 def lookup(dom_node, uuid):
1234 for n in dom_node.childNodes:
1235 if n.nodeType == n.ELEMENT_NODE:
1236 if getUUID(n) == uuid:
1243 # Get name attribute of dom_node
1244 def getName(dom_node):
1245 return dom_node.getAttribute('name')
def getRef(dom_node):
    """Return the node's 'uuidref' attribute (empty string when absent)."""
    ref = dom_node.getAttribute('uuidref')
    return ref
# Get uuid attribute of dom_node
def getUUID(dom_node):
    """Return the node's 'uuid' attribute (empty string when absent)."""
    uuid = dom_node.getAttribute('uuid')
    return uuid
# The element's tag name doubles as the service type.
# fixme: this should do some checks to make sure the dom_node is a service
def getServiceType(dom_node):
    """Return the service type of dom_node, i.e. its element tag name."""
    return dom_node.nodeName
1260 # determine what "level" a particular node is at.
# the order of initialization is based on level.
1262 def getServiceLevel(dom_node):
1263 type = getServiceType(dom_node)
1265 if type in ('network',):
1267 elif type in ('device', 'ldlm'):
1269 elif type in ('obd', 'mdd'):
1271 elif type in ('mds','ost'):
1273 elif type in ('mdc','osc'):
1275 elif type in ('lov', 'lovconfig'):
1277 elif type in ('mountpoint',):
1280 if ret < config.startlevel() or ret > config.endlevel():
1285 # return list of services in a profile. list is a list of tuples
1286 # [(level, dom_node),]
1287 def getServices(lustreNode, profileNode):
1289 for n in profileNode.childNodes:
1290 if n.nodeType == n.ELEMENT_NODE:
1291 servNode = lookup(lustreNode, getRef(n))
1294 panic('service not found: ' + getRef(n))
1295 level = getServiceLevel(servNode)
1297 list.append((level, servNode))
1301 def getByName(lustreNode, name, tag):
1302 ndList = lustreNode.getElementsByTagName(tag)
1304 if getName(nd) == name:
1309 ############################################################
1311 # FIXME: clean this mess up!
1314 def prepare_mdc(dom_node, mds_uuid):
1316 mds_node = lookup(dom_node, mds_uuid);
1318 panic("no mds:", mds_uuid)
1319 if saved_mdc.has_key(mds_uuid):
1320 return saved_mdc[mds_uuid]
1323 saved_mdc[mds_uuid] = mdc.uuid
1327 def cleanup_mdc(dom_node, mds_uuid):
1329 mds_node = lookup(dom_node, mds_uuid);
1331 panic("no mds:", mds_uuid)
1339 ############################################################
1340 # routing ("rooting")
1346 def init_node(dom_node):
1347 global local_node, router_flag
1348 netlist = dom_node.getElementsByTagName('network')
1349 for dom_net in netlist:
1350 type = get_attr(dom_net, 'type')
1351 gw = get_text(dom_net, 'server')
1352 local_node.append((type, gw))
1354 def node_needs_router():
1357 def get_routes(type, gw, dom_net):
1358 """ Return the routes as a list of tuples of the form:
1359 [(type, gw, lo, hi),]"""
1361 tbl = dom_net.getElementsByTagName('route_tbl')
1363 routes = t.getElementsByTagName('route')
1365 lo = get_attr(r, 'lo')
1366 hi = get_attr(r, 'hi', '')
1367 res.append((type, gw, lo, hi))
1371 def init_route_config(lustre):
1372 """ Scan the lustre config looking for routers. Build list of
1374 global routes, router_flag
1376 list = lustre.getElementsByTagName('node')
1378 if get_attr(node, 'router'):
1380 for (local_type, local_nid) in local_node:
1382 netlist = node.getElementsByTagName('network')
1383 for dom_net in netlist:
1384 if local_type == get_attr(dom_net, 'type'):
1385 gw = get_text(dom_net, 'server')
1389 for dom_net in netlist:
1390 if local_type != get_attr(dom_net, 'type'):
1391 for route in get_routes(local_type, gw, dom_net):
1392 routes.append(route)
1397 for iface in local_node:
1398 if net.net_type == iface[0]:
1402 def find_route(net):
1403 global local_node, routes
1404 frm_type = local_node[0][0]
1405 to_type = net.net_type
1407 debug ('looking for route to', to_type,to)
1416 ############################################################
1419 def startService(dom_node, module_flag):
1420 type = getServiceType(dom_node)
1421 debug('Service:', type, getName(dom_node), getUUID(dom_node))
1422 # there must be a more dynamic way of doing this...
1428 elif type == 'lovconfig':
1429 n = LOVConfig(dom_node)
1430 elif type == 'network':
1431 n = Network(dom_node)
1442 elif type == 'mountpoint':
1443 n = Mountpoint(dom_node)
1445 panic ("unknown service type:", type)
1450 if config.cleanup():
1455 if config.nosetup():
1457 if config.cleanup():
1463 # Prepare the system to run lustre using a particular profile
1464 # in the configuration.
1465 # * load the modules
1466 # * setup networking for the current node
1467 # * make sure partitions are in place and prepared
1468 # * initialize devices with lctl
1469 # Levels is important, and needs to be enforced.
1470 def startProfile(lustreNode, profileNode, module_flag):
# A missing profile is fatal (the guard on line 1471 is not shown).
1472 panic("profile:", profile, "not found.")
1473 services = getServices(lustreNode, profileNode)
# On cleanup the service list is presumably processed in reverse
# (lines 1475-1476 missing); otherwise services start in listed order.
1474 if config.cleanup():
1477 startService(s[1], module_flag)
# Configure (or clean up) lustre for the first matching host entry.
1482 def doHost(lustreNode, hosts):
# Find the first <node> whose name matches one of our candidates.
1486 dom_node = getByName(lustreNode, h, 'node')
1491 print 'No host entry found.'
# Non-router nodes take an extra setup step here (details on missing
# lines 1495, 1497-1500); routing tables are built in either case.
1494 if not get_attr(dom_node, 'router'):
1496 init_route_config(lustreNode)
1501 # Two step process: (1) load modules, (2) setup lustre
1502 # if not cleaning, load modules first.
1503 module_flag = not config.cleanup()
1504 reflist = dom_node.getElementsByTagName('profile')
1505 for profile in reflist:
1506 startProfile(lustreNode, profile, module_flag)
# After the module pass (when starting up), record the debug path and
# generate the gdb helper script via lctl.
1508 if not config.cleanup():
1509 sys_set_debug_path()
1510 script = config.gdb_script()
1511 run(lctl.lctl, ' modules >', script)
1513 # dump /tmp/ogdb and sleep/pause here
1514 log ("The GDB module script is in", script)
# Second pass with the flag inverted: modules-then-setup on start,
# the reverse ordering on cleanup.
1517 module_flag = not module_flag
1518 for profile in reflist:
1519 startProfile(lustreNode, profile, module_flag)
1521 ############################################################
1522 # Command line processing
# Parse command line arguments; returns the leftover positional args.
1524 def parse_cmdline(argv):
# Short options mirror the long forms documented in usage().
1525 short_opts = "hdnvf"
1526 long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb",
1527 "portals=", "makeldiff", "cleanup", "noexec",
1528 "help", "node=", "nomod", "nosetup",
1529 "dump=", "force", "startlevel=", "endlevel="]
1533 opts, args = getopt.getopt(argv, short_opts, long_opts)
# Bad options presumably fall through to usage() (handler body on the
# missing lines).
1534 except getopt.error:
# NOTE(review): the action line after each `if` below (setting the
# corresponding config flag) is missing from this excerpt; only
# --startlevel's handler (line 1567) survives.
1539 if o in ("-h", "--help"):
1541 if o in ("-d","--cleanup"):
1543 if o in ("-v", "--verbose"):
1545 if o in ("-n", "--noexec"):
1548 if o == "--portals":
1552 if o == "--reformat":
1560 if o == "--nosetup":
1564 if o in ("-f", "--force"):
1566 if o in ("--startlevel",):
1567 config.startlevel(a)
1568 if o in ("--endlevel",):
# NOTE(review): orphan fragment of what appears to be fetch(url) — the
# def line is not in this excerpt. urllib.urlopen (Python 2) opens the
# remote config URL for reading.
1577 s = urllib.urlopen(url)
# If lconf is being run out of a source tree (a Makefile sits next to
# the executable), point config.src_dir at the tree root two levels up.
1583 def setupModulePath(cmd):
1584 base = os.path.dirname(cmd)
1585 if os.access(base+"/Makefile", os.R_OK):
1586 config.src_dir(base + "/../../")
# Tell the portals layer where to dump its debug log.
1588 def sys_set_debug_path():
1589 debug("debug path: ", config.debug_path())
# NOTE(review): lines 1590-1592 are missing; presumably a noexec guard
# and/or a try: around the /proc write.
1593 fp = open('/proc/sys/portals/debug_path', 'w')
1594 fp.write(config.debug_path())
1599 #/proc/sys/net/core/rmem_max
1600 #/proc/sys/net/core/wmem_max
# Raise a kernel socket-buffer ceiling to at least `max`. The missing
# lines 1603-1609 presumably read the current value and skip the write
# when it is already large enough — confirm in the full file.
1601 def sys_set_netmem_max(path, max):
1602 debug("setting", path, "to at least", max)
1610 fp = open(path, 'w')
1611 fp.write('%d\n' %(max))
def sys_make_devices():
    # Make sure the lustre character device nodes exist (misc major 10).
    # Readability is the test: an unreadable node is as useless as a
    # missing one.
    wanted = (
        ('/dev/portals', 'mknod /dev/portals c 10 240'),
        ('/dev/obd', 'mknod /dev/obd c 10 241'),
    )
    for node, mknod_cmd in wanted:
        if not os.access(node, os.R_OK):
            run(mknod_cmd)
1622 # Add dir to the global PATH, if not already there.
1623 def add_to_path(new_dir):
1624 syspath = string.split(os.environ['PATH'], ':')
# Early exit when already present — the `return` on the missing line
# 1626, presumably; TODO confirm against the full file.
1625 if new_dir in syspath:
1627 os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
# Directories every configuration run should be able to find tools in.
1630 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
1631 # ensure basic elements are in the system path
1632 def sanitise_path():
# NOTE(review): the loop body (lines 1634-1635) is missing; presumably
# each default dir is handed to add_to_path().
1633 for dir in DEFAULT_PATH:
1636 # Initialize or shutdown lustre according to a configuration file
1637 # * prepare the system for lustre
1638 # * configure devices with lctl
1639 # Shutdown does steps in reverse
# NOTE(review): the def line of main() (circa line 1641) is missing
# from this excerpt; the statements below are its body.
1642 global TCP_ACCEPTOR, lctl, MAXTCPBUF
1643 host = socket.gethostname()
1647 args = parse_cmdline(sys.argv[1:])
# Config source: a local XML file argument, else a URL via --get.
1649 if not os.access(args[0], os.R_OK):
1650 print 'File not found or readable:', args[0]
1652 dom = xml.dom.minidom.parse(args[0])
1654 xmldata = fetch(config.url())
1655 dom = xml.dom.minidom.parseString(xmldata)
# Candidate node names: explicit --node value, otherwise the local
# hostname plus 'localhost' as fallbacks.
1661 node_list.append(config.node())
1664 node_list.append(host)
1665 node_list.append('localhost')
1666 debug("configuring for host: ", node_list)
# Suffix per-host so concurrent runs on shared storage don't clobber
# each other's debug/gdb files.
1669 config._debug_path = config._debug_path + '-' + host
1670 config._gdb_script = config._gdb_script + '-' + host
1672 TCP_ACCEPTOR = find_prog('acceptor')
1673 if not TCP_ACCEPTOR:
# A missing acceptor is only a warning in noexec mode (guard on the
# missing line 1674, presumably); otherwise it is fatal.
1675 TCP_ACCEPTOR = 'acceptor'
1676 debug('! acceptor not found')
1678 panic('acceptor not found')
1680 lctl = LCTLInterface('lctl')
1682 setupModulePath(sys.argv[0])
# Bump the kernel socket-buffer ceilings before networking starts.
1684 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
1685 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
1686 doHost(dom.documentElement, node_list)
1688 if __name__ == "__main__":
1691 except LconfError, e:
1693 except CommandError, e:
1697 if first_cleanup_error:
1698 sys.exit(first_cleanup_error)