3 # Copyright (C) 2002 Cluster File Systems, Inc.
4 # Author: Robert Read <rread@clusterfs.com>
5 # This file is part of Lustre, http://www.lustre.org.
7 # Lustre is free software; you can redistribute it and/or
8 # modify it under the terms of version 2 of the GNU General Public
9 # License as published by the Free Software Foundation.
11 # Lustre is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with Lustre; if not, write to the Free Software
18 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 # lconf - lustre configuration tool
22 # lconf is the main driver script for starting and stopping
23 # lustre filesystem services.
25 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import string, os, stat, popen2, socket, time
30 import xml.dom.minidom
# Default socket buffer size (bytes) used for lustre TCP connections.
DEFAULT_TCPBUF = 1048576

# Maximum number of devices to search for.
# (the /dev/loop* nodes need to be created beforehand)
MAX_LOOP_DEVICES = 256
# Exit status of the first cleanup step that failed; zero while no
# cleanup error has been seen.
first_cleanup_error = 0

def cleanup_error(rc):
    """Record *rc* as the process exit status, unless an earlier cleanup
    error was already recorded (the first failure wins)."""
    global first_cleanup_error
    if first_cleanup_error == 0:
        first_cleanup_error = rc
    # NOTE(review): the enclosing "def usage():" header and the closing
    # triple quote fall outside the visible lines; the original column
    # alignment of the help text may have been lost in this copy.
    print """usage: lconf config.xml
config.xml Lustre configuration in xml format.
--get <url> URL to fetch a config file
--node <nodename> Load config for <nodename>
-d | --cleanup Cleans up config. (Shutdown)
-v | --verbose Print system commands as they are run
-h | --help Print this help
--gdb Prints message after creating gdb module script
and sleeps for 5 seconds.
-n | --noexec Prints the commands and steps that will be run for a
config without executing them. This can used to check if a
config file is doing what it should be doing. (Implies -v)
--nomod Skip load/unload module step.
--nosetup Skip device setup/cleanup step.
--reformat Reformat all devices (without question)
--dump Dump the kernel debug log before portals is unloaded
--ldap server LDAP server with lustre config database
--makeldiff Translate xml source to LDIFF
This are perhaps not needed:
--lustre="src dir" Base directory of lustre sources. Used to search
--portals=src Portals source
77 # ============================================================
78 # Config parameters, encapsulated in a class
        # NOTE(review): only part of the Config class is visible here --
        # the class statement, the __init__ header and several trailing
        # "return self._x" lines are elided in this copy.
        self._gdb_script = '/tmp/ogdb'        # gdb module script destination
        self._debug_path = '/tmp/lustre-log'  # kernel debug log destination
        self._dump_file = None                # --dump target; None disables

    # The methods below are combined getter/setters: a true argument
    # stores the new value; the current value is presumably returned
    # (several return lines are not visible in this copy).
    def verbose(self, flag = None):
        if flag: self._verbose = flag

    def noexec(self, flag = None):
        if flag: self._noexec = flag

    def reformat(self, flag = None):
        if flag: self._reformat = flag
        return self._reformat

    def cleanup(self, flag = None):
        if flag: self._cleanup = flag

    def gdb(self, flag = None):
        if flag: self._gdb = flag

    def nomod(self, flag = None):
        if flag: self._nomod = flag

    def nosetup(self, flag = None):
        if flag: self._nosetup = flag

    def node(self, val = None):
        if val: self._node = val

    def url(self, val = None):
        if val: self._url = val

    def gdb_script(self):
        # Prefix the path with /r when that directory exists.
        # NOTE(review): an "else:" line appears to be elided before the
        # second return.
        if os.path.isdir('/r'):
            return '/r' + self._gdb_script
        return self._gdb_script

    def debug_path(self):
        if os.path.isdir('/r'):
            return '/r' + self._debug_path
        return self._debug_path

    def src_dir(self, val = None):
        if val: self._src_dir = val

    def dump_file(self, val = None):
        if val: self._dump_file = val
        return self._dump_file
156 # ============================================================
157 # debugging and error funcs
def fixme(msg = "this feature"):
    """Abort with a 'not implemented' error for *msg*.

    Placeholder for unfinished code paths; always raises LconfError.
    """
    # raise E(arg) is equivalent to the old "raise E, arg" form in
    # Python 2; also fixes the 'implmemented' typo in the message.
    raise LconfError(msg + ' not implemented yet.')
    # NOTE(review): bodies of several small reporting helpers
    # (panic / log / debug style functions); their "def" headers and
    # remaining lines are elided in this copy.
    msg = string.join(map(str,args))
    # panic: fatal unless --noexec is in effect
    if not config.noexec():
        raise LconfError(msg)
    msg = string.join(map(str,args))
    print string.strip(s)
    msg = string.join(map(str,args))
# ============================================================
# locally defined exceptions

# Raised when an external command (lctl, insmod, mount, ...) fails;
# carries the command name, its error output and an optional return code.
class CommandError (exceptions.Exception):
    def __init__(self, cmd_name, cmd_err, rc=None):
        self.cmd_name = cmd_name
        self.cmd_err = cmd_err
        # NOTE(review): the "self.rc = rc" assignment is not visible in
        # this copy, though self.rc is used below.

    # NOTE(review): the header of the error-reporting method is elided;
    # it pretty-prints cmd_err whether it is a plain string or a list of
    # output lines.  The "if self.rc:" / "else:" branch lines around the
    # paired prints below are also elided.
        if type(self.cmd_err) == types.StringType:
            print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
            print "! %s: %s" % (self.cmd_name, self.cmd_err)
        elif type(self.cmd_err) == types.ListType:
            print "! %s (error %d):" % (self.cmd_name, self.rc)
            print "! %s:" % (self.cmd_name)
            for s in self.cmd_err:
                print "> %s" %(string.strip(s))
# Generic lconf failure; the message ends up in .args.
# NOTE(review): the __init__ body (storing args) is elided in this copy.
class LconfError (exceptions.Exception):
    def __init__(self, args):
# ============================================================
# handle lctl interface
# NOTE(review): the "class LCTLInterface:" statement and the run()
# method header are elided; only fragments of the class body appear.
    """
    Manage communication with lctl
    """

    def __init__(self, cmd):
        """
        Initialize self by finding the lctl binary.
        """
        self.lctl = find_prog(cmd)
        # When lctl cannot be located: under --noexec this is only a
        # debug message, otherwise it is fatal (the surrounding if/else
        # lines are elided here).
        debug('! lctl not found')
        raise CommandError('lctl', "unable to find lctl binary.")

    # run(cmds): feed a command script to lctl via stdin.
        """
        the cmds are written to stdin of lctl
        lctl doesn't return errors when run in script mode, so
        should modify command line to accept multiple commands, or
        create complex command line options
        """
        debug("+", self.lctl, cmds)
        if config.noexec(): return (0, [])
        # Popen3 with capturestderr=1 so stderr can be collected too
        p = popen2.Popen3(self.lctl, 1)
        p.tochild.write(cmds + "\n")
        out = p.fromchild.readlines()
        err = p.childerr.readlines()
        # NOTE(review): the wait()/exit-status lines are elided; on a
        # non-zero status, stderr is attached to the CommandError.
        raise CommandError(self.lctl, err, ret)
    # NOTE(review): each method below assembles a small lctl script in a
    # triple-quoted string and runs it through lctl; the string openings
    # ('cmds = """...') and the self.run(cmds) calls are elided in this
    # copy -- only fragments remain.

    def network(self, net, nid):
        """ initialized network and add "self" """
        # Idea: "mynid" could be used for all network types to add "self," and then
        # this special case would be gone and the "self" hack would be hidden.
        quit""" % (net, nid, nid)

    # create a new connection
    def connect(self, net, nid, port, servuuid, send_mem, recv_mem):
        # NOTE(review): two alternative script tails remain -- likely a
        # tcp branch (with buffer sizes) and a non-tcp branch.
        quit""" % (net, servuuid, nid, send_mem, recv_mem, nid, port, )
        quit""" % (net, servuuid, nid, nid, port, )

    # add a route to a range
    def add_route(self, net, gw, lo, hi):
        quit """ % (net, gw, lo, hi)

    # add a route to a range
    def del_route(self, net, gw, lo, hi):

    # add a route to a host
    def add_route_host(self, net, uuid, gw, tgt):
        quit """ % (net, uuid, tgt, gw, tgt)

    # add a route to a range
    def del_route_host(self, net, uuid, gw, tgt):
        quit """ % (net, uuid, tgt)

    # disconnect one connection
    def disconnect(self, net, nid, port, servuuid):
        quit""" % (net, nid, servuuid)

    def disconnectAll(self, net):

    # create a new device with lctl
    def newdev(self, attach, setup = ""):
        quit""" % (attach, setup)

    def cleanup(self, name, uuid):

    def lovconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist):
        lovconfig %s %d %d %d %s %s
        quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)

    def dump(self, dump_file):
        quit""" % (dump_file)
# ============================================================
# Various system-level functions
# (ideally moved to their own module)

# Run a command and return the output and status.
# stderr is sent to /dev/null, could use popen3 to
# save it if necessary
# NOTE(review): the "def run(*args):" header and the read/close/return
# lines are elided in this copy.
    cmd = string.join(map(str,args))
    if config.noexec(): return (0, [])
    f = os.popen(cmd + ' 2>&1')

# Run a command in the background.
def run_daemon(*args):
    cmd = string.join(map(str,args))
    if config.noexec(): return 0
    f = os.popen(cmd + ' 2>&1')
    # NOTE(review): the remainder of run_daemon is elided.

# Determine full path to use for an external command
# searches dirname(argv[0]) first, then PATH
# NOTE(review): the "def find_prog(cmd):" header, the search loop header
# and the return lines are elided.
    syspath = string.split(os.environ['PATH'], ':')
    cmdpath = os.path.dirname(sys.argv[0])
    syspath.insert(0, cmdpath);
    syspath.insert(0, os.path.join(cmdpath, '../../portals/linux/utils/'))
        prog = os.path.join(d,cmd)
        if os.access(prog, os.X_OK):
# Recursively look for file starting at base dir
# NOTE(review): the "return fullname" / "if module: return module" /
# final return lines of do_find_file, and the tail of find_module, are
# elided in this copy.
def do_find_file(base, mod):
    fullname = os.path.join(base, mod)
    if os.access(fullname, os.R_OK):
    for d in os.listdir(base):
        dir = os.path.join(base,d)
        if os.path.isdir(dir):
            # recurse into subdirectories
            module = do_find_file(dir, mod)

def find_module(src_dir, dev_dir, modname):
    # Look for modname.o first in src_dir/dev_dir, then (presumably)
    # fall back to a recursive search -- the fallback lines are elided.
    mod = '%s.o' % (modname)
    module = src_dir +'/'+ dev_dir +'/'+ mod
    if os.access(module, os.R_OK):
# is the path a block device?
# NOTE(review): the "def is_block(dev):" header and the os.stat call are
# elided in this copy.
    return stat.S_ISBLK(s[stat.ST_MODE])

# build fs according to type
# NOTE(review): mkfs is heavily elided -- the else branch, the "force"
# assignment and several guard lines are missing.
def mkfs(fstype, dev):
    if(fstype in ('ext3', 'extN')):
        mkfs = 'mkfs.ext2 -j -b 4096'
        print 'unsupported fs type: ', fstype
    # loop files get -F so mkfs does not prompt (force flag is set in an
    # elided line)
    if not is_block(dev):
    (ret, out) = run (mkfs, force, dev)
        panic("Unable to build fs:", dev)
    # enable hash tree indexing on fs
        htree = 'echo "feature FEATURE_C5" | debugfs -w'
        (ret, out) = run (htree, dev)
            panic("Unable to enable htree:", dev)
# some systems use /dev/loopN, some /dev/loop/N
# NOTE(review): the loop_base() header and the second candidate-path
# assignment are elided in this copy.
    if not os.access(loop + str(0), os.R_OK):
        if not os.access(loop + str(0), os.R_OK):
            panic ("can't access loop devices")

# find loop device assigned to thefile
# NOTE(review): the find_loop(file) header, the "dev = ..." assignment
# and the return lines are elided.
    for n in xrange(0, MAX_LOOP_DEVICES):
        if os.access(dev, os.R_OK):
            # ask losetup which file (if any) is bound to this device
            (stat, out) = run('losetup', dev)
            if (out and stat == 0):
                # losetup prints the backing file in parentheses
                m = re.search(r'\((.*)\)', out[0])
                if m and file == m.group(1):

# create file if necessary and assign the first free loop device
def init_loop(file, size, fstype):
    dev = find_loop(file)
        print 'WARNING file:', file, 'already mapped to', dev
    # (re)create the sparse backing file when reformatting or absent
    if config.reformat() or not os.access(file, os.R_OK | os.W_OK):
        run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, file))
    # find next free loop
    for n in xrange(0, MAX_LOOP_DEVICES):
        if os.access(dev, os.R_OK):
            (stat, out) = run('losetup', dev)
                run('losetup', dev, file)
        print "out of loop devices"
    print "out of loop devices"

# undo loop assignment
def clean_loop(file):
    dev = find_loop(file)
        ret, out = run('losetup -d', dev)
            log('unable to clean loop device:', dev, 'for file:', file)
# determine if dev is formatted as a <fstype> filesystem
def need_format(fstype, dev):
    # FIXME don't know how to implement this
    # NOTE(review): the return line is elided in this copy.

# initialize a block device if needed
def block_dev(dev, size, fstype, format):
    if config.noexec(): return dev
    # not a real block device: back it with a loop device
    if not is_block(dev):
        dev = init_loop(dev, size, fstype)
    if config.reformat() or (need_format(fstype, dev) and format == 'yes'):
        # NOTE(review): the mkfs call and the return line are elided.
#    else:
#        panic("device:", dev,
#              "not prepared, and autoformat is not set.\n",
#              "Rerun with --reformat option to format ALL filesystems")
# NOTE(review): the "def if2addr(iface):" header and the return line are
# elided in this copy.
    """lookup IP address for an interface"""
    rc, out = run("/sbin/ifconfig", iface)
    # parse "inet addr:x.x.x.x" from the second line of ifconfig output
    addr = string.split(out[1])[1]
    ip = string.split(addr, ':')[1]

def get_local_address(net_type, wildcard):
    """Return the local address for the network type."""
    # NOTE(review): several guard/else lines are elided below.
    if net_type == 'tcp':
        # "iface:*" wildcard: take the address of the named interface
        iface, star = string.split(wildcard, ':')
        local = if2addr(iface)
        panic ("unable to determine ip for:", wildcard)
        # otherwise fall back to the hostname's address
        host = socket.gethostname()
        local = socket.gethostbyname(host)
    elif net_type == 'elan':
        # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
        fp = open('/proc/elan/device0/position', 'r')
        lines = fp.readlines()
    elif net_type == 'gm':
        fixme("automatic local address for GM")
# ============================================================
# Classes to prepare and cleanup the various objects
# NOTE(review): the "class Module:" statement and several guard/try
# lines inside the class are elided in this copy.
    """ Base class for the rest of the modules. The default cleanup method is
    defined here, as well as some utilitiy funcs.
    """
    def __init__(self, module_name, dom_node):
        self.dom_node = dom_node
        self.module_name = module_name
        self.name = get_attr(dom_node, 'name')
        self.uuid = get_attr(dom_node, 'uuid')
        self.kmodule_list = []   # (dev_dir, modname) pairs to load/unload

    def info(self, *args):
        # one-line identification of this module plus *args
        msg = string.join(map(str,args))
        print self.module_name + ":", self.name, self.uuid, msg

    def lookup_server(self, srv_uuid):
        """ Lookup a server's network information """
        net = get_ost_net(self.dom_node.parentNode, srv_uuid)
        # NOTE(review): the "if not net:" guard before panic is elided
        panic ("Unable to find a server for:", srv_uuid)
        self._server = Network(net)

    def get_server(self):
        # NOTE(review): body (presumably returning self._server) elided

    # default cleanup: disconnect from the server when local, then tear
    # down the lctl device; failures are logged, not fatal.  The method
    # header and the try: lines around each lctl call are elided.
        """ default cleanup, used for most modules """
        srv = self.get_server()
        if srv and local_net(srv):
            lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
        except CommandError, e:
            log(self.module_name, "disconnect failed: ", self.name)
            lctl.cleanup(self.name, self.uuid)
        except CommandError, e:
            log(self.module_name, "cleanup failed: ", self.name)

    def add_module(self, dev_dir, modname):
        """Append a module to list of modules to load."""
        self.kmodule_list.append((dev_dir, modname))

    def mod_loaded(self, modname):
        """Check if a module is already loaded. Look in /proc/modules for it."""
        fp = open('/proc/modules')
        lines = fp.readlines()
        # please forgive my tired fingers for this one
        ret = filter(lambda word, mod=modname: word == mod,
                     map(lambda line: string.split(line)[0], lines))

    def load_module(self):
        """Load all the modules in the list in the order they appear."""
        for dev_dir, mod in self.kmodule_list:
            # (rc, out) = run ('/sbin/lsmod | grep -s', mod)
            # skip modules that are already loaded
            if self.mod_loaded(mod) and not config.noexec():
            log ('loading module:', mod)
            # with a source dir configured, insmod the freshly built
            # module; otherwise fall back to modprobe
            module = find_module(config.src_dir(),dev_dir, mod)
                panic('module not found:', mod)
            (rc, out) = run('/sbin/insmod', module)
                raise CommandError('insmod', out, rc)
            (rc, out) = run('/sbin/modprobe', mod)
                raise CommandError('modprobe', out, rc)

    def cleanup_module(self):
        """Unload the modules in the list in reverse order."""
        rev = self.kmodule_list
        # NOTE(review): the rev.reverse() call is elided in this copy
        for dev_dir, mod in rev:
            if not self.mod_loaded(mod):
            # dump the debug log before portals disappears
            if mod == 'portals' and config.dump_file():
                lctl.dump(config.dump_file())
            log('unloading module:', mod)
            (rc, out) = run('/sbin/rmmod', mod)
                log('! unable to unload module:', mod)
# NOTE(review): Network's prepare() and cleanup() method headers, the
# try: lines around the lctl calls, and several guards are elided in
# this copy; fragments are reproduced as-is.
class Network(Module):
    def __init__(self,dom_node):
        Module.__init__(self, 'NETWORK', dom_node)
        self.net_type = get_attr(dom_node,'type')
        self.nid = get_text(dom_node, 'server', '*')
        self.port = get_text_int(dom_node, 'port', 0)
        self.send_mem = get_text_int(dom_node, 'send_mem', DEFAULT_TCPBUF)
        self.recv_mem = get_text_int(dom_node, 'recv_mem', DEFAULT_TCPBUF)
        # a '*' nid means "discover my own address for this net type"
        self.nid = get_local_address(self.net_type, self.nid)
        panic("unable to set nid for", self.net_type, self.nid)
        debug("nid:", self.nid)
        # kernel modules needed for this network type
        self.add_module('portals/linux/oslib/', 'portals')
        if node_needs_router():
            self.add_module('portals/linux/router', 'kptlrouter')
        if self.net_type == 'tcp':
            self.add_module('portals/linux/socknal', 'ksocknal')
        if self.net_type == 'elan':
            self.add_module('portals/linux/rqswnal', 'kqswnal')
        if self.net_type == 'gm':
            self.add_module('portals/linux/gmnal', 'kgmnal')
        self.add_module('lustre/obdclass', 'obdclass')
        self.add_module('lustre/ptlrpc', 'ptlrpc')

    # prepare (header elided): start the tcp acceptor, install routes
    # and connections, then register the network with lctl.
        self.info(self.net_type, self.nid, self.port)
        if self.net_type == 'tcp':
            ret, out = run(TCP_ACCEPTOR, '-s', self.send_mem, '-r', self.recv_mem, self.port)
            raise CommandError(TCP_ACCEPTOR, out, ret)
        ret = self.dom_node.getElementsByTagName('route_tbl')
        for r in a.getElementsByTagName('route'):
            net_type = get_attr(r, 'type')
            gw = get_attr(r, 'gw')
            lo = get_attr(r, 'lo')
            hi = get_attr(r,'hi', '')
            lctl.add_route(net_type, gw, lo, hi)
            # for a single-host tcp route on our own net, also connect
            if net_type == 'tcp' and net_type == self.net_type and hi == '':
                srv = nid2server(self.dom_node.parentNode.parentNode, lo)
                panic("no server for nid", lo)
                lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
        lctl.network(self.net_type, self.nid)
        lctl.newdev(attach = "ptlrpc RPCDEV")

    # cleanup (header elided): mirror of prepare -- tear down routes,
    # connections and the acceptor; errors are reported, not fatal.
        self.info(self.net_type, self.nid, self.port)
        ret = self.dom_node.getElementsByTagName('route_tbl')
        for r in a.getElementsByTagName('route'):
            lo = get_attr(r, 'lo')
            hi = get_attr(r,'hi', '')
            if self.net_type == 'tcp' and hi == '':
                srv = nid2server(self.dom_node.parentNode.parentNode, lo)
                panic("no server for nid", lo)
                lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
            except CommandError, e:
                print "disconnect failed: ", self.name
            lctl.del_route(self.net_type, self.nid, lo, hi)
            except CommandError, e:
                print "del_route failed: ", self.name
            lctl.cleanup("RPCDEV", "")
        except CommandError, e:
            print "cleanup failed: ", self.name
            lctl.disconnectAll(self.net_type)
        except CommandError, e:
            print "disconnectAll failed: ", self.name
        if self.net_type == 'tcp':
            # yikes, this ugly! need to save pid in /var/something
            run("killall acceptor")
# NOTE(review): the "class LDLM(Module):" and "class LOV(Module):"
# statements, the prepare/cleanup method headers, and the osc.prepare()/
# cleanup()/load_module() calls inside the loops are elided in this copy.
    def __init__(self,dom_node):
        Module.__init__(self, 'LDLM', dom_node)
        self.add_module('lustre/ldlm', 'ldlm')

    # LDLM prepare (header elided)
        lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid),

    def __init__(self,dom_node):
        Module.__init__(self, 'LOV', dom_node)
        self.mds_uuid = get_first_ref(dom_node, 'mds')
        mds= lookup(dom_node.parentNode, self.mds_uuid)
        self.mds_name = getName(mds)
        devs = dom_node.getElementsByTagName('devices')
        # striping parameters come from the <devices> element
        self.stripe_sz = get_attr_int(dev_node, 'stripesize', 65536)
        self.stripe_off = get_attr_int(dev_node, 'stripeoffset', 0)
        self.pattern = get_attr_int(dev_node, 'pattern', 0)
        self.devlist = get_all_refs(dev_node, 'osc')
        self.stripe_cnt = get_attr_int(dev_node, 'stripecount', len(self.devlist))
        self.add_module('lustre/mdc', 'mdc')
        self.add_module('lustre/lov', 'lov')

    # LOV prepare (header elided): prepare each member OSC, then the
    # MDC, then attach the lov device itself.
        for osc_uuid in self.devlist:
            osc = lookup(self.dom_node.parentNode, osc_uuid)
                panic('osc not found:', osc_uuid)
        mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
        self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
                  self.stripe_off, self.pattern, self.devlist, self.mds_name)
        lctl.newdev(attach="lov %s %s" % (self.name, self.uuid),
                    setup ="%s" % (mdc_uuid))

    # LOV cleanup (header elided): reverse of prepare.
        for osc_uuid in self.devlist:
            osc = lookup(self.dom_node.parentNode, osc_uuid)
                panic('osc not found:', osc_uuid)
        cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)

    def load_module(self):
        for osc_uuid in self.devlist:
            osc = lookup(self.dom_node.parentNode, osc_uuid)
                panic('osc not found:', osc_uuid)
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        for osc_uuid in self.devlist:
            osc = lookup(self.dom_node.parentNode, osc_uuid)
                panic('osc not found:', osc_uuid)
# NOTE(review): the prepare() method headers, the "class MDS(Module):"
# statement and several guard lines are elided in this copy.
class LOVConfig(Module):
    def __init__(self,dom_node):
        Module.__init__(self, 'LOVConfig', dom_node)
        self.lov_uuid = get_first_ref(dom_node, 'lov')
        l = lookup(dom_node.parentNode, self.lov_uuid)

    # LOVConfig prepare (header elided): push the LOV striping
    # description to the MDS via lctl lovconfig.
        self.info(lov.mds_uuid, lov.stripe_cnt, lov.stripe_sz, lov.stripe_off,
                  lov.pattern, lov.devlist, lov.mds_name)
        lctl.lovconfig(lov.uuid, lov.mds_name, lov.stripe_cnt,
                       lov.stripe_sz, lov.stripe_off, lov.pattern,
                       string.join(lov.devlist))

    def __init__(self,dom_node):
        Module.__init__(self, 'MDS', dom_node)
        self.devname, self.size = get_device(dom_node)
        self.fstype = get_text(dom_node, 'fstype')
        # default: do not reformat an existing device
        self.format = get_text(dom_node, 'autoformat', "no")
        if self.fstype == 'extN':
            self.add_module('lustre/extN', 'extN')
        self.add_module('lustre/mds', 'mds')
        self.add_module('lustre/mds', 'mds_%s' % (self.fstype))

    # MDS prepare (header elided): set up the backing device and attach.
        self.info(self.devname, self.fstype, self.format)
        blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
        lctl.newdev(attach="mds %s %s" % (self.name, self.uuid),
                    setup ="%s %s" %(blkdev, self.fstype))

    # MDS cleanup (header elided): release any loop device used.
        clean_loop(self.devname)
# Very unusual case, as there is no MDC element in the XML anymore
# Builds itself from an MDS node
# NOTE(review): the "class MDC(Module):" statement and several lines of
# __init__ and prepare are elided in this copy.
    def __init__(self,dom_node):
        self.mds = MDS(dom_node)
        self.dom_node = dom_node
        self.module_name = 'MDC'
        self.kmodule_list = []
        # synthesize a per-host name/uuid since the XML has no MDC node
        host = socket.gethostname()
        self.name = 'MDC_'+host
        self.uuid = self.name+'_UUID'
        self.lookup_server(self.mds.uuid)
        self.add_module('lustre/mdc', 'mdc')

    # MDC prepare (header elided): connect to the MDS and attach.
        self.info(self.mds.uuid)
        srv = self.get_server()
        lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
        lctl.newdev(attach="mdc %s %s" % (self.name, self.uuid),
                    setup ="%s %s" %(self.mds.uuid, srv.uuid))
# NOTE(review): the "class OBD(Module):" and "class OST(Module):"
# statements and the prepare/cleanup method headers are elided in this
# copy.
    def __init__(self, dom_node):
        Module.__init__(self, 'OBD', dom_node)
        self.obdtype = get_attr(dom_node, 'type')
        self.devname, self.size = get_device(dom_node)
        self.fstype = get_text(dom_node, 'fstype')
        # OBDs default to formatting automatically
        self.format = get_text(dom_node, 'autoformat', 'yes')
        if self.fstype == 'extN':
            self.add_module('lustre/extN', 'extN')
        self.add_module('lustre/' + self.obdtype, self.obdtype)

    # OBD prepare (header elided)
    # need to check /proc/mounts and /etc/mtab before
    # formatting anything.
    # FIXME: check if device is already formatted.
        self.info(self.obdtype, self.devname, self.size, self.fstype, self.format)
        # obdecho needs no backing device
        if self.obdtype == 'obdecho':
            blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
        lctl.newdev(attach="%s %s %s" % (self.obdtype, self.name, self.uuid),
                    setup ="%s %s" %(blkdev, self.fstype))

    # OBD cleanup (header elided)
        if not self.obdtype == 'obdecho':
            clean_loop(self.devname)

    def __init__(self,dom_node):
        Module.__init__(self, 'OST', dom_node)
        self.obd_uuid = get_first_ref(dom_node, 'obd')
        self.add_module('lustre/ost', 'ost')

    # OST prepare (header elided)
        self.info(self.obd_uuid)
        lctl.newdev(attach="ost %s %s" % (self.name, self.uuid),
                    setup ="%s" % (self.obd_uuid))
# virtual interface for OSC and LOV
# NOTE(review): the "class VOSC(Module):" / "class OSC(Module):"
# statements, VOSC's prepare/cleanup delegations, and the try: lines in
# OSC are elided in this copy.
    def __init__(self,dom_node):
        Module.__init__(self, 'VOSC', dom_node)
        # wrap either a LOV or a plain OSC depending on the element
        if dom_node.nodeName == 'lov':
            self.osc = LOV(dom_node)
            self.osc = OSC(dom_node)

    def load_module(self):
        self.osc.load_module()
    def cleanup_module(self):
        self.osc.cleanup_module()

    def __init__(self,dom_node):
        Module.__init__(self, 'OSC', dom_node)
        self.obd_uuid = get_first_ref(dom_node, 'obd')
        self.ost_uuid = get_first_ref(dom_node, 'ost')
        self.lookup_server(self.ost_uuid)
        self.add_module('lustre/osc', 'osc')

    # OSC prepare (header elided): connect to the OST (adding a route
    # first when the server is not on a local net) and attach.
        self.info(self.obd_uuid, self.ost_uuid)
        srv = self.get_server()
        lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
        lctl.add_route_host(r[0], srv.uuid, r[1], r[2])
        panic ("no route to", srv.nid)
        lctl.newdev(attach="osc %s %s" % (self.name, self.uuid),
                    setup ="%s %s" %(self.obd_uuid, srv.uuid))

    # OSC cleanup (header elided): mirror of prepare.
        srv = self.get_server()
        Module.cleanup(self)
        self.info(self.obd_uuid, self.ost_uuid)
        lctl.del_route_host(r[0], srv.uuid, r[1], r[2])
        except CommandError, e:
            print "del_route failed: ", self.name
        Module.cleanup(self)
# NOTE(review): Mountpoint's prepare/cleanup method headers, the VOSC
# construction from the looked-up node, and several guard lines are
# elided in this copy.
class Mountpoint(Module):
    def __init__(self,dom_node):
        Module.__init__(self, 'MTPT', dom_node)
        self.path = get_text(dom_node, 'path')
        self.mds_uuid = get_first_ref(dom_node, 'mds')
        self.lov_uuid = get_first_ref(dom_node, 'osc')
        self.add_module('lustre/mdc', 'mdc')
        self.add_module('lustre/llite', 'llite')
        l = lookup(self.dom_node.parentNode, self.lov_uuid)

    # prepare (header elided): prepare the MDC/OSC, mkdir the mount
    # point, then mount the lustre_lite filesystem.
        mdc_uuid = prepare_mdc(self.dom_node.parentNode, self.mds_uuid)
        self.info(self.path, self.mds_uuid,self.lov_uuid)
        cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \
              (self.lov_uuid, mdc_uuid, self.path)
        run("mkdir", self.path)
        panic("mount failed:", self.path)

    # cleanup (header elided): umount, then clean up OSC and MDC.
        self.info(self.path, self.mds_uuid,self.lov_uuid)
        (rc, out) = run("umount", self.path)
        log("umount failed, cleanup will most likely not work.")
        l = lookup(self.dom_node.parentNode, self.lov_uuid)
        cleanup_mdc(self.dom_node.parentNode, self.mds_uuid)

    def load_module(self):
        self.osc.load_module()
        Module.load_module(self)
    def cleanup_module(self):
        Module.cleanup_module(self)
        self.osc.cleanup_module()
# ============================================================
# XML processing and query
# TODO: Change query funcs to use XPath, which is muc cleaner
# NOTE(review): the element-selection and return/default lines of the
# getters below are elided in this copy.

def get_device(obd):
    list = obd.getElementsByTagName('device')
        size = get_attr_int(dev, 'size', 0)
        return dev.firstChild.data, size

# Get the text content from the first matching child
# If there is no content (or it is all whitespace), return
def get_text(dom_node, tag, default=""):
    list = dom_node.getElementsByTagName(tag)
        dom_node.normalize()
        if dom_node.firstChild:
            txt = string.strip(dom_node.firstChild.data)

def get_text_int(dom_node, tag, default=0):
    list = dom_node.getElementsByTagName(tag)
        dom_node.normalize()
        if dom_node.firstChild:
            txt = string.strip(dom_node.firstChild.data)
            # non-numeric text is a fatal config error
            panic("text value is not integer:", txt)

def get_attr(dom_node, attr, default=""):
    v = dom_node.getAttribute(attr)

def get_attr_int(dom_node, attr, default=0):
    v = dom_node.getAttribute(attr)
        panic("attr value is not integer", v)
# NOTE(review): the empty-list guards, accumulator initializations and
# return lines of the helpers below are elided in this copy.
def get_first_ref(dom_node, tag):
    """ Get the first uuidref of the type TAG. Used one only
    one is expected. Returns the uuid."""
    refname = '%s_ref' % tag
    list = dom_node.getElementsByTagName(refname)
        uuid = getRef(list[0])

def get_all_refs(dom_node, tag):
    """ Get all the refs of type TAG. Returns list of uuids. """
    refname = '%s_ref' % tag
    list = dom_node.getElementsByTagName(refname)
            uuids.append(getRef(i))

def get_ost_net(dom_node, uuid):
    # follow the server's first network_ref to its network element
    ost = lookup(dom_node, uuid)
    uuid = get_first_ref(ost, 'network')
    return lookup(dom_node, uuid)

def nid2server(dom_node, nid):
    # find the <network> element whose server text matches nid
    netlist = dom_node.getElementsByTagName('network')
    for net_node in netlist:
        if get_text(net_node, 'server') == nid:
            return Network(net_node)

def lookup(dom_node, uuid):
    # depth-first search of the tree for an element with this uuid
    for n in dom_node.childNodes:
        if n.nodeType == n.ELEMENT_NODE:
            if getUUID(n) == uuid:
def getName(dom_node):
    """Return the 'name' attribute of *dom_node* ('' when absent)."""
    name = dom_node.getAttribute('name')
    return name
def getRef(dom_node):
    """Return the 'uuidref' attribute of *dom_node* ('' when absent)."""
    ref = dom_node.getAttribute('uuidref')
    return ref
def getUUID(dom_node):
    """Return the 'uuid' attribute of *dom_node* ('' when absent)."""
    uuid = dom_node.getAttribute('uuid')
    return uuid
def getServiceType(dom_node):
    """Return the service type of *dom_node*.

    By convention the element's tag name is the service type.
    FIXME: should verify the node really is a service element.
    """
    return dom_node.nodeName
# determine what "level" a particular node is at.
# the order of iniitailization is based on level.
# NOTE(review): the numeric level assignments and the return line are
# elided in this copy; lower levels are started first.
def getServiceLevel(dom_node):
    type = getServiceType(dom_node)
    if type in ('network',):
    elif type in ('device', 'ldlm'):
    elif type in ('obd', 'mdd'):
    elif type in ('mds','ost'):
    elif type in ('mdc','osc'):
    elif type in ('lov', 'lovconfig'):
    elif type in ('mountpoint',):

# return list of services in a profile. list is a list of tuples
# [(level, dom_node),]
# NOTE(review): the list initialization and the sorted return are elided.
def getServices(lustreNode, profileNode):
    for n in profileNode.childNodes:
        if n.nodeType == n.ELEMENT_NODE:
            # each profile child is a uuidref to a service elsewhere
            servNode = lookup(lustreNode, getRef(n))
                panic('service not found: ' + getRef(n))
            level = getServiceLevel(servNode)
            list.append((level, servNode))

# NOTE(review): getByName's loop header and return lines are elided.
def getByName(lustreNode, name, tag):
    ndList = lustreNode.getElementsByTagName(tag)
        if getName(nd) == name:
############################################################
# MDC ops
# FIXME: clean this mess up!
# NOTE(review): the bodies of prepare_mdc/cleanup_mdc beyond the MDS
# lookup are elided in this copy.
def prepare_mdc(dom_node, mds_uuid):
    mds_node = lookup(dom_node, mds_uuid);
        panic("no mds:", mds_uuid)

def cleanup_mdc(dom_node, mds_uuid):
    mds_node = lookup(dom_node, mds_uuid);
        panic("no mds:", mds_uuid)
############################################################
# routing ("rooting")
# NOTE(review): the module-level local_node/routes/router_flag
# initializations, several loop/guard lines and the return statements
# in this section are elided in this copy.
def init_node(dom_node):
    # record every (net_type, address) pair this node exposes
    global local_node, router_flag
    netlist = dom_node.getElementsByTagName('network')
    for dom_net in netlist:
        type = get_attr(dom_net, 'type')
        gw = get_text(dom_net, 'server')
        local_node.append((type, gw))

def node_needs_router():

def get_routes(type, gw, dom_net):
    """ Return the routes as a list of tuples of the form:
        [(type, gw, lo, hi),]"""
    tbl = dom_net.getElementsByTagName('route_tbl')
    routes = t.getElementsByTagName('route')
        lo = get_attr(r, 'lo')
        hi = get_attr(r, 'hi', '')
        res.append((type, gw, lo, hi))

def init_route_config(lustre):
    """ Scan the lustre config looking for routers. Build list of
    routes. """
    global routes, router_flag
    list = lustre.getElementsByTagName('node')
        if get_attr(node, 'router'):
            for (local_type, local_nid) in local_node:
                # first pass: find the router's address on our net type
                netlist = node.getElementsByTagName('network')
                for dom_net in netlist:
                    if local_type == get_attr(dom_net, 'type'):
                        gw = get_text(dom_net, 'server')
                # second pass: collect routes to the router's other nets
                for dom_net in netlist:
                    if local_type != get_attr(dom_net, 'type'):
                        for route in get_routes(local_type, gw, dom_net):
                            routes.append(route)

# NOTE(review): the "def local_net(net):" header is elided.
    for iface in local_node:
        if net.net_type == iface[0]:

def find_route(net):
    global local_node, routes
    frm_type = local_node[0][0]
    to_type = net.net_type
    debug ('looking for route to', to_type,to)
############################################################
# lconf level logic
# NOTE(review): most branches of startService's type dispatch, the
# prepare/load calls, and parts of startProfile are elided in this copy.
def startService(dom_node, module_flag):
    type = getServiceType(dom_node)
    debug('Service:', type, getName(dom_node), getUUID(dom_node))
    # there must be a more dynamic way of doing this...
    elif type == 'lovconfig':
        n = LOVConfig(dom_node)
    elif type == 'network':
        n = Network(dom_node)
    elif type == 'mountpoint':
        n = Mountpoint(dom_node)
        panic ("unknown service type:", type)

    # module phase: load or (on cleanup) unload kernel modules; setup
    # phase: prepare or clean up the service itself.
        if config.cleanup():
        if config.nosetup():
        if config.cleanup():

#
# Prepare the system to run lustre using a particular profile
# in a the configuration.
#  * load & the modules
#  * setup networking for the current node
#  * make sure partitions are in place and prepared
#  * initialize devices with lctl
# Levels is important, and needs to be enforced.
def startProfile(lustreNode, profileNode, module_flag):
        panic("profile:", profile, "not found.")
    services = getServices(lustreNode, profileNode)
    # on cleanup the service list is walked in reverse (elided here)
    if config.cleanup():
        startService(s[1], module_flag)
# NOTE(review): doHost's host-search loop, the router handling and
# several guard lines are elided in this copy.
def doHost(lustreNode, hosts):
    # find the first matching <node> entry for any of our hostnames
    dom_node = getByName(lustreNode, h, 'node')
        print 'No host entry found.'
    if not get_attr(dom_node, 'router'):
        init_route_config(lustreNode)

    # Two step process: (1) load modules, (2) setup lustre
    # if not cleaning, load modules first.
    module_flag = not config.cleanup()
    reflist = dom_node.getElementsByTagName('profile')
    for profile in reflist:
        startProfile(lustreNode, profile, module_flag)

    if not config.cleanup():
        sys_set_debug_path()
        script = config.gdb_script()
        run(lctl.lctl, ' modules >', script)
        # dump /tmp/ogdb and sleep/pause here
        log ("The GDB module script is in", script)

    # second pass with the phase flipped (setup after load, or module
    # unload after cleanup)
    module_flag = not module_flag
    for profile in reflist:
        startProfile(lustreNode, profile, module_flag)
############################################################
# Command line processing
# NOTE(review): parse_cmdline is heavily elided in this copy -- the
# short_opts string, the try: around getopt, the bodies of the option
# branches and the return of the remaining args are missing; fetch()'s
# header is missing too.
def parse_cmdline(argv):
    long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb",
                 "portals=", "makeldiff", "cleanup", "noexec",
                 "help", "node=", "nomod", "nosetup",
        opts, args = getopt.getopt(argv, short_opts, long_opts)
    except getopt.error:
        # each option below sets the matching flag/value on config
        if o in ("-h", "--help"):
        if o in ("-d","--cleanup"):
        if o in ("-v", "--verbose"):
        if o in ("-n", "--noexec"):
        if o == "--portals":
        if o == "--reformat":
        if o == "--nosetup":

# fetch(url): download a config file (body fragment).
    s = urllib.urlopen(url)
def setupModulePath(cmd):
    """If *cmd* lives inside a source tree (a Makefile sits next to it),
    point config at the enclosing lustre source directory."""
    base = os.path.dirname(cmd)
    if not os.access(base + "/Makefile", os.R_OK):
        return
    config.src_dir(base + "/../../")
# NOTE(review): the try/except wrappers, the close() calls and the
# current-value check in sys_set_netmem_max are elided in this copy.
def sys_set_debug_path():
    # tell the kernel where to dump the lustre debug log
    debug("debug path: ", config.debug_path())
    fp = open('/proc/sys/portals/debug_path', 'w')
    fp.write(config.debug_path())

#/proc/sys/net/core/rmem_max
#/proc/sys/net/core/wmem_max
def sys_set_netmem_max(path, max):
    # raise the kernel socket-buffer limit to at least *max*
    debug("setting", path, "to at least", max)
    fp = open(path, 'w')
    fp.write('%d\n' %(max))
def sys_make_devices():
    """Create the character device nodes lustre needs, when absent."""
    needed = (('/dev/portals', 'mknod /dev/portals c 10 240'),
              ('/dev/obd',     'mknod /dev/obd c 10 241'))
    for node, mknod_cmd in needed:
        if not os.access(node, os.R_OK):
            run(mknod_cmd)
# Initialize or shutdown lustre according to a configuration file
#   * prepare the system for lustre
#   * configure devices with lctl
# Shutdown does steps in reverse
# NOTE(review): the "def main():" header, the if/else branches around
# file-vs-url loading and node-list construction, and the try: lines in
# the __main__ block are elided in this copy.
    global TCP_ACCEPTOR, lctl, MAXTCPBUF
    host = socket.gethostname()
    args = parse_cmdline(sys.argv[1:])
    # config source: either a local xml file...
    if not os.access(args[0], os.R_OK | os.W_OK):
        print 'File not found:', args[0]
    dom = xml.dom.minidom.parse(args[0])
    # ...or a URL given with --get
    xmldata = fetch(config.url())
    dom = xml.dom.minidom.parseString(xmldata)

    # candidate names for locating our <node> entry in the config
    node_list.append(config.node())
    node_list.append(host)
    node_list.append('localhost')
    debug("configuring for host: ", node_list)
    # make the per-host debug/gdb paths unique
    config._debug_path = config._debug_path + '-' + host
    config._gdb_script = config._gdb_script + '-' + host

    TCP_ACCEPTOR = find_prog('acceptor')
    if not TCP_ACCEPTOR:
        # under --noexec a missing acceptor is tolerated
        TCP_ACCEPTOR = 'acceptor'
        debug('! acceptor not found')
        panic('acceptor not found')

    lctl = LCTLInterface('lctl')

    setupModulePath(sys.argv[0])
    sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
    sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
    doHost(dom.documentElement, node_list)

if __name__ == "__main__":
    # NOTE(review): the main() call inside a try: is elided here.
    except LconfError, e:
    except CommandError, e:
    # exit with the first cleanup error so failures are not masked
    if first_cleanup_error:
        sys.exit(first_cleanup_error)