3 # Copyright (C) 2002 Cluster File Systems, Inc.
4 # Author: Robert Read <rread@clusterfs.com>
5 # : Ravindranadh Chowdary Sahukara <s-ravindranadh_chowdary@hp.com>
7 # This file is part of Lustre, http://www.lustre.org.
9 # Lustre is free software; you can redistribute it and/or
10 # modify it under the terms of version 2 of the GNU General Public
11 # License as published by the Free Software Foundation.
13 # Lustre is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU General Public License for more details.
18 # You should have received a copy of the GNU General Public License
19 # along with Lustre; if not, write to the Free Software
20 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 # lconf - lustre configuration tool
24 # lconf is the main driver script for starting and stopping
25 # lustre filesystem services.
27 # Based in part on the XML obdctl modifications done by Brian Behlendorf
30 import string, os, stat, popen2, socket, time
32 import xml.dom.minidom
# Fallback value for a network's send_mem / recv_mem settings when the
# configuration does not supply one.
DEFAULT_TCPBUF = 1024 * 1024

# Upper bound on the number of loop devices that are probed.
# (the /dev/loop* nodes have to exist already)
MAX_LOOP_DEVICES = 256
# Status of the first cleanup step that failed; 0 while nothing has failed.
first_cleanup_error = 0


def cleanup_error(rc):
    """Record rc as the first cleanup error.

    Only the first truthy status is kept: once first_cleanup_error holds a
    non-zero value, later calls leave it unchanged.
    """
    global first_cleanup_error
    if first_cleanup_error:
        return
    first_cleanup_error = rc
51 print """usage: lconf config.xml
53 config.xml Lustre configuration in xml format.
54 --get <url> URL to fetch a config file
55 --node <nodename> Load config for <nodename>
56 -d | --cleanup Cleans up config. (Shutdown)
57 -f | --force Forced unmounting and/or obd detach during cleanup
58 -v | --verbose Print system commands as they are run
59 -h | --help Print this help
60 --gdb Prints message after creating gdb module script
61 and sleeps for 5 seconds.
62 -n | --noexec Prints the commands and steps that will be run for a
63 config without executing them. This can used to check if a
64 config file is doing what it should be doing. (Implies -v)
65 --nomod Skip load/unload module step.
66 --nosetup Skip device setup/cleanup step.
67 --reformat Reformat all devices (without question)
68 --dump <file> Dump the kernel debug log before portals is unloaded
69 --start [client|MDS|OST] Start the client|MDS|OST services (bug is there need to fixit)
70 --stop [client|MDS|OST] Stop the client|MDS|OST services (bug is there need to fix it)
71 --startlevel <num> Specify the level of services to start with (default 0)
72 --endlevel <num> Specify the level of services to end with (default 100)
73 Levels are aproximatly like:
83 --ldap server LDAP server with lustre config database
84 --makeldiff Translate xml source to LDIFF
85 This are perhaps not needed:
86 --lustre="src dir" Base directory of lustre sources. Used to search
88 --portals=src Portals source
92 # ============================================================
93 # Config parameters, encapsulated in a class
111 self._gdb_script = '/tmp/ogdb'
112 self._debug_path = '/tmp/lustre-log'
113 self._dump_file = None
115 self._start_level = 0
116 self._end_level = 100
118 def verbose(self, flag = None):
119 if flag: self._verbose = flag
122 def noexec(self, flag = None):
123 if flag: self._noexec = flag
def reformat(self, flag = None):
    """Return the reformat setting, storing flag first when it is truthy.

    A falsy flag (including the default None) leaves the stored value
    untouched, so this accessor can switch the setting on but not off.
    """
    if not flag:
        return self._reformat
    self._reformat = flag
    return self._reformat
130 def cleanup(self, flag = None):
131 if flag: self._cleanup = flag
134 def gdb(self, flag = None):
135 if flag: self._gdb = flag
138 def nomod(self, flag = None):
139 if flag: self._nomod = flag
142 def nosetup(self, flag = None):
143 if flag: self._nosetup = flag
146 def start(self, flag = None):
147 if flag: self._start= flag
150 def stop(self, flag = None):
151 if flag: self._stop= flag
154 def force(self, flag = None):
155 if flag: self._force = flag
158 def node(self, val = None):
159 if val: self._node = val
162 def url(self, val = None):
163 if val: self._url = val
166 def gdb_script(self):
167 if os.path.isdir('/r'):
168 return '/r' + self._gdb_script
170 return self._gdb_script
172 def debug_path(self):
173 if os.path.isdir('/r'):
174 return '/r' + self._debug_path
176 return self._debug_path
178 def src_dir(self, val = None):
179 if val: self._src_dir = val
def dump_file(self, val = None):
    """Return the dump file setting, replacing it first when val is truthy.

    Called without an argument (or with a falsy one) this is a plain getter.
    """
    if not val:
        return self._dump_file
    self._dump_file = val
    return self._dump_file
def startlevel(self, val = None):
    """Return the start level, updating it first when val is truthy.

    val is converted with int() before it is stored, so numeric strings
    are accepted. A falsy val leaves the stored level unchanged.
    """
    if not val:
        return self._start_level
    self._start_level = int(val)
    return self._start_level
def endlevel(self, val = None):
    """Return the end level, updating it first when val is truthy.

    val is converted with int() before it is stored, so numeric strings
    are accepted. A falsy val leaves the stored level unchanged.
    """
    if val:
        level = int(val)
        self._end_level = level
    return self._end_level
196 # ============================================================
197 # debugging and error funcs
def fixme(msg = "this feature"):
    """Signal that a code path has not been implemented.

    msg names the missing feature and is used as the start of the error
    text. Always raises LconfError.
    """
    # Call form of raise: same effect as the old "raise Class, value"
    # statement, but valid in every Python version.
    raise LconfError(msg + ' not implemented yet.')
203 msg = string.join(map(str,args))
204 if not config.noexec():
205 raise LconfError(msg)
210 msg = string.join(map(str,args))
215 print string.strip(s)
219 msg = string.join(map(str,args))
222 # ============================================================
223 # locally defined exceptions
224 class CommandError (exceptions.Exception):
225 def __init__(self, cmd_name, cmd_err, rc=None):
226 self.cmd_name = cmd_name
227 self.cmd_err = cmd_err
232 if type(self.cmd_err) == types.StringType:
234 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
236 print "! %s: %s" % (self.cmd_name, self.cmd_err)
237 elif type(self.cmd_err) == types.ListType:
239 print "! %s (error %d):" % (self.cmd_name, self.rc)
241 print "! %s:" % (self.cmd_name)
242 for s in self.cmd_err:
243 print "> %s" %(string.strip(s))
247 class LconfError (exceptions.Exception):
248 def __init__(self, args):
252 # ============================================================
253 # handle lctl interface
256 Manage communication with lctl
259 def __init__(self, cmd):
261 Initialize close by finding the lctl binary.
263 self.lctl = find_prog(cmd)
266 debug('! lctl not found')
269 raise CommandError('lctl', "unable to find lctl binary.")
274 the cmds are written to stdin of lctl
275 lctl doesn't return errors when run in script mode, so
277 should modify command line to accept multiple commands, or
278 create complex command line options
280 debug("+", self.lctl, cmds)
281 if config.noexec(): return (0, [])
282 p = popen2.Popen3(self.lctl, 1)
283 p.tochild.write(cmds + "\n")
285 out = p.fromchild.readlines()
286 err = p.childerr.readlines()
288 if os.WIFEXITED(ret):
289 rc = os.WEXITSTATUS(ret)
293 raise CommandError(self.lctl, err, rc)
297 def network(self, net, nid):
298 """ initialized network and add "self" """
299 # Idea: "mynid" could be used for all network types to add "self," and then
300 # this special case would be gone and the "self" hack would be hidden.
306 quit""" % (net, nid, nid)
315 # create a new connection
316 def connect(self, net, nid, port, servuuid, send_mem, recv_mem):
324 quit""" % (net, servuuid, nid, send_mem, recv_mem, nid, port, )
330 quit""" % (net, servuuid, nid, nid, port, )
334 # add a route to a range
335 def add_route(self, net, gw, lo, hi):
339 quit """ % (net, gw, lo, hi)
344 def del_route(self, net, gw, lo, hi):
352 # add a route to a host
353 def add_route_host(self, net, uuid, gw, tgt):
358 quit """ % (net, uuid, tgt, gw, tgt)
# delete a route to a host
362 def del_route_host(self, net, uuid, gw, tgt):
368 quit """ % (net, uuid, tgt)
371 # disconnect one connection
372 def disconnect(self, net, nid, port, servuuid):
378 quit""" % (net, nid, servuuid)
382 def disconnectAll(self, net):
391 # create a new device with lctl
392 def newdev(self, attach, setup = ""):
397 quit""" % (attach, setup)
401 def cleanup(self, name, uuid):
407 quit""" % (name, ('', 'force')[config.force()])
411 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist):
415 lov_setconfig %s %d %d %d %s %s
416 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
420 def dump(self, dump_file):
423 quit""" % (dump_file)
426 # ============================================================
427 # Various system-level functions
428 # (ideally moved to their own module)
430 # Run a command and return the output and status.
# stderr is merged into stdout (2>&1); popen3 could be used to
# capture it separately if necessary
434 cmd = string.join(map(str,args))
436 if config.noexec(): return (0, [])
437 f = os.popen(cmd + ' 2>&1')
446 # Run a command in the background.
447 def run_daemon(*args):
448 cmd = string.join(map(str,args))
450 if config.noexec(): return 0
451 f = os.popen(cmd + ' 2>&1')
459 # Determine full path to use for an external command
460 # searches dirname(argv[0]) first, then PATH
462 syspath = string.split(os.environ['PATH'], ':')
463 cmdpath = os.path.dirname(sys.argv[0])
464 syspath.insert(0, cmdpath);
465 syspath.insert(0, os.path.join(cmdpath, '../../portals/linux/utils/'))
467 prog = os.path.join(d,cmd)
468 if os.access(prog, os.X_OK):
472 # Recursively look for file starting at base dir
473 def do_find_file(base, mod):
474 fullname = os.path.join(base, mod)
475 if os.access(fullname, os.R_OK):
477 for d in os.listdir(base):
478 dir = os.path.join(base,d)
479 if os.path.isdir(dir):
480 module = do_find_file(dir, mod)
484 def find_module(src_dir, dev_dir, modname):
485 mod = '%s.o' % (modname)
486 module = src_dir +'/'+ dev_dir +'/'+ mod
488 if os.access(module, os.R_OK):
494 # is the path a block device?
501 return stat.S_ISBLK(s[stat.ST_MODE])
503 # build fs according to type
505 def mkfs(fstype, dev):
506 if(fstype in ('ext3', 'extN')):
507 mkfs = 'mkfs.ext2 -j -b 4096'
509 print 'unsupported fs type: ', fstype
510 if not is_block(dev):
514 (ret, out) = run (mkfs, force, dev)
516 panic("Unable to build fs:", dev)
517 # enable hash tree indexing on fs
519 htree = 'echo "feature FEATURE_C5" | debugfs -w'
520 (ret, out) = run (htree, dev)
522 panic("Unable to enable htree:", dev)
524 # some systems use /dev/loopN, some /dev/loop/N
528 if not os.access(loop + str(0), os.R_OK):
530 if not os.access(loop + str(0), os.R_OK):
531 panic ("can't access loop devices")
534 # find loop device assigned to thefile
537 for n in xrange(0, MAX_LOOP_DEVICES):
539 if os.access(dev, os.R_OK):
540 (stat, out) = run('losetup', dev)
541 if (out and stat == 0):
542 m = re.search(r'\((.*)\)', out[0])
543 if m and file == m.group(1):
549 # create file if necessary and assign the first free loop device
550 def init_loop(file, size, fstype):
551 dev = find_loop(file)
553 print 'WARNING file:', file, 'already mapped to', dev
555 if config.reformat() or not os.access(file, os.R_OK | os.W_OK):
556 run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size, file))
558 # find next free loop
559 for n in xrange(0, MAX_LOOP_DEVICES):
561 if os.access(dev, os.R_OK):
562 (stat, out) = run('losetup', dev)
564 run('losetup', dev, file)
567 print "out of loop devices"
569 print "out of loop devices"
572 # undo loop assignment
573 def clean_loop(file):
574 dev = find_loop(file)
576 ret, out = run('losetup -d', dev)
578 log('unable to clean loop device:', dev, 'for file:', file)
581 # determine if dev is formatted as a <fstype> filesystem
582 def need_format(fstype, dev):
583 # FIXME don't know how to implement this
586 # initialize a block device if needed
587 def block_dev(dev, size, fstype, format):
588 if config.noexec(): return dev
589 if not is_block(dev):
590 dev = init_loop(dev, size, fstype)
591 if config.reformat() or (need_format(fstype, dev) and format == 'yes'):
595 # panic("device:", dev,
596 # "not prepared, and autoformat is not set.\n",
597 # "Rerun with --reformat option to format ALL filesystems")
602 """lookup IP address for an interface"""
603 rc, out = run("/sbin/ifconfig", iface)
606 addr = string.split(out[1])[1]
607 ip = string.split(addr, ':')[1]
611 def get_local_address(net_type, wildcard):
612 """Return the local address for the network type."""
614 if net_type == 'tcp':
616 iface, star = string.split(wildcard, ':')
617 local = if2addr(iface)
619 panic ("unable to determine ip for:", wildcard)
621 host = socket.gethostname()
622 local = socket.gethostbyname(host)
623 elif net_type == 'elan':
624 # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
626 fp = open('/proc/elan/device0/position', 'r')
627 lines = fp.readlines()
636 elif net_type == 'gm':
637 fixme("automatic local address for GM")
642 # ============================================================
643 # Classes to prepare and cleanup the various objects
646 """ Base class for the rest of the modules. The default cleanup method is
647 defined here, as well as some utilitiy funcs.
649 def __init__(self, module_name, dom_node):
650 self.dom_node = dom_node
651 self.module_name = module_name
652 self.name = get_attr(dom_node, 'name')
653 self.uuid = get_attr(dom_node, 'uuid')
654 self.kmodule_list = []
def info(self, *args):
    """Print a one-line status message for this module.

    The line has the form "<module_name>: <name> <uuid> <args...>"; the
    extra arguments are converted with str() and joined by single spaces.
    """
    # str.join replaces the deprecated string.join() function.
    msg = " ".join(map(str, args))
    # A single parenthesised argument prints identically whether print is
    # a statement or a function.
    print("%s %s %s %s" % (self.module_name + ":", self.name, self.uuid, msg))
662 def lookup_server(self, srv_uuid):
663 """ Lookup a server's network information """
664 net = get_ost_net(self.dom_node.parentNode, srv_uuid)
666 panic ("Unable to find a server for:", srv_uuid)
667 self._server = Network(net)
669 def get_server(self):
673 """ default cleanup, used for most modules """
675 srv = self.get_server()
676 if srv and local_net(srv):
678 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
679 except CommandError, e:
680 log(self.module_name, "disconnect failed: ", self.name)
684 lctl.cleanup(self.name, self.uuid)
685 except CommandError, e:
686 log(self.module_name, "cleanup failed: ", self.name)
def add_module(self, dev_dir, modname):
    """Queue a kernel module for loading.

    The (dev_dir, modname) pair is appended to self.kmodule_list, so
    modules keep the order in which they were added.
    """
    entry = (dev_dir, modname)
    self.kmodule_list.append(entry)
694 def mod_loaded(self, modname):
695 """Check if a module is already loaded. Look in /proc/modules for it."""
696 fp = open('/proc/modules')
697 lines = fp.readlines()
699 # please forgive my tired fingers for this one
700 ret = filter(lambda word, mod=modname: word == mod,
701 map(lambda line: string.split(line)[0], lines))
704 def load_module(self):
705 """Load all the modules in the list in the order they appear."""
706 for dev_dir, mod in self.kmodule_list:
707 # (rc, out) = run ('/sbin/lsmod | grep -s', mod)
708 if self.mod_loaded(mod) and not config.noexec():
710 log ('loading module:', mod)
712 module = find_module(config.src_dir(),dev_dir, mod)
714 panic('module not found:', mod)
715 (rc, out) = run('/sbin/insmod', module)
717 raise CommandError('insmod', out, rc)
719 (rc, out) = run('/sbin/modprobe', mod)
721 raise CommandError('modprobe', out, rc)
723 def cleanup_module(self):
724 """Unload the modules in the list in reverse order."""
725 rev = self.kmodule_list
727 for dev_dir, mod in rev:
728 if not self.mod_loaded(mod):
731 if mod == 'portals' and config.dump_file():
732 lctl.dump(config.dump_file())
733 log('unloading module:', mod)
736 (rc, out) = run('/sbin/rmmod', mod)
738 log('! unable to unload module:', mod)
742 class Network(Module):
743 def __init__(self,dom_node):
744 Module.__init__(self, 'NETWORK', dom_node)
745 self.net_type = get_attr(dom_node,'type')
746 self.nid = get_text(dom_node, 'server', '*')
747 self.port = get_text_int(dom_node, 'port', 0)
748 self.send_mem = get_text_int(dom_node, 'send_mem', DEFAULT_TCPBUF)
749 self.recv_mem = get_text_int(dom_node, 'recv_mem', DEFAULT_TCPBUF)
751 self.nid = get_local_address(self.net_type, self.nid)
753 panic("unable to set nid for", self.net_type, self.nid)
754 debug("nid:", self.nid)
756 self.add_module('portals/linux/oslib/', 'portals')
757 if node_needs_router():
758 self.add_module('portals/linux/router', 'kptlrouter')
759 if self.net_type == 'tcp':
760 self.add_module('portals/linux/socknal', 'ksocknal')
761 if self.net_type == 'elan':
762 self.add_module('portals/linux/rqswnal', 'kqswnal')
763 if self.net_type == 'gm':
764 self.add_module('portals/linux/gmnal', 'kgmnal')
765 self.add_module('lustre/obdclass', 'obdclass')
766 self.add_module('lustre/ptlrpc', 'ptlrpc')
769 self.info(self.net_type, self.nid, self.port)
770 if self.net_type == 'tcp':
771 ret, out = run(TCP_ACCEPTOR, '-s', self.send_mem, '-r', self.recv_mem, self.port)
773 raise CommandError(TCP_ACCEPTOR, out, ret)
775 ret = self.dom_node.getElementsByTagName('route_tbl')
777 for r in a.getElementsByTagName('route'):
778 net_type = get_attr(r, 'type')
779 gw = get_attr(r, 'gw')
780 lo = get_attr(r, 'lo')
781 hi = get_attr(r,'hi', '')
782 lctl.add_route(net_type, gw, lo, hi)
783 if net_type == 'tcp' and net_type == self.net_type and hi == '':
784 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
786 panic("no server for nid", lo)
788 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
791 lctl.network(self.net_type, self.nid)
792 lctl.newdev(attach = "ptlrpc RPCDEV")
797 self.info(self.net_type, self.nid, self.port)
798 ret = self.dom_node.getElementsByTagName('route_tbl')
800 for r in a.getElementsByTagName('route'):
801 lo = get_attr(r, 'lo')
802 hi = get_attr(r,'hi', '')
803 if self.net_type == 'tcp' and hi == '':
804 srv = nid2server(self.dom_node.parentNode.parentNode, lo)
806 panic("no server for nid", lo)
810 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
811 except CommandError, e:
812 print "disconnect failed: ", self.name
816 lctl.del_route(self.net_type, self.nid, lo, hi)
817 except CommandError, e:
818 print "del_route failed: ", self.name
822 lctl.cleanup("RPCDEV", "")
823 except CommandError, e:
824 print "cleanup failed: ", self.name
828 lctl.disconnectAll(self.net_type)
829 except CommandError, e:
830 print "disconnectAll failed: ", self.name
833 if self.net_type == 'tcp':
834 # yikes, this ugly! need to save pid in /var/something
835 run("killall acceptor")
838 def __init__(self,dom_node):
839 Module.__init__(self, 'LDLM', dom_node)
840 self.add_module('lustre/ldlm', 'ldlm')
843 lctl.newdev(attach="ldlm %s %s" % (self.name, self.uuid),
847 def __init__(self,dom_node):
848 Module.__init__(self, 'LOV', dom_node)
849 self.mds_uuid = get_first_ref(dom_node, 'mds')
850 self.mdc_uuid = get_first_ref(dom_node, 'mdc')
851 mds= lookup(dom_node.parentNode, self.mds_uuid)
852 self.mdc = lookup(dom_node.parentNode, self.mdc_uuid)
854 self.mds_name = getName(mds)
856 devs = dom_node.getElementsByTagName('devices')
859 self.stripe_sz = get_attr_int(dev_node, 'stripesize', 65536)
860 self.stripe_off = get_attr_int(dev_node, 'stripeoffset', 0)
861 self.pattern = get_attr_int(dev_node, 'pattern', 0)
862 self.devlist = get_all_refs(dev_node, 'osc')
863 self.stripe_cnt = get_attr_int(dev_node, 'stripecount', len(self.devlist))
864 self.add_module('lustre/mdc', 'mdc')
865 self.add_module('lustre/lov', 'lov')
868 for osc_uuid in self.devlist:
869 osc = lookup(self.dom_node.parentNode, osc_uuid)
874 panic('osc not found:', osc_uuid)
875 mdc_uuid = get_first_ref(self.dom_node,'mdc')
876 self.info(mdc_uuid, self.stripe_cnt, self.stripe_sz, self.stripe_off, self.pattern, self.devlist,self.mds_name)
877 lctl.newdev(attach="lov %s %s" % (self.name, self.uuid), setup ="%s" % (mdc_uuid))
880 for osc_uuid in self.devlist:
881 osc = lookup(self.dom_node.parentNode, osc_uuid)
886 panic('osc not found:', osc_uuid)
894 def load_module(self):
895 for osc_uuid in self.devlist:
896 osc = lookup(self.dom_node.parentNode, osc_uuid)
902 panic('osc not found:', osc_uuid)
903 Module.load_module(self)
905 def cleanup_module(self):
906 Module.cleanup_module(self)
907 for osc_uuid in self.devlist:
908 osc = lookup(self.dom_node.parentNode, osc_uuid)
914 panic('osc not found:', osc_uuid)
916 class LOVConfig(Module):
917 def __init__(self,dom_node):
918 Module.__init__(self, 'LOVConfig', dom_node)
919 self.lov_uuid = get_first_ref(dom_node, 'lov')
920 l = lookup(dom_node.parentNode, self.lov_uuid)
926 for i in range(len(lov.devlist)):
927 oscUUIDs=oscUUIDs+" "+lov.devlist[i]
928 self.info(lov.mds_uuid, lov.stripe_cnt, lov.stripe_sz, lov.stripe_off, lov.pattern, oscUUIDs)
930 lctl.lov_setconfig(lov.uuid, lov.mds_name, lov.stripe_cnt, lov.stripe_sz, lov.stripe_off, lov.pattern, oscUUIDs)
def __init__(self,dom_node):
    """Read the MDS settings from dom_node and queue the modules it needs.

    Stores the device name and size, the filesystem type and the
    autoformat flag (default "no"), then registers the extN module (only
    for an extN filesystem), mds, and the filesystem-specific mds_<fstype>.
    """
    Module.__init__(self, 'MDS', dom_node)
    device = get_device(dom_node)
    self.devname, self.size = device
    self.fstype = get_text(dom_node, 'fstype')
    self.format = get_text(dom_node, 'autoformat', "no")

    # Collect the modules first, then register them in the same order.
    wanted = []
    if self.fstype == 'extN':
        wanted.append(('lustre/extN', 'extN'))
    wanted.append(('lustre/mds', 'mds'))
    wanted.append(('lustre/mds', 'mds_%s' % (self.fstype)))
    for dev_dir, modname in wanted:
        self.add_module(dev_dir, modname)
949 self.info(self.devname, self.fstype, self.format)
950 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
951 lctl.newdev(attach="mds %s %s" % (self.name, self.uuid),
952 setup ="%s %s" %(blkdev, self.fstype))
955 clean_loop(self.devname)
958 # Very unusual case, as there is no MDC element in the XML anymore
959 # Builds itself from an MDS node
961 def __init__(self,dom_node):
962 Module.__init__(self, 'MDC', dom_node)
963 self.mds_uuid = get_first_ref(dom_node,'mds')
964 self.dom_node = dom_node
965 self.module_name = 'MDC'
966 self.kmodule_list = []
969 host = socket.gethostname()
971 self.net_uuid = get_first_ref(dom_node, 'network')
972 net = get_osc_net(dom_node,self.net_uuid)
973 self._server = Network(net)
974 self.add_module('lustre/mdc','mdc')
975 self.mdcname = get_attr(dom_node, 'name')
976 self.mdcuuid = get_attr(dom_node, 'uuid')
980 srv = self.get_server()
981 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
982 lctl.newdev(attach="mdc %s %s" % (self.name, self.uuid),
983 setup ="%s %s" %(self.mds_uuid, srv.uuid))
988 srv = self.get_server()
997 lctl.del_route_host(r[0], srv.uuid, r[1], r[2])
998 except CommandError, e:
999 print "del_route failed: ", self.name
1002 Module.cleanup(self)
def __init__(self, dom_node):
    """Read the OBD settings from dom_node and queue the modules it needs.

    Stores the obd type, the device name and size, the filesystem type and
    the autoformat flag (default 'yes'), then registers the extN module
    (only for an extN filesystem) and the module named after the obd type.
    """
    Module.__init__(self, 'OBD', dom_node)
    obdtype = get_attr(dom_node, 'type')
    self.obdtype = obdtype
    device = get_device(dom_node)
    self.devname, self.size = device
    self.fstype = get_text(dom_node, 'fstype')
    self.format = get_text(dom_node, 'autoformat', 'yes')

    uses_extn = (self.fstype == 'extN')
    if uses_extn:
        self.add_module('lustre/extN', 'extN')
    self.add_module('lustre/' + obdtype, obdtype)
1015 # need to check /proc/mounts and /etc/mtab before
1016 # formatting anything.
1017 # FIXME: check if device is already formatted.
1019 self.info(self.obdtype, self.devname, self.size, self.fstype, self.format)
1020 if self.obdtype == 'obdecho':
1023 blkdev = block_dev(self.devname, self.size, self.fstype, self.format)
1024 lctl.newdev(attach="%s %s %s" % (self.obdtype, self.name, self.uuid),
1025 setup ="%s %s" %(blkdev, self.fstype))
1027 Module.cleanup(self)
1028 if not self.obdtype == 'obdecho':
1029 clean_loop(self.devname)
def __init__(self,dom_node):
    """Set up an OST from dom_node.

    Remembers the uuid of the first obd reference and registers the ost
    kernel module.
    """
    Module.__init__(self, 'OST', dom_node)
    obd_ref = get_first_ref(dom_node, 'obd')
    self.obd_uuid = obd_ref
    self.add_module('lustre/ost', 'ost')
1038 self.info(self.obd_uuid)
1039 lctl.newdev(attach="ost %s %s" % (self.name, self.uuid),
1040 setup ="%s" % (self.obd_uuid))
1043 # virtual interface for OSC and LOV
1045 def __init__(self,dom_node):
1046 Module.__init__(self, 'VOSC', dom_node)
1047 if dom_node.nodeName == 'lov':
1048 self.osc = LOV(dom_node)
1050 self.osc = OSC(dom_node)
def load_module(self):
    """Load kernel modules by delegating to the wrapped self.osc object."""
    wrapped = self.osc
    wrapped.load_module()
def cleanup_module(self):
    """Unload kernel modules by delegating to the wrapped self.osc object."""
    wrapped = self.osc
    wrapped.cleanup_module()
1062 def __init__(self,dom_node):
1063 Module.__init__(self, 'OSC', dom_node)
1064 self.obd_uuid = get_first_ref(dom_node, 'obd')
1066 self.net_uuid = get_first_ref(dom_node,'network')
1067 net = get_osc_net(dom_node,self.net_uuid)
1068 self._server = Network(net)
1069 self.add_module('lustre/osc', 'osc')
1072 self.info(self.obd_uuid,self.uuid)
1073 srv = self.get_server()
1075 lctl.connect(srv.net_type, srv.nid, srv.port, srv.uuid, srv.send_mem, srv.recv_mem)
1079 lctl.add_route_host(r[0], srv.uuid, r[1], r[2])
1081 panic ("no route to", srv.nid)
1083 lctl.newdev(attach="osc %s %s" % (self.name, self.uuid),
1084 setup ="%s %s" %(self.obd_uuid, srv.uuid))
1087 srv = self.get_server()
1089 Module.cleanup(self)
1091 self.info(self.obd_uuid, self.ost_uuid)
1095 lctl.del_route_host(r[0], srv.uuid, r[1], r[2])
1096 except CommandError, e:
1097 print "del_route failed: ", self.name
1100 Module.cleanup(self)
1104 class Mountpoint(Module):
1105 def __init__(self,dom_node):
1106 Module.__init__(self, 'MTPT', dom_node)
1107 self.dom_node = dom_node
1108 self.path = get_text(dom_node, 'path')
1109 self.mdc_uuid = get_first_ref(dom_node, 'mdc')
1110 self.lov_uuid = get_first_ref(dom_node, 'osc')
1111 self.add_module('lustre/mdc', 'mdc')
1112 self.add_module('lustre/llite', 'llite')
1113 l = lookup(self.dom_node.parentNode, self.lov_uuid)
1115 m = lookup(self.dom_node.parentNode, self.mdc_uuid)
1121 self.info(self.path, self.mdc_uuid,self.lov_uuid)
1122 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s none %s" % \
1123 (self.lov_uuid, self.mdc_uuid, self.path)
1124 run("mkdir", self.path)
1127 panic("mount failed:", self.path)
1130 self.info(self.path, self.mdc_uuid,self.lov_uuid)
1132 (rc, out) = run("umount -f", self.path)
1134 (rc, out) = run("umount", self.path)
1136 log("umount failed, cleanup will most likely not work.")
1137 l = lookup(self.dom_node.parentNode, self.lov_uuid)
def load_module(self):
    """Load the modules of self.osc first, then this mountpoint's own list."""
    backend = self.osc
    backend.load_module()
    Module.load_module(self)
def cleanup_module(self):
    """Unload this mountpoint's own modules first, then those of self.osc.

    This is the reverse of the order used by load_module.
    """
    Module.cleanup_module(self)
    backend = self.osc
    backend.cleanup_module()
1149 # ============================================================
1150 # XML processing and query
# TODO: Change query funcs to use XPath, which is much cleaner
1153 def get_device(obd):
1154 list = obd.getElementsByTagName('device')
1158 size = get_attr_int(dev, 'size', 0)
1159 return dev.firstChild.data, size
1162 # Get the text content from the first matching child
1163 # If there is no content (or it is all whitespace), return
1165 def get_text(dom_node, tag, default=""):
1166 list = dom_node.getElementsByTagName(tag)
1169 dom_node.normalize()
1170 if dom_node.firstChild:
1171 txt = string.strip(dom_node.firstChild.data)
1176 def get_text_int(dom_node, tag, default=0):
1177 list = dom_node.getElementsByTagName(tag)
1181 dom_node.normalize()
1182 if dom_node.firstChild:
1183 txt = string.strip(dom_node.firstChild.data)
1188 panic("text value is not integer:", txt)
1191 def get_attr(dom_node, attr, default=""):
1192 v = dom_node.getAttribute(attr)
1197 def get_attr_int(dom_node, attr, default=0):
1199 v = dom_node.getAttribute(attr)
1204 panic("attr value is not integer", v)
1207 def get_first_ref(dom_node, tag):
1208 """ Get the first uuidref of the type TAG. Used one only
1209 one is expected. Returns the uuid."""
1211 refname = '%s_ref' % tag
1212 list = dom_node.getElementsByTagName(refname)
1214 uuid = getRef(list[0])
1217 def get_all_refs(dom_node, tag):
1218 """ Get all the refs of type TAG. Returns list of uuids. """
1220 refname = '%s_ref' % tag
1221 list = dom_node.getElementsByTagName(refname)
1224 uuids.append(getRef(i))
1227 def get_ost_net(dom_node, uuid):
1228 ost = lookup(dom_node, uuid)
1229 uuid = get_first_ref(ost, 'network')
1232 return lookup(dom_node, uuid)
1234 def get_osc_net(dom_node, uuid):
1235 uuid = get_first_ref(dom_node,'network')
1238 return lookup(dom_node.parentNode,uuid)
def nid2server(dom_node, nid):
    """Find the network whose server entry is nid.

    Scans every 'network' element below dom_node and returns a Network
    built from the first one whose 'server' text equals nid. In the code
    shown here, a search without a match simply falls off the end of the
    loop.
    """
    for candidate in dom_node.getElementsByTagName('network'):
        if get_text(candidate, 'server') != nid:
            continue
        return Network(candidate)
1247 def lookup(dom_node, uuid):
1248 for n in dom_node.childNodes:
1249 if n.nodeType == n.ELEMENT_NODE:
1250 if getUUID(n) == uuid:
def getName(dom_node):
    """Return the value of dom_node's 'name' attribute."""
    name = dom_node.getAttribute('name')
    return name
def getRef(dom_node):
    """Return the value of dom_node's 'uuidref' attribute."""
    ref = dom_node.getAttribute('uuidref')
    return ref
def getUUID(dom_node):
    """Return the value of dom_node's 'uuid' attribute."""
    uuid = dom_node.getAttribute('uuid')
    return uuid
def getServiceType(dom_node):
    """Return the service type of dom_node, which is simply its tag name."""
    # fixme: this should do some checks to make sure the dom_node is a service
    service_type = dom_node.nodeName
    return service_type
# determine what "level" a particular node is at.
# the order of initialization is based on level.
1276 def getServiceLevel(dom_node):
1277 type = getServiceType(dom_node)
1279 if type in ('network',):
1281 elif type in ('device', 'ldlm'):
1283 elif type in ('obd', 'mdd'):
1285 elif type in ('mds','ost'):
1287 elif type in ('mdc','osc'):
1289 elif type in ('lov', 'lovconfig'):
1291 elif type in ('mountpoint',):
1294 if ret < config.startlevel() or ret > config.endlevel():
1301 # return list of services in a profile. list is a list of tuples
1302 # [(level, dom_node),]
1303 def getServices(lustreNode, profileNode):
1305 for n in profileNode.childNodes:
1306 if n.nodeType == n.ELEMENT_NODE:
1307 servNode = lookup(lustreNode, getRef(n))
1310 panic('service not found: ' + getRef(n))
1311 level = getServiceLevel(servNode)
1313 list.append((level, servNode))
1317 def getByName(lustreNode, name, tag):
1318 ndList = lustreNode.getElementsByTagName(tag)
1320 if getName(nd) == name:
1325 ############################################################
1327 # FIXME: clean this mess up!
1330 def prepare_mdc(dom_node, mds_uuid):
1332 mds_node = lookup(dom_node, mds_uuid);
1334 panic("no mds:", mds_uuid)
1344 ############################################################
1345 # routing ("rooting")
def init_node(dom_node):
    """Record the networks configured for the local node.

    For every 'network' element below dom_node, a (type attribute,
    'server' text) pair is appended to the global local_node list.
    """
    global local_node, router_flag
    for dom_net in dom_node.getElementsByTagName('network'):
        entry = (get_attr(dom_net, 'type'), get_text(dom_net, 'server'))
        local_node.append(entry)
1359 def node_needs_router():
1362 def get_routes(type, gw, dom_net):
1363 """ Return the routes as a list of tuples of the form:
1364 [(type, gw, lo, hi),]"""
1366 tbl = dom_net.getElementsByTagName('route_tbl')
1368 routes = t.getElementsByTagName('route')
1370 lo = get_attr(r, 'lo')
1371 hi = get_attr(r, 'hi', '')
1372 res.append((type, gw, lo, hi))
1376 def init_route_config(lustre):
1377 """ Scan the lustre config looking for routers. Build list of
1379 global routes, router_flag
1381 list = lustre.getElementsByTagName('node')
1383 if get_attr(node, 'router'):
1385 for (local_type, local_nid) in local_node:
1387 netlist = node.getElementsByTagName('network')
1388 for dom_net in netlist:
1389 if local_type == get_attr(dom_net, 'type'):
1390 gw = get_text(dom_net, 'server')
1394 for dom_net in netlist:
1395 if local_type != get_attr(dom_net, 'type'):
1396 for route in get_routes(local_type, gw, dom_net):
1397 routes.append(route)
1402 for iface in local_node:
1403 if net.net_type == iface[0]:
1407 def find_route(net):
1408 global local_node, routes
1409 frm_type = local_node[0][0]
1410 to_type = net.net_type
1412 debug ('looking for route to', to_type,to)
1421 ############################################################
# startService(dom_node, module_flag): build the object that matches one
# service element and act on it.
#
# The service type is read with getServiceType(dom_node).  When
# config._start or config._stop is set, a separate chain of type tests is
# used for each of the values "client", "MDS" and "OST"; in those three
# chains the panic() for an unknown type is commented out.  The last
# visible chain still calls panic() for an unknown type.
# NOTE(review): many branches, and every use of module_flag, are on lines
# missing from this excerpt; only what is visible is described here.
# The local name `type` shadows the builtin.
1424 def startService(dom_node, module_flag):
1425 type = getServiceType(dom_node)
1426 debug('Service:', type, getName(dom_node), getUUID(dom_node))
1427 # there must be a more dynamic way of doing this...
# Restricted start/stop requested on the command line.
1429 if config._start or config._stop:
# Chain used when the requested role is "client".
1430 if config._start == "client" or config._stop == "client":
1435 elif type == 'network':
1436 n = Network(dom_node)
1441 elif type == 'mountpoint':
1442 n = Mountpoint(dom_node)
1445 #panic ("unknown service type:", type)
# Chain used when the requested role is "MDS".
1447 elif config._start == "MDS" or config._stop == "MDS":
1450 elif type == 'lovconfig':
1451 n = LOVConfig(dom_node)
1452 elif type == 'network':
1453 n = Network(dom_node)
1458 #panic ("unknown service type:", type)
# Chain used when the requested role is "OST".
1460 elif config._start == "OST" or config._stop == "OST":
1463 elif type == 'network':
1464 n = Network(dom_node)
1471 #panic ("unknown service type:", type)
# Presumably the unrestricted chain (its opening lines are not shown);
# here an unknown type is fatal.
1477 elif type == 'lovconfig':
1478 n = LOVConfig(dom_node)
1479 elif type == 'network':
1480 n = Network(dom_node)
1491 elif type == 'mountpoint':
1492 n = Mountpoint(dom_node)
1494 panic ("unknown service type:", type)
# The remaining tests select behaviour by config.cleanup() and
# config.nosetup(); the calls made under each test are not visible here.
1499 if config.cleanup():
1506 if config.nosetup():
1508 if config.cleanup():
1516 # Prepare the system to run lustre using a particular profile
1517 # in the configuration.
1518 # * load the modules
1519 # * set up networking for the current node
1520 # * make sure partitions are in place and prepared
1521 # * initialize devices with lctl
1522 # Levels are important, and need to be enforced.
# startProfile(lustreNode, profileNode, module_flag): call startService()
# for the services of one profile, passing module_flag through unchanged.
# NOTE(review): the test guarding the panic() below, the body of the
# cleanup branch and the loop header that binds `s` are on lines missing
# from this excerpt.
1523 def startProfile(lustreNode, profileNode, module_flag):
# Fatal error path; `profile` is bound on a line not shown here.
1525 panic("profile:", profile, "not found.")
1526 services = getServices(lustreNode, profileNode)
# Something differs when cleaning up -- presumably the service order;
# confirm against the full file.
1527 if config.cleanup():
# s[1] is handed to startService() as the service's DOM node; the meaning
# of s[0] is not visible here -- verify against getServices().
1530 startService(s[1], module_flag)
# doHost(lustreNode, hosts): configure (or clean up) the node entry that
# matches one of the names in `hosts`.
#
# Visible flow: look the node up by name with getByName(); report
# 'No host entry found.' on the failure path; call init_route_config()
# when the node is not flagged as a router; then run startProfile() over
# every 'profile' element twice, flipping module_flag between the passes.
# NOTE(review): the loop over `hosts`, the guard around the print and the
# lines around the gdb script handling are missing from this excerpt.
1535 def doHost(lustreNode, hosts):
1539 dom_node = getByName(lustreNode, h, 'node')
1544 print 'No host entry found.'
# Route discovery is only done for non-router nodes.
1547 if not get_attr(dom_node, 'router'):
1549 init_route_config(lustreNode)
1554 # Two step process: (1) load modules, (2) setup lustre
1555 # if not cleaning, load modules first.
# module_flag is true on the first pass when starting and false on the
# first pass when cleaning up.
1556 module_flag = not config.cleanup()
1557 reflist = dom_node.getElementsByTagName('profile')
1558 for profile in reflist:
1559 startProfile(lustreNode, profile, module_flag)
# Between the passes, when starting: set the debug path and have lctl
# write its module list to the gdb script file.
1561 if not config.cleanup():
1562 sys_set_debug_path()
1563 script = config.gdb_script()
1564 run(lctl.lctl, ' modules >', script)
1566 # dump /tmp/ogdb and sleep/pause here
1567 log ("The GDB module script is in", script)
# Second pass runs with the opposite module_flag.
1570 module_flag = not module_flag
1571 for profile in reflist:
1572 startProfile(lustreNode, profile, module_flag)
1574 ############################################################
1575 # Command line processing
# parse_cmdline(argv): parse the command line with getopt and record the
# selected options on the global `config` object.
#
# argv is the argument list without the program name (the caller passes
# sys.argv[1:]).  The caller assigns the result to `args` and indexes it,
# so the leftover positional arguments are presumably what is returned;
# the return statement itself is not visible in this excerpt.
# NOTE(review): the `try:` line, the loop header that binds `o` and `a`,
# and the body of most option tests are on lines missing from this excerpt.
1577 def parse_cmdline(argv):
# Short flags; none of them takes an argument (no ':' suffix).
1578 short_opts = "hdnvf"
# Long options; a trailing '=' marks an option that takes an argument.
1579 long_opts = ["ldap", "reformat", "lustre=", "verbose", "gdb",
1580 "portals=", "makeldiff", "cleanup", "noexec",
1581 "help", "node=", "nomod", "nosetup",
1582 "dump=", "force", "start=", "stop=", "startlevel=", "endlevel="]
1586 opts, args = getopt.getopt(argv, short_opts, long_opts)
# getopt.error is raised for an unknown option or a missing argument; the
# handler body is not shown.
1588 except getopt.error:
1593 if o in ("-h", "--help"):
1595 if o in ("-d","--cleanup"):
1597 if o in ("-v", "--verbose"):
1599 if o in ("-n", "--noexec"):
1602 if o == "--portals":
1606 if o == "--reformat":
1614 if o == "--nosetup":
1618 if o in ("-f", "--force"):
# The option's argument `a` is stored through config.startlevel().
1620 if o in ("--startlevel",):
1621 config.startlevel(a)
1622 if o in ("--endlevel",):
1636 s = urllib.urlopen(url)
def setupModulePath(cmd):
    """Set config.src_dir when a Makefile sits beside the given command.

    cmd is the path the script was invoked as (the caller passes
    sys.argv[0]).  If the directory containing it holds a readable
    Makefile, config.src_dir() is called with that directory followed by
    "/../../"; otherwise nothing happens.  Returns None.
    """
    # os.path.dirname() yields '' for a bare name such as "lconf"; without
    # this fallback the checks below would be made against "/Makefile" and
    # "/../../" (the filesystem root) instead of the current directory.
    base = os.path.dirname(cmd) or os.curdir
    if os.access(base + "/Makefile", os.R_OK):
        config.src_dir(base + "/../../")
# sys_set_debug_path(): write config.debug_path() to
# /proc/sys/portals/debug_path, after logging the value with debug().
# NOTE(review): the lines between the debug() call and the open(), and
# everything after the write (closing the file, error handling), are
# missing from this excerpt.
1647 def sys_set_debug_path():
1648 debug("debug path: ", config.debug_path())
1652 fp = open('/proc/sys/portals/debug_path', 'w')
1653 fp.write(config.debug_path())
1658 #/proc/sys/net/core/rmem_max
1659 #/proc/sys/net/core/wmem_max
# sys_set_netmem_max(path, max): write `max` to the file at `path` as a
# decimal number followed by a newline.
# The debug message says "to at least", which suggests the write only
# happens when the current value is smaller; that comparison is on lines
# missing from this excerpt -- confirm against the full file.
# The parameter name `max` shadows the builtin inside this function.
1660 def sys_set_netmem_max(path, max):
1661 debug("setting", path, "to at least", max)
1669 fp = open(path, 'w')
1670 fp.write('%d\n' %(max))
def sys_make_devices():
    """Create the /dev/portals and /dev/obd device nodes when absent.

    A node is treated as present when os.access() reports it readable.
    For each node that is not, the matching mknod command line is handed
    to run().  Nodes are handled in the order portals, then obd.
    """
    device_commands = (
        ('/dev/portals', 'mknod /dev/portals c 10 240'),
        ('/dev/obd', 'mknod /dev/obd c 10 241'),
    )
    for device, mknod_cmd in device_commands:
        if os.access(device, os.R_OK):
            continue
        run(mknod_cmd)
1681 # Add dir to the global PATH, if not already there.
# add_to_path(new_dir): append new_dir to the PATH environment variable.
# PATH is split on ':' and tested for an exact match with new_dir.
# NOTE(review): the statement under the `if` is missing from this
# excerpt; presumably it returns early so an existing entry is not added
# twice -- confirm against the full file.
1682 def add_to_path(new_dir):
1683 syspath = string.split(os.environ['PATH'], ':')
1684 if new_dir in syspath:
# The new directory goes at the end, so existing entries keep precedence.
1686 os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
# Directories that sanitise_path() iterates over.
1689 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
1690 # ensure basic elements are in the system path
# NOTE(review): the loop body is missing from this excerpt; presumably it
# calls add_to_path(dir) for each entry -- confirm against the full file.
# The loop variable `dir` shadows the builtin.
1691 def sanitise_path():
1692 for dir in DEFAULT_PATH:
1697 # Initialize or shutdown lustre according to a configuration file
1698 # * prepare the system for lustre
1699 # * configure devices with lctl
1700 # Shutdown does steps in reverse
# Body of the top-level driver routine.
# NOTE(review): the `def` line of this routine and several interior lines
# (else/elif headers, node_list initialisation, early exits) are missing
# from this excerpt; the comments below describe only what is visible.
1703 global TCP_ACCEPTOR, lctl, MAXTCPBUF
1704 host = socket.gethostname()
1708 args = parse_cmdline(sys.argv[1:])
# With --start/--stop the configuration is read from the fixed path
# /etc/lustre/config.xml.
1709 if config.start() or config.stop():
1710 if not os.access("/etc/lustre/config.xml", os.R_OK ):
1711 print 'File not found or readable:', "/etc/lustre/config.xml"
1713 dom = xml.dom.minidom.parse("/etc/lustre/config.xml")
# Otherwise the first positional argument names the config file ...
1715 if not os.access(args[0], os.R_OK):
1716 print 'File not found or readable:', args[0]
1718 dom = xml.dom.minidom.parse(args[0])
# ... or the XML text is fetched from config.url() and parsed from a string.
1720 xmldata = fetch(config.url())
1721 dom = xml.dom.minidom.parseString(xmldata)
# Candidate node names to look up in the config: config.node(), the local
# hostname and 'localhost' (the conditions choosing between them are not
# shown).
1727 node_list.append(config.node())
1730 node_list.append(host)
1731 node_list.append('localhost')
1732 debug("configuring for host: ", node_list)
# The debug path and gdb script names get the hostname appended.
1735 config._debug_path = config._debug_path + '-' + host
1736 config._gdb_script = config._gdb_script + '-' + host
# Locate the acceptor program; when it is missing the visible code either
# falls back to the bare name 'acceptor' or panics (the condition that
# picks between the two is not shown).
1738 TCP_ACCEPTOR = find_prog('acceptor')
1739 if not TCP_ACCEPTOR:
1741 TCP_ACCEPTOR = 'acceptor'
1742 debug('! acceptor not found')
1744 panic('acceptor not found')
1746 lctl = LCTLInterface('lctl')
1748 setupModulePath(sys.argv[0])
# Request socket buffer limits of at least MAXTCPBUF, then process the
# matching host entry.
1750 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
1751 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
1752 doHost(dom.documentElement, node_list)
# Script entry point, run only when the file is executed directly.
# LconfError and CommandError are caught here (Python 2 `except X, e`
# syntax).
# NOTE(review): the `try:` line, the call it guards and both handler
# bodies are missing from this excerpt.
1754 if __name__ == "__main__":
1757 except LconfError, e:
1759 except CommandError, e:
# first_cleanup_error holds the first non-zero code passed to
# cleanup_error(); when set, it becomes the process exit status.
1763 if first_cleanup_error:
1764 sys.exit(first_cleanup_error)