# Copyright (C) 2002-2003 Cluster File Systems, Inc.
# Authors: Robert Read <rread@clusterfs.com>
#          Mike Shaver <shaver@clusterfs.com>
# This file is part of Lustre, http://www.lustre.org.
#
# Lustre is free software; you can redistribute it and/or
# modify it under the terms of version 2 of the GNU General Public
# License as published by the Free Software Foundation.
#
# Lustre is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Lustre; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# lconf - lustre configuration tool
#
# lconf is the main driver script for starting and stopping
# lustre filesystem services.
#
# Based in part on the XML obdctl modifications done by Brian Behlendorf
import sys, getopt, types
import string, os, stat, popen2, socket, time, random, fcntl, select
import re, exceptions, signal, traceback
import xml.dom.minidom

if sys.version[0] == '1':
    from FCNTL import F_GETFL, F_SETFL
else:
    from fcntl import F_GETFL, F_SETFL

PYMOD_DIR = "/usr/lib/lustre/python"

def development_mode():
    base = os.path.dirname(sys.argv[0])
    if os.access(base + "/Makefile.am", os.R_OK):
        return 1
    return 0

if not development_mode():
    sys.path.append(PYMOD_DIR)
MAXTCPBUF = 1048576
DEFAULT_TCPBUF = 1048576

# Maximum number of devices to search for.
# (the /dev/loop* nodes need to be created beforehand)
MAX_LOOP_DEVICES = 256
PORTALS_DIR = 'portals'
# Please keep these in sync with the values in portals/kp30.h
ptldebug_names = {
    "warning"   : (1 << 10),
    "portals"   : (1 << 14),
    "dlmtrace"  : (1 << 16),
    "rpctrace"  : (1 << 20),
    "vfstrace"  : (1 << 21),
    }

subsystem_names = {
    "undefined" : (0 << 24),
    "ext2obd"   : (9 << 24),
    "portals"   : (10 << 24),
    "socknal"   : (11 << 24),
    "qswnal"    : (12 << 24),
    "pinger"    : (13 << 24),
    "filter"    : (14 << 24),
    "trace"     : (15 << 24),
    "gmnal"     : (19 << 24),
    "ptlrouter" : (20 << 24),
    "ptlbd"     : (22 << 24),
    }
first_cleanup_error = 0
def cleanup_error(rc):
    global first_cleanup_error
    if not first_cleanup_error:
        first_cleanup_error = rc
# ============================================================
# debugging and error funcs

def fixme(msg = "this feature"):
    raise Lustre.LconfError, msg + ' not implemented yet.'

def panic(*args):
    msg = string.join(map(str, args))
    if not config.noexec:
        raise Lustre.LconfError(msg)

def log(*args):
    msg = string.join(map(str, args))
    print msg

def logall(msgs):
    for s in msgs:
        print string.strip(s)

def debug(*args):
    if config.verbose:
        msg = string.join(map(str, args))
        print msg

# ack, python's builtin int() does not support '0x123' syntax.
# eval can do it, although what a hack!
def my_int(s):
    try:
        return eval(s, {}, {})
    except SyntaxError, e:
        raise ValueError("not a number")
    except NameError, e:
        raise ValueError("not a number")
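# A sketch of the behavior this hack provides:
#   int('0x123')     raises ValueError (no base given)
#   my_int('0x123')  -> 291
#   my_int('123')    -> 123
#   my_int('abc')    raises ValueError (eval raises NameError)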
# ============================================================
# locally defined exceptions
class CommandError (exceptions.Exception):
    def __init__(self, cmd_name, cmd_err, rc=None):
        self.cmd_name = cmd_name
        self.cmd_err = cmd_err
        self.rc = rc

    def dump(self):
        if type(self.cmd_err) == types.StringType:
            if self.rc:
                print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
            else:
                print "! %s: %s" % (self.cmd_name, self.cmd_err)
        elif type(self.cmd_err) == types.ListType:
            if self.rc:
                print "! %s (error %d):" % (self.cmd_name, self.rc)
            else:
                print "! %s:" % (self.cmd_name)
            for s in self.cmd_err:
                print "> %s" % (string.strip(s))
        else:
            print self.cmd_err
# ============================================================
# handle daemons, like the acceptor
class DaemonHandler:
    """ Manage starting and stopping a daemon. Assumes daemon manages
    its own pid file. """

    def __init__(self, cmd):
        self.command = cmd
        self.path = ""

    def start(self):
        if self.running():
            log(self.command, "already running.")
            return
        self.path = find_prog(self.command)
        if not self.path:
            panic(self.command, "not found.")
        ret, out = runcmd(self.path + ' ' + self.command_line())
        if ret:
            raise CommandError(self.path, out, ret)

    def stop(self):
        if self.running():
            pid = self.read_pidfile()
            try:
                log("killing process", pid)
                os.kill(pid, 15)
                #time.sleep(1) # let daemon die
            except OSError, e:
                log("unable to kill", self.command, e)
            if self.running():
                log("unable to kill", self.command)

    def running(self):
        pid = self.read_pidfile()
        if pid:
            try:
                os.kill(pid, 0)
            except OSError:
                self.clean_pidfile()
            else:
                return 1
        return 0

    def read_pidfile(self):
        try:
            fp = open(self.pidfile(), 'r')
            pid = int(fp.read())
            fp.close()
            return pid
        except IOError:
            return 0

    def clean_pidfile(self):
        """ Remove a stale pidfile """
        log("removing stale pidfile:", self.pidfile())
        try:
            os.unlink(self.pidfile())
        except OSError, e:
            log(self.pidfile(), e)
class AcceptorHandler(DaemonHandler):
    def __init__(self, port, net_type, send_mem, recv_mem, irq_aff, nid_xchg):
        DaemonHandler.__init__(self, "acceptor")
        self.port = port
        self.flags = ''
        self.send_mem = send_mem
        self.recv_mem = recv_mem

        if net_type == 'toe':
            self.flags = self.flags + ' -N 4'
        if irq_aff:
            self.flags = self.flags + ' -i'
        if nid_xchg:
            self.flags = self.flags + ' -x'

    def pidfile(self):
        return "/var/run/%s-%d.pid" % (self.command, self.port)

    def command_line(self):
        return string.join(map(str, ('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port)))
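    # A sketch of the result, assuming port 988 and the default buffer sizes:
    #   command_line() -> '-s 1048576 -r 1048576  988'
    # with ' -i' and/or ' -x' appearing in the flags slot when irq affinity
    # or nid exchange are enabled for this network.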
acceptors = {}

# start the acceptors
def run_acceptors():
    for port in acceptors.keys():
        daemon = acceptors[port]
        if not daemon.running():
            daemon.start()

def run_one_acceptor(port):
    if acceptors.has_key(port):
        daemon = acceptors[port]
        if not daemon.running():
            daemon.start()
    else:
        panic("run_one_acceptor: No acceptor defined for port:", port)

def stop_acceptor(port):
    if acceptors.has_key(port):
        daemon = acceptors[port]
        if daemon.running():
            daemon.stop()
# ============================================================
# handle lctl interface
class LCTLInterface:
    """
    Manage communication with lctl
    """

    def __init__(self, cmd):
        """
        Initialize by finding the lctl binary.
        """
        self.lctl = find_prog(cmd)
        self.save_file = ''
        if not self.lctl:
            if config.noexec:
                debug('! lctl not found')
                self.lctl = 'lctl'
            else:
                raise CommandError('lctl', "unable to find lctl binary.")
    def use_save_file(self, file):
        self.save_file = file

    def set_nonblock(self, fd):
        fl = fcntl.fcntl(fd, F_GETFL)
        fcntl.fcntl(fd, F_SETFL, fl | os.O_NDELAY)
    def run(self, cmds):
        """
        run lctl
        the cmds are written to stdin of lctl
        lctl doesn't return errors when run in script mode, so we
        should modify the command line to accept multiple commands, or
        create complex command line options
        """
        cmd_line = self.lctl
        if self.save_file:
            cmds = '\n  dump ' + self.save_file + cmds

        debug("+", cmd_line, cmds)
        if config.noexec: return (0, [])

        child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
        child.tochild.write(cmds + "\n")
        child.tochild.close()

        # From "Python Cookbook" from O'Reilly
        outfile = child.fromchild
        outfd = outfile.fileno()
        self.set_nonblock(outfd)
        errfile = child.childerr
        errfd = errfile.fileno()
        self.set_nonblock(errfd)

        outdata = errdata = ''
        outeof = erreof = 0
        while 1:
            ready = select.select([outfd, errfd], [], []) # Wait for input
            if outfd in ready[0]:
                outchunk = outfile.read()
                if outchunk == '': outeof = 1
                outdata = outdata + outchunk
            if errfd in ready[0]:
                errchunk = errfile.read()
                if errchunk == '': erreof = 1
                errdata = errdata + errchunk
            if outeof and erreof: break
        # end of "borrowed" code

        ret = child.wait()
        if os.WIFEXITED(ret):
            rc = os.WEXITSTATUS(ret)
        else:
            rc = 0
        if rc or len(errdata):
            raise CommandError(self.lctl, errdata, rc)
        return rc, outdata
    def runcmd(self, *args):
        """
        run lctl using the command line
        """
        cmd = string.join(map(str, args))
        debug("+", self.lctl, cmd)
        rc, out = run(self.lctl, cmd)
        if rc:
            raise CommandError(self.lctl, out, rc)
        return rc, out
    def network(self, net, nid):
        """ initialize the network and add "self" """
        cmds = """
  network %s
  mynid %s
  quit """ % (net, nid)
        self.run(cmds)
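    # A sketch of the script the above writes to lctl's stdin, e.g. for
    # network('tcp', '10.0.0.1'):
    #   network tcp
    #   mynid 10.0.0.1
    #   quit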
    # create a new connection
    def connect(self, srv):
        cmds = "\n  add_uuid %s %s %s" % (srv.uuid, srv.nid, srv.net_type)
        if srv.net_type in ('tcp', 'toe') and not config.lctl_dump:
            flags = 's'
            if srv.irq_affinity:
                flags = flags + 'i'
            if srv.nid_exchange:
                flags = flags + 'x'
            cmds = """%s
  network %s
  send_mem %d
  recv_mem %d
  add_autoconn %s %s %d %s""" % (cmds, srv.net_type,
                                 srv.send_mem, srv.recv_mem,
                                 srv.nid, srv.hostaddr, srv.port, flags )

        cmds = cmds + "\n  quit"
        self.run(cmds)

    # recover a device
    def recover(self, dev_name, new_conn):
        cmds = """
  device $%s
  recover %s""" % (dev_name, new_conn)
        self.run(cmds)
    # add a route to a range
    def add_route(self, net, gw, lo, hi):
        cmds = """
  network %s
  add_route %s %s %s
  quit """ % (net, gw, lo, hi)
        try:
            self.run(cmds)
        except CommandError, e:
            log("ignore: ")
            e.dump()

    def del_route(self, net, gw, lo, hi):
        cmds = """
  ignore_errors
  network %s
  del_route %s %s %s
  quit """ % (net, gw, lo, hi)
        self.run(cmds)

    # add a route to a host
    def add_route_host(self, net, uuid, gw, tgt):
        cmds = """
  network %s
  add_uuid %s %s %s
  add_route %s %s
  quit """ % (net, uuid, tgt, net, gw, tgt)
        try:
            self.run(cmds)
        except CommandError, e:
            log("ignore: ")
            e.dump()

    # delete a route to a host
    def del_route_host(self, net, uuid, gw, tgt):
        cmds = """
  ignore_errors
  network %s
  del_uuid %s
  del_route %s %s
  quit """ % (net, uuid, gw, tgt)
        self.run(cmds)

    # disconnect one connection
    def disconnect(self, srv):
        cmds = "  ignore_errors\n  del_uuid %s" % (srv.uuid)
        if srv.net_type in ('tcp', 'toe') and not config.lctl_dump:
            cmds = """%s
  network %s
  del_autoconn %s %s s""" % (cmds, srv.net_type,
                             srv.nid, srv.hostaddr)
        cmds = cmds + "\n  quit"
        self.run(cmds)

    def del_uuid(self, servuuid):
        cmds = """
  ignore_errors
  del_uuid %s
  quit""" % (servuuid,)
        self.run(cmds)
    def disconnectAll(self, net):
        cmds = """
  ignore_errors
  network %s
  disconnect
  quit""" % (net)
        self.run(cmds)

    # create a new device with lctl
    def newdev(self, attach, setup = ""):
        cmds = """
  newdev
  attach %s
  setup %s
  quit""" % (attach, setup)
        self.run(cmds)

    # cleanup a device
    def cleanup(self, name, uuid, force, failover = 0):
        if failover: force = 1
        cmds = """
  ignore_errors
  device $%s
  cleanup %s %s
  detach
  quit""" % (name, ('', 'force')[force],
             ('', 'failover')[failover])
        self.run(cmds)

    # create an lov
    def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off,
                      pattern, devlist):
        cmds = """
  device $%s
  probe
  lov_setconfig %s %d %d %d %s %s
  quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
        self.run(cmds)

    # dump the kernel debug log
    def dump(self, dump_file):
        cmds = """
  debug_kernel %s 1
  quit""" % (dump_file)
        self.run(cmds)

    # get list of devices
    def device_list(self):
        try:
            rc, out = self.runcmd('device_list')
        except CommandError, e:
            out = []
        return out

    # get lustre version
    def lustre_version(self):
        rc, out = self.runcmd('version')
        return out

    # pass mount options to llite for a later mount
    def mount_option(self, option):
        cmds = """
  mount_option %s
  quit""" % (option)
        self.run(cmds)
# ============================================================
# Various system-level functions
# (ideally moved to their own module)

# Run a command and return the output and status.
# stderr is sent to /dev/null, could use popen3 to
# save it if necessary
def runcmd(cmd):
    debug("+", cmd)
    if config.noexec: return (0, [])
    f = os.popen(cmd + ' 2>&1')
    out = f.readlines()
    ret = f.close()
    if ret:
        ret = ret >> 8
    else:
        ret = 0
    return (ret, out)

def run(*args):
    cmd = string.join(map(str, args))
    return runcmd(cmd)

# Run a command in the background.
def run_daemon(*args):
    cmd = string.join(map(str, args))
    debug("+", cmd)
    if config.noexec: return 0
    f = os.popen(cmd + ' 2>&1')
    ret = f.close()
    if ret:
        ret = ret >> 8
    else:
        ret = 0
    return ret

# Determine full path to use for an external command
# searches dirname(argv[0]) first, then PATH
def find_prog(cmd):
    syspath = string.split(os.environ['PATH'], ':')
    cmdpath = os.path.dirname(sys.argv[0])
    syspath.insert(0, cmdpath)
    if config.portals:
        syspath.insert(0, os.path.join(config.portals, 'utils/'))
    for d in syspath:
        prog = os.path.join(d, cmd)
        if os.access(prog, os.X_OK):
            return prog
    return ''
# Recursively look for file starting at base dir
def do_find_file(base, mod):
    fullname = os.path.join(base, mod)
    if os.access(fullname, os.R_OK):
        return fullname
    for d in os.listdir(base):
        dir = os.path.join(base, d)
        if os.path.isdir(dir):
            module = do_find_file(dir, mod)
            if module:
                return module

def find_module(src_dir, dev_dir, modname):
    mod = '%s.o' % (modname)
    module = src_dir + '/' + dev_dir + '/' + mod
    try:
        if os.access(module, os.R_OK):
            return module
    except OSError:
        pass
    return None

# is the path a block device?
def is_block(path):
    s = ()
    try:
        s = os.stat(path)
    except OSError:
        return 0
    return stat.S_ISBLK(s[stat.ST_MODE])
# build fs according to type
def mkfs(dev, devsize, fstype, jsize):
    block_cnt = ''
    jopt = ''
    if devsize:
        if devsize < 8000:
            panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s" %
                  (dev, devsize))
        # devsize is in 1k, and fs block count is in 4k
        block_cnt = devsize / 4

    if fstype in ('ext3', 'extN'):
        # ext3 journal size is in megabytes
        if jsize: jopt = "-J size=%d" % (jsize,)
        mkfs = 'mkfs.ext2 -j -b 4096 -F '
    elif fstype == 'reiserfs':
        # reiserfs journal size is in blocks
        if jsize: jopt = "--journal_size %d" % (jsize,)
        mkfs = 'mkreiserfs -ff'
    else:
        print 'unsupported fs type: ', fstype

    (ret, out) = run(mkfs, jopt, dev, block_cnt)
    if ret:
        panic("Unable to build fs:", dev, string.join(out))
    # enable hash tree indexing on fs
    if fstype in ('ext3', 'extN'):
        htree = 'echo "feature FEATURE_C5" | debugfs -w'
        (ret, out) = run(htree, dev)
        if ret:
            panic("Unable to enable htree:", dev)
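# A worked example (a sketch): for an ext3 target with devsize=1048576
# (1GB, in 1k units) and jsize=64, the command run above is roughly
#   mkfs.ext2 -j -b 4096 -F -J size=64 <dev> 262144
# since 1048576 / 4 = 262144 four-kilobyte blocks.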
# some systems use /dev/loopN, some /dev/loop/N
def loop_base():
    import re
    loop = '/dev/loop'
    if not os.access(loop + str(0), os.R_OK):
        loop = loop + '/'
        if not os.access(loop + str(0), os.R_OK):
            panic("can't access loop devices")
    return loop

# find loop device assigned to the file
def find_loop(file):
    loop = loop_base()
    for n in xrange(0, MAX_LOOP_DEVICES):
        dev = loop + str(n)
        if os.access(dev, os.R_OK):
            (stat, out) = run('losetup', dev)
            if out and stat == 0:
                m = re.search(r'\((.*)\)', out[0])
                if m and file == m.group(1):
                    return dev
        else:
            break
    return ''
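# For reference, a sketch of the losetup output find_loop() parses
# (format assumed from util-linux losetup of that era):
#   /dev/loop0: [0302]:1234 (/tmp/ost1-data)
# The re.search(r'\((.*)\)') pulls '/tmp/ost1-data' from the parentheses.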
# create file if necessary and assign the first free loop device
def init_loop(file, size, fstype, journal_size):
    dev = find_loop(file)
    if dev:
        print 'WARNING file:', file, 'already mapped to', dev
        return dev
    if config.reformat or not os.access(file, os.R_OK | os.W_OK):
        if size < 8000:
            panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (file, size))
        (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" % (size,
                                                                          file))
        if ret:
            panic("Unable to create backing store:", file)
        mkfs(file, size, fstype, journal_size)

    loop = loop_base()
    # find next free loop
    for n in xrange(0, MAX_LOOP_DEVICES):
        dev = loop + str(n)
        if os.access(dev, os.R_OK):
            (stat, out) = run('losetup', dev)
            if stat:
                run('losetup', dev, file)
                return dev
        else:
            print "out of loop devices"
            return ''
    print "out of loop devices"
    return ''
# undo loop assignment
def clean_loop(file):
    dev = find_loop(file)
    if dev:
        ret, out = run('losetup -d', dev)
        if ret:
            log('unable to clean loop device:', dev, 'for file:', file)
            logall(out)
# determine if dev is formatted as a <fstype> filesystem
def need_format(fstype, dev):
    # FIXME don't know how to implement this
    return 0

# initialize a block device if needed
def block_dev(dev, size, fstype, format, journal_size):
    if config.noexec: return dev
    if not is_block(dev):
        dev = init_loop(dev, size, fstype, journal_size)
    elif config.reformat or (need_format(fstype, dev) and format == 'yes'):
        mkfs(dev, size, fstype, journal_size)

#    else:
#        panic("device:", dev,
#              "not prepared, and autoformat is not set.\n",
#              "Rerun with --reformat option to format ALL filesystems")
    return dev
755 """lookup IP address for an interface"""
756 rc, out = run("/sbin/ifconfig", iface)
759 addr = string.split(out[1])[1]
760 ip = string.split(addr, ':')[1]
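# A sketch of the ifconfig output if2addr() assumes (2.4-era net-tools):
#   eth0      Link encap:Ethernet  HWaddr ...
#             inet addr:192.168.1.10  Bcast:192.168.1.255  Mask:255.255.255.0
# Splitting line 2 on whitespace gives 'addr:192.168.1.10'; splitting that
# on ':' yields the IP.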
def sys_get_local_nid(net_type, wildcard, cluster_id):
    """Return the local nid."""
    local = ""
    if os.access('/proc/elan/device0/position', os.R_OK):
        local = sys_get_local_address('elan', '*', cluster_id)
    else:
        local = sys_get_local_address(net_type, wildcard, cluster_id)
    return local

def sys_get_local_address(net_type, wildcard, cluster_id):
    """Return the local address for the network type."""
    local = ""
    if net_type in ('tcp', 'toe'):
        if '*' in wildcard:
            iface, star = string.split(wildcard, ':')
            local = if2addr(iface)
            if not local:
                panic("unable to determine ip for:", wildcard)
        else:
            host = socket.gethostname()
            local = socket.gethostbyname(host)
    elif net_type == 'elan':
        # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
        try:
            fp = open('/proc/elan/device0/position', 'r')
            lines = fp.readlines()
            fp.close()
            for l in lines:
                a = string.split(l)
                if a[0] == 'NodeId':
                    elan_id = a[1]
                    break
            try:
                nid = my_int(cluster_id) + my_int(elan_id)
                local = "%d" % (nid)
            except ValueError, e:
                local = elan_id
        except IOError, e:
            log(e)
    elif net_type == 'gm':
        fixme("automatic local address for GM")
    elif net_type == 'scimac':
        scinode = "/opt/scali/sbin/scinode"
        if os.path.exists(scinode):
            (rc, local) = run(scinode)
        else:
            panic(scinode, " not found on node with scimac networking")
        if rc:
            panic(scinode, " failed")
        local = string.rstrip(local[0])
    return local
# XXX: instead of device_list, ask for $name and see what we get
def is_prepared(name):
    """Return true if a device exists for the name"""
    if config.lctl_dump:
        return 0
    if config.noexec and config.cleanup:
        return 1
    try:
        # expect this format:
        # 1 UP ldlm ldlm ldlm_UUID 2
        out = lctl.device_list()
        for s in out:
            if name == string.split(s)[3]:
                return 1
    except CommandError, e:
        e.dump()
    return 0

def is_network_prepared():
    """If the LDLM device exists, then assume that all networking
       has been configured"""
    return is_prepared('ldlm')
def fs_is_mounted(path):
    """Return true if path is a mounted lustre filesystem"""
    try:
        fp = open('/proc/mounts')
        lines = fp.readlines()
        fp.close()
        for l in lines:
            a = string.split(l)
            if a[1] == path and a[2] == 'lustre_lite':
                return 1
    except IOError, e:
        log(e)
    return 0
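# A sketch of the /proc/mounts line fs_is_mounted() matches
# (fields: device, mountpoint, fstype, options, dump, pass):
#   none /mnt/lustre lustre_lite rw 0 0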
# ============================================================
# Classes to prepare and cleanup the various objects

class Module:
    """ Base class for the rest of the modules. The default cleanup method is
    defined here, as well as some utility funcs.
    """
    def __init__(self, module_name, db):
        self.db = db
        self.module_name = module_name
        self.name = self.db.getName()
        self.uuid = self.db.getUUID()
        self.kmodule_list = []

    def info(self, *args):
        msg = string.join(map(str, args))
        print self.module_name + ":", self.name, self.uuid, msg

    def cleanup(self):
        """ default cleanup, used for most modules """
        self.info()
        try:
            lctl.cleanup(self.name, self.uuid, config.force)
        except CommandError, e:
            log(self.module_name, "cleanup failed: ", self.name)
            e.dump()
            cleanup_error(e.rc)

    def add_portals_module(self, dev_dir, modname):
        """Append a module to list of modules to load."""
        self.kmodule_list.append((config.portals, dev_dir, modname))

    def add_lustre_module(self, dev_dir, modname):
        """Append a module to list of modules to load."""
        self.kmodule_list.append((config.lustre, dev_dir, modname))

    def mod_loaded(self, modname):
        """Check if a module is already loaded. Look in /proc/modules for it."""
        fp = open('/proc/modules')
        lines = fp.readlines()
        fp.close()
        # please forgive my tired fingers for this one
        ret = filter(lambda word, mod=modname: word == mod,
                     map(lambda line: string.split(line)[0], lines))
        return ret
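    # An equivalent sketch of the filter/map idiom above:
    #   loaded = map(lambda line: string.split(line)[0], lines)
    #   ret = filter(lambda w, mod=modname: w == mod, loaded)
    # i.e. mod_loaded() returns a non-empty list exactly when modname
    # appears in the first column of /proc/modules.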
    def load_module(self):
        """Load all the modules in the list in the order they appear."""
        for src_dir, dev_dir, mod in self.kmodule_list:
            #  (rc, out) = run ('/sbin/lsmod | grep -s', mod)
            if self.mod_loaded(mod) and not config.noexec:
                continue
            log('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
            if src_dir:
                module = find_module(src_dir, dev_dir, mod)
                if not module:
                    panic('module not found:', mod)
                (rc, out) = run('/sbin/insmod', module)
                if rc:
                    raise CommandError('insmod', out, rc)
            else:
                (rc, out) = run('/sbin/modprobe', mod)
                if rc:
                    raise CommandError('modprobe', out, rc)

    def cleanup_module(self):
        """Unload the modules in the list in reverse order."""
        if not self.safe_to_clean():
            return
        rev = self.kmodule_list
        rev.reverse()
        for src_dir, dev_dir, mod in rev:
            if not self.mod_loaded(mod) and not config.noexec:
                continue
            # debug hack
            if mod == 'portals' and config.dump:
                lctl.dump(config.dump)
            log('unloading module:', mod)
            (rc, out) = run('/sbin/rmmod', mod)
            if rc:
                log('! unable to unload module:', mod)
                logall(out)

    def safe_to_clean(self):
        return 1

    def safe_to_clean_modules(self):
        return self.safe_to_clean()
class Network(Module):
    def __init__(self, db):
        Module.__init__(self, 'NETWORK', db)
        self.net_type = self.db.get_val('nettype')
        self.nid = self.db.get_val('nid', '*')
        self.cluster_id = self.db.get_val('clusterid', "0")
        self.port = self.db.get_val_int('port', 0)
        self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
        self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
        self.irq_affinity = self.db.get_val_int('irqaffinity', 0)
        self.nid_exchange = self.db.get_val_int('nidexchange', 0)

        if '*' in self.nid:
            if self.nid_exchange:
                self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
            else:
                self.nid = sys_get_local_address(self.net_type, self.nid, self.cluster_id)
            if not self.nid:
                panic("unable to set nid for", self.net_type, self.nid, self.cluster_id)
            debug("nid:", self.nid)

        self.hostaddr = self.db.get_val('hostaddr', self.nid)
        if '*' in self.hostaddr:
            self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id)
            if not self.hostaddr:
                panic("unable to set hostaddr for", self.net_type, self.hostaddr, self.cluster_id)
            debug("hostaddr:", self.hostaddr)

        self.add_portals_module("libcfs", 'portals')
        if node_needs_router():
            self.add_portals_module("router", 'kptlrouter')
        if self.net_type == 'tcp':
            self.add_portals_module("knals/socknal", 'ksocknal')
        if self.net_type == 'toe':
            self.add_portals_module("knals/toenal", 'ktoenal')
        if self.net_type == 'elan':
            self.add_portals_module("knals/qswnal", 'kqswnal')
        if self.net_type == 'gm':
            self.add_portals_module("knals/gmnal", 'kgmnal')
        if self.net_type == 'scimac':
            self.add_portals_module("knals/scimacnal", 'kscimacnal')
    def prepare(self):
        if is_network_prepared():
            return
        self.info(self.net_type, self.nid, self.port)
        lctl.network(self.net_type, self.nid)
        if self.port and node_is_router():
            run_one_acceptor(self.port)
            self.connect_peer_gateways()

    def connect_peer_gateways(self):
        for router in self.db.lookup_class('node'):
            if router.get_val_int('router', 0):
                for netuuid in router.get_networks():
                    net = self.db.lookup(netuuid)
                    gw = Network(net)
                    if (gw.cluster_id == self.cluster_id and
                        gw.net_type == self.net_type):
                        if gw.nid != self.nid:
                            lctl.connect(gw)

    def disconnect_peer_gateways(self):
        for router in self.db.lookup_class('node'):
            if router.get_val_int('router', 0):
                for netuuid in router.get_networks():
                    net = self.db.lookup(netuuid)
                    gw = Network(net)
                    if (gw.cluster_id == self.cluster_id and
                        gw.net_type == self.net_type):
                        if gw.nid != self.nid:
                            try:
                                lctl.disconnect(gw)
                            except CommandError, e:
                                print "disconnect failed: ", self.name
                                e.dump()
                                cleanup_error(e.rc)

    def safe_to_clean(self):
        return not is_network_prepared()

    def cleanup(self):
        self.info(self.net_type, self.nid, self.port)
        if self.port:
            stop_acceptor(self.port)
        if node_is_router():
            self.disconnect_peer_gateways()
        try:
            lctl.disconnectAll(self.net_type)
        except CommandError, e:
            print "disconnectAll failed: ", self.name
            e.dump()
            cleanup_error(e.rc)
class RouteTable(Module):
    def __init__(self, db):
        Module.__init__(self, 'ROUTES', db)

    def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi):
        # only setup connections for tcp NALs
        srvdb = None
        if not net_type in ('tcp', 'toe'):
            return None

        # connect to target if route is to single node and this node is the gw
        if lo == hi and local_interface(net_type, gw_cluster_id, gw):
            if not local_cluster(net_type, tgt_cluster_id):
                panic("target", lo, "not on the local cluster")
            srvdb = self.db.nid2server(lo, net_type)
        # connect to gateway if this node is not the gw
        elif (local_cluster(net_type, gw_cluster_id)
              and not local_interface(net_type, gw_cluster_id, gw)):
            srvdb = self.db.nid2server(gw, net_type)
        else:
            return None

        if not srvdb:
            panic("no server for nid", lo)
            return None

        return Network(srvdb)

    def prepare(self):
        if is_network_prepared():
            return
        self.info()
        for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
            lctl.add_route(net_type, gw, lo, hi)
            srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
            if srv:
                lctl.connect(srv)

    def safe_to_clean(self):
        return not is_network_prepared()

    def cleanup(self):
        if is_network_prepared():
            # the network is still being used, don't clean it up
            return
        for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
            srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
            if srv:
                try:
                    lctl.disconnect(srv)
                except CommandError, e:
                    print "disconnect failed: ", self.name
                    e.dump()
                    cleanup_error(e.rc)
            try:
                lctl.del_route(net_type, gw, lo, hi)
            except CommandError, e:
                print "del_route failed: ", self.name
                e.dump()
                cleanup_error(e.rc)
class Management(Module):
    def __init__(self, db):
        Module.__init__(self, 'MGMT', db)
        self.add_lustre_module('obdclass', 'obdclass')
        self.add_lustre_module('ptlrpc', 'ptlrpc')
        self.add_lustre_module('ldlm', 'ldlm')
        self.add_lustre_module('mgmt', 'mgmt_svc')

    def prepare(self):
        if is_prepared(self.name):
            return
        self.info()
        lctl.newdev(attach="mgmt %s %s" % (self.name, self.uuid))

    def safe_to_clean(self):
        return 1

    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
class LDLM(Module):
    def __init__(self, db):
        Module.__init__(self, 'LDLM', db)
        self.add_lustre_module('obdclass', 'obdclass')
        self.add_lustre_module('ptlrpc', 'ptlrpc')
        self.add_lustre_module('ldlm', 'ldlm')

    def prepare(self):
        if is_prepared(self.name):
            return
        self.info()
        lctl.newdev(attach="ldlm %s %s" % ('ldlm', 'ldlm_UUID'))

    def safe_to_clean(self):
        out = lctl.device_list()
        return len(out) <= 1

    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
class LOV(Module):
    def __init__(self, db, uuid, fs_name):
        Module.__init__(self, 'LOV', db)
        self.add_lustre_module('mdc', 'mdc')
        self.add_lustre_module('lov', 'lov')
        self.mds_uuid = self.db.get_first_ref('mds')
        mds = self.db.lookup(self.mds_uuid)
        self.mds_name = mds.getName()
        self.stripe_sz = self.db.get_val_int('stripesize', 65536)
        self.stripe_off = self.db.get_val_int('stripeoffset', 0)
        self.pattern = self.db.get_val_int('stripepattern', 0)
        self.devlist = self.db.get_refs('obd')
        self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
        self.osclist = []
        self.client_uuid = generate_client_uuid(self.name)
        self.fs_name = fs_name
        self.mdc = get_mdc(db, self.client_uuid, fs_name, self.mds_uuid)
        for obd_uuid in self.devlist:
            obd = self.db.lookup(obd_uuid)
            osc = get_osc(obd, self.client_uuid, fs_name)
            if osc:
                self.osclist.append(osc)
            else:
                panic('osc not found:', obd_uuid)

    def prepare(self):
        if is_prepared(self.name):
            return
        for osc in self.osclist:
            try:
                # Only ignore connect failures with --force, which
                # isn't implemented here yet.
                osc.prepare(ignore_connect_failure=0)
            except CommandError, e:
                print "Error preparing OSC %s\n" % osc.uuid
                raise e
        self.mdc.prepare()
        self.mdc_name = self.mdc.name
        self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
                  self.stripe_off, self.pattern, self.devlist, self.mds_name)
        lctl.newdev(attach="lov %s %s" % (self.name, self.uuid),
                    setup="%s" % (self.mdc_name))

    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
        for osc in self.osclist:
            osc.cleanup()
        mdc = get_mdc(self.db, self.client_uuid, self.fs_name, self.mds_uuid)
        mdc.cleanup()

    def load_module(self):
        for osc in self.osclist:
            osc.load_module()
            break
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        for osc in self.osclist:
            osc.cleanup_module()
            break
class LOVConfig(Module):
    def __init__(self, db):
        Module.__init__(self, 'LOVConfig', db)
        self.lov_uuid = self.db.get_first_ref('lov')
        l = self.db.lookup(self.lov_uuid)
        self.lov = LOV(l, "YOU_SHOULD_NEVER_SEE_THIS_UUID", '')

    def prepare(self):
        lov = self.lov
        self.info(lov.mds_uuid, lov.stripe_cnt, lov.stripe_sz, lov.stripe_off,
                  lov.pattern, lov.devlist, lov.mds_name)
        lctl.lov_setconfig(lov.uuid, lov.mds_name, lov.stripe_cnt,
                           lov.stripe_sz, lov.stripe_off, lov.pattern,
                           string.join(lov.devlist))

    def cleanup(self):
        # nothing to do here
        pass
class MDSDEV(Module):
    def __init__(self, db):
        Module.__init__(self, 'MDSDEV', db)
        self.devpath = self.db.get_val('devpath', '')
        self.size = self.db.get_val_int('devsize', 0)
        self.journal_size = self.db.get_val_int('journalsize', 0)
        self.fstype = self.db.get_val('fstype', '')
        self.nspath = self.db.get_val('nspath', '')
        # overwrite the original MDSDEV name and uuid with the MDS name and uuid
        target_uuid = self.db.get_first_ref('target')
        mds = self.db.lookup(target_uuid)
        self.name = mds.getName()
        self.lovconfig_uuids = mds.get_refs('lovconfig')
        self.filesystem_uuids = mds.get_refs('filesystem')
        # FIXME: if fstype not set, then determine based on kernel version
        self.format = self.db.get_val('autoformat', "no")
        if mds.get_val('failover', 0):
            self.failover_mds = 'f'
        else:
            self.failover_mds = 'n'
        active_uuid = get_active_target(mds)
        if not active_uuid:
            panic("No target device found:", target_uuid)
        if active_uuid == self.uuid:
            self.active = 1
        else:
            self.active = 0
        if self.active and config.group and config.group != mds.get_val('group'):
            self.active = 0

        self.target_dev_uuid = self.uuid
        self.uuid = target_uuid
        # modules
        self.add_lustre_module('mds', 'mds')
        if self.fstype:
            self.add_lustre_module('obdclass', 'fsfilt_%s' % (self.fstype))

    def load_module(self):
        if self.active:
            Module.load_module(self)

    def prepare(self):
        if is_prepared(self.name):
            return
        if not self.active:
            debug(self.uuid, "not active")
            return
        self.info(self.devpath, self.fstype, self.format)
        run_acceptors()
        blkdev = block_dev(self.devpath, self.size, self.fstype, self.format,
                           self.journal_size)
        if not is_prepared('MDT'):
            lctl.newdev(attach="mdt %s %s" % ('MDT', 'MDT_UUID'),
                        setup="")
        if self.nspath:
            run("mkdir", self.nspath)
        lctl.newdev(attach="mds %s %s" % (self.name, self.uuid),
                    setup="%s %s %s" % (blkdev, self.fstype, self.nspath))
        for uuid in self.lovconfig_uuids:
            db = self.db.lookup(uuid)
            lovconfig = LOVConfig(db)
            lovconfig.prepare()
        if config.mds_ost_conn:
            for uuid in self.filesystem_uuids:
                log("open clients for filesystem:", uuid)
                fs = self.db.lookup(uuid)
                obd_uuid = fs.get_first_ref('obd')
                client_uuid = generate_client_uuid(self.name)
                client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name)
                client.prepare()

    def msd_remaining(self):
        out = lctl.device_list()
        for s in out:
            if string.split(s)[2] in ('mds',):
                return 1
        return 0

    def safe_to_clean(self):
        return self.active

    def safe_to_clean_modules(self):
        return not self.msd_remaining()

    def cleanup(self):
        if not self.active:
            debug(self.uuid, "not active")
            return
        if is_prepared(self.name):
            self.info()
            try:
                lctl.cleanup(self.name, self.uuid, config.force,
                             config.failover)
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
                e.dump()
                cleanup_error(e.rc)
            Module.cleanup(self)
        if config.mds_ost_conn:
            for uuid in self.filesystem_uuids:
                log("clean clients for filesystem:", uuid)
                fs = self.db.lookup(uuid)
                obd_uuid = fs.get_first_ref('obd')
                client_uuid = generate_client_uuid(self.name)
                client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name)
                client.cleanup()
        if not self.msd_remaining() and is_prepared('MDT'):
            try:
                lctl.cleanup("MDT", "MDT_UUID", config.force,
                             config.failover)
            except CommandError, e:
                print "cleanup failed: ", self.name
                e.dump()
                cleanup_error(e.rc)
        clean_loop(self.devpath)
class OSD(Module):
    def __init__(self, db):
        Module.__init__(self, 'OSD', db)
        self.osdtype = self.db.get_val('osdtype')
        self.devpath = self.db.get_val('devpath', '')
        self.size = self.db.get_val_int('devsize', 0)
        self.journal_size = self.db.get_val_int('journalsize', 0)
        self.fstype = self.db.get_val('fstype', '')
        self.nspath = self.db.get_val('nspath', '')
        target_uuid = self.db.get_first_ref('target')
        ost = self.db.lookup(target_uuid)
        self.name = ost.getName()
        self.format = self.db.get_val('autoformat', 'yes')
        if ost.get_val('failover', 0):
            self.failover_ost = 'f'
        else:
            self.failover_ost = 'n'

        active_uuid = get_active_target(ost)
        if not active_uuid:
            panic("No target device found:", target_uuid)
        if active_uuid == self.uuid:
            self.active = 1
        else:
            self.active = 0
        if self.active and config.group and config.group != ost.get_val('group'):
            self.active = 0

        self.target_dev_uuid = self.uuid
        self.uuid = target_uuid
        # modules
        self.add_lustre_module('ost', 'ost')
        # FIXME: should we default to ext3 here?
        if self.fstype:
            self.add_lustre_module('obdclass', 'fsfilt_%s' % (self.fstype))
        self.add_lustre_module(self.osdtype, self.osdtype)

    def load_module(self):
        if self.active:
            Module.load_module(self)

    # need to check /proc/mounts and /etc/mtab before
    # formatting anything.
    # FIXME: check if device is already formatted.
    def prepare(self):
        if is_prepared(self.name):
            return
        if not self.active:
            debug(self.uuid, "not active")
            return
        self.info(self.osdtype, self.devpath, self.size, self.fstype,
                  self.format, self.journal_size)
        run_acceptors()
        if self.osdtype == 'obdecho':
            blkdev = ''
        else:
            blkdev = block_dev(self.devpath, self.size, self.fstype,
                               self.format, self.journal_size)
        if self.nspath:
            run("mkdir", self.nspath)
        lctl.newdev(attach="%s %s %s" % (self.osdtype, self.name, self.uuid),
                    setup="%s %s %s %s" % (blkdev, self.fstype,
                                           self.failover_ost, self.nspath))
        if not is_prepared('OSS'):
            lctl.newdev(attach="ost %s %s" % ('OSS', 'OSS_UUID'),
                        setup="")

    def osd_remaining(self):
        out = lctl.device_list()
        for s in out:
            if string.split(s)[2] in ('obdfilter', 'obdecho'):
                return 1
        return 0

    def safe_to_clean(self):
        return self.active

    def safe_to_clean_modules(self):
        return not self.osd_remaining()

    def cleanup(self):
        if not self.active:
            debug(self.uuid, "not active")
            return
        if is_prepared(self.name):
            self.info()
            try:
                lctl.cleanup(self.name, self.uuid, config.force,
                             config.failover)
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
                e.dump()
                cleanup_error(e.rc)
        if not self.osd_remaining() and is_prepared('OSS'):
            try:
                lctl.cleanup("OSS", "OSS_UUID", config.force,
                             config.failover)
            except CommandError, e:
                print "cleanup failed: ", self.name
                e.dump()
                cleanup_error(e.rc)
        if not self.osdtype == 'obdecho':
            clean_loop(self.devpath)
def mgmt_uuid_for_fs(mtpt_name):
    if not mtpt_name:
        return ''
    mtpt_db = toplevel.lookup_name(mtpt_name)
    fs_uuid = mtpt_db.get_first_ref('filesystem')
    fs = toplevel.lookup(fs_uuid)
    if not fs:
        return ''
    return fs.get_first_ref('mgmt')
# Generic client module, used by OSC and MDC
class Client(Module):
    def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
                 module_dir=None):
        self.target_name = tgtdb.getName()
        self.target_uuid = tgtdb.getUUID()
        self.db = tgtdb

        self.tgt_dev_uuid = get_active_target(tgtdb)
        if not self.tgt_dev_uuid:
            panic("No target device found for target:", self.target_name)

        self.kmodule_list = []
        self.module = module
        self.module_name = string.upper(module)
        if not self_name:
            self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
                                         self.target_name, fs_name)
        else:
            self.name = self_name
        self.uuid = uuid
        self.lookup_server(self.tgt_dev_uuid)
        mgmt_uuid = mgmt_uuid_for_fs(fs_name)
        if mgmt_uuid:
            self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
        else:
            self.mgmt_name = ''
        self.fs_name = fs_name
        if not module_dir:
            module_dir = module
        self.add_lustre_module(module_dir, module)

    def lookup_server(self, srv_uuid):
        """ Lookup a server's network information """
        self._server_nets = get_ost_net(self.db, srv_uuid)
        if len(self._server_nets) == 0:
            panic("Unable to find a server for:", srv_uuid)

    def get_servers(self):
        return self._server_nets

    def prepare(self, ignore_connect_failure = 0):
        self.info(self.target_uuid)
        if is_prepared(self.name):
            return
        try:
            srv = choose_local_server(self.get_servers())
            if srv:
                lctl.connect(srv)
            else:
                srv, r = find_route(self.get_servers())
                if srv:
                    lctl.add_route_host(r[0], srv.uuid, r[1], r[3])
                else:
                    panic("no route to", self.target_uuid)
        except CommandError, e:
            if not ignore_connect_failure:
                raise e
        if srv:
            lctl.newdev(attach="%s %s %s" % (self.module, self.name, self.uuid),
                        setup="%s %s %s" % (self.target_uuid, srv.uuid,
                                            self.mgmt_name))

    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
            try:
                srv = choose_local_server(self.get_servers())
                if srv:
                    lctl.disconnect(srv)
                else:
                    srv, r = find_route(self.get_servers())
                    if srv:
                        lctl.del_route_host(r[0], srv.uuid, r[1], r[3])
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
                e.dump()
                cleanup_error(e.rc)
class MDC(Client):
    def __init__(self, db, uuid, fs_name):
        Client.__init__(self, db, uuid, 'mdc', fs_name)

class OSC(Client):
    def __init__(self, db, uuid, fs_name):
        Client.__init__(self, db, uuid, 'osc', fs_name)

def mgmtcli_name_for_uuid(uuid):
    return 'MGMTCLI_%s' % uuid

class ManagementClient(Client):
    def __init__(self, db, uuid):
        Client.__init__(self, db, uuid, 'mgmt_cli', '',
                        self_name = mgmtcli_name_for_uuid(db.getUUID()),
                        module_dir = 'mgmt')
class COBD(Module):
    def __init__(self, db):
        Module.__init__(self, 'COBD', db)
        self.real_uuid = self.db.get_first_ref('realobd')
        self.cache_uuid = self.db.get_first_ref('cacheobd')
        self.add_lustre_module('cobd', 'cobd')

    # need to check /proc/mounts and /etc/mtab before
    # formatting anything.
    # FIXME: check if device is already formatted.
    def prepare(self):
        if is_prepared(self.name):
            return
        self.info(self.real_uuid, self.cache_uuid)
        lctl.newdev(attach="cobd %s %s" % (self.name, self.uuid),
                    setup="%s %s" % (self.real_uuid, self.cache_uuid))
# virtual interface for OSC and LOV
class VOSC(Module):
    def __init__(self, db, uuid, fs_name):
        Module.__init__(self, 'VOSC', db)
        if db.get_class() == 'lov':
            self.osc = LOV(db, uuid, fs_name)
        else:
            self.osc = get_osc(db, uuid, fs_name)
    def get_uuid(self):
        return self.osc.uuid
    def get_name(self):
        return self.osc.name
    def prepare(self):
        self.osc.prepare()
    def cleanup(self):
        self.osc.cleanup()
    def load_module(self):
        self.osc.load_module()
    def cleanup_module(self):
        self.osc.cleanup_module()
    def need_mdc(self):
        return self.db.get_class() != 'lov'
    def get_mdc_name(self):
        if self.db.get_class() == 'lov':
            return self.osc.mdc_name
        return ''
class ECHO_CLIENT(Module):
    def __init__(self, db):
        Module.__init__(self, 'ECHO_CLIENT', db)
        self.add_lustre_module('obdecho', 'obdecho')
        self.obd_uuid = self.db.get_first_ref('obd')
        obd = self.db.lookup(self.obd_uuid)
        self.uuid = generate_client_uuid(self.name)
        self.osc = VOSC(obd, self.uuid, self.name)

    def prepare(self):
        if is_prepared(self.name):
            return
        self.osc.prepare() # XXX This is so cheating. -p
        self.info(self.obd_uuid)
        lctl.newdev(attach="echo_client %s %s" % (self.name, self.uuid),
                    setup = self.osc.get_name())

    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
        self.osc.cleanup()

    def load_module(self):
        self.osc.load_module()
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        self.osc.cleanup_module()
def generate_client_uuid(name):
    client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
                                           name,
                                           int(random.random() * 1048576),
                                           int(random.random() * 1048576))
    return client_uuid[:36]
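# For example (a sketch; the hex fields are random):
#   generate_client_uuid('MNT_localhost') -> '2f1a3_MNT_localhost_81c4d0e9b2'
# Names longer than 19 characters are truncated by the %.19s, and the
# whole string is clipped to the 36-character UUID limit.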
class Mountpoint(Module):
    def __init__(self, db):
        Module.__init__(self, 'MTPT', db)
        self.path = self.db.get_val('path')
        self.fs_uuid = self.db.get_first_ref('filesystem')
        fs = self.db.lookup(self.fs_uuid)
        self.mds_uuid = fs.get_first_ref('mds')
        self.obd_uuid = fs.get_first_ref('obd')
        self.mgmt_uuid = fs.get_first_ref('mgmt')
        obd = self.db.lookup(self.obd_uuid)
        client_uuid = generate_client_uuid(self.name)
        self.vosc = VOSC(obd, client_uuid, self.name)
        if self.vosc.need_mdc():
            self.add_lustre_module('mdc', 'mdc')
            self.mdc = get_mdc(db, client_uuid, self.name, self.mds_uuid)
        self.add_lustre_module('llite', 'llite')
        if self.mgmt_uuid:
            self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
                                            client_uuid)
        else:
            self.mgmtcli = None

    def prepare(self):
        if fs_is_mounted(self.path):
            log(self.path, "already mounted.")
            return
        if self.mgmtcli:
            self.mgmtcli.prepare()
        self.vosc.prepare()
        if self.vosc.need_mdc():
            self.mdc.prepare()
            mdc_name = self.mdc.name
        else:
            mdc_name = self.vosc.get_mdc_name()
        if not mdc_name:
            self.vosc.cleanup()
            panic("Unable to determine MDC name. Probably need to cleanup before re-mounting.")
        self.info(self.path, self.mds_uuid, self.obd_uuid)
        if config.lctl_dump:
            cmd = "osc=%s,mdc=%s" % (self.vosc.get_name(), mdc_name)
            lctl.mount_option(cmd)
            return
        cmd = "mount -t lustre_lite -o osc=%s,mdc=%s %s %s" % \
              (self.vosc.get_name(), mdc_name, config.config, self.path)
        run("mkdir", self.path)
        ret, val = run(cmd)
        if ret:
            self.vosc.cleanup()
            if self.vosc.need_mdc():
                self.mdc.cleanup()
            panic("mount failed:", self.path, ":", string.join(val))

    def cleanup(self):
        self.info(self.path, self.mds_uuid, self.obd_uuid)
        if fs_is_mounted(self.path):
            if config.force:
                (rc, out) = run("umount", "-f", self.path)
            else:
                (rc, out) = run("umount", self.path)
            if rc:
                raise CommandError('umount', out, rc)

        if fs_is_mounted(self.path):
            panic("fs is still mounted:", self.path)

        self.vosc.cleanup()
        if self.vosc.need_mdc():
            self.mdc.cleanup()
        if self.mgmtcli:
            self.mgmtcli.cleanup()

    def load_module(self):
        if self.mgmtcli:
            self.mgmtcli.load_module()
        self.vosc.load_module()
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        self.vosc.cleanup_module()
        if self.mgmtcli:
            self.mgmtcli.cleanup_module()
# ============================================================
# misc query functions

def get_ost_net(self, osd_uuid):
    srv_list = []
    if not osd_uuid:
        return srv_list
    osd = self.lookup(osd_uuid)
    node_uuid = osd.get_first_ref('node')
    node = self.lookup(node_uuid)
    if not node:
        panic("unable to find node for osd_uuid:", osd_uuid,
              " node_ref:", node_uuid)
    for net_uuid in node.get_networks():
        db = node.lookup(net_uuid)
        srv_list.append(Network(db))
    return srv_list
# the order of initialization is based on level.
def getServiceLevel(self):
    type = self.get_class()
    ret = 0
    if type in ('network',):
        ret = 5
    elif type in ('routetbl',):
        ret = 6
    elif type in ('ldlm',):
        ret = 20
    elif type in ('mgmt',):
        ret = 25
    elif type in ('osd', 'cobd'):
        ret = 30
    elif type in ('mdsdev',):
        ret = 40
    elif type in ('mountpoint', 'echoclient'):
        ret = 70
    else:
        panic("Unknown type: ", type)

    if ret < config.minlevel or ret > config.maxlevel:
        ret = 0
    return ret

# return list of services in a profile. list is a list of tuples
# [(level, db_object),]
def getServices(self):
    list = []
    for ref_class, ref_uuid in self.get_all_refs():
        servdb = self.lookup(ref_uuid)
        if servdb:
            level = getServiceLevel(servdb)
            if level > 0:
                list.append((level, servdb))
        else:
            panic('service not found: ' + ref_uuid)

    list.sort()
    return list
############################################################
# FIXME: clean this mess up!
#
# OSC is no longer in the xml, so we have to fake it.
# this is getting ugly and begging for another refactoring
def get_osc(ost_db, uuid, fs_name):
    osc = OSC(ost_db, uuid, fs_name)
    return osc

def get_mdc(db, uuid, fs_name, mds_uuid):
    mds_db = db.lookup(mds_uuid)
    if not mds_db:
        panic("no mds:", mds_uuid)
    mdc = MDC(mds_db, uuid, fs_name)
    return mdc
############################################################
# routing ("rooting")

# list of (nettype, cluster_id, nid)
local_clusters = []

def find_local_clusters(node_db):
    global local_clusters
    for netuuid in node_db.get_networks():
        net = node_db.lookup(netuuid)
        srv = Network(net)
        debug("add_local", netuuid)
        local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
        if srv.port > 0:
            if acceptors.has_key(srv.port):
                panic("duplicate port:", srv.port)
            acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
                                                  srv.send_mem, srv.recv_mem,
                                                  srv.irq_affinity,
                                                  srv.nid_exchange)

# This node is a gateway.
is_router = 0
def node_is_router():
    return is_router

# If there are any routers found in the config, then this will be true
# and all nodes will load kptlrouter.
needs_router = 0
def node_needs_router():
    return needs_router or is_router

# list of (nettype, gw, tgt_cluster_id, lo, hi)
# Currently, these local routes are only added to kptlrouter route
# table if they are needed to connect to a specific server.  This
# should be changed so all available routes are loaded, and the
# ptlrouter can make all the decisions.
local_routes = []

def find_local_routes(lustre):
    """ Scan the lustre config looking for routers.  Build list of
    routes. """
    global local_routes, needs_router
    local_routes = []
    list = lustre.lookup_class('node')
    for router in list:
        if router.get_val_int('router', 0):
            needs_router = 1
            for (local_type, local_cluster_id, local_nid) in local_clusters:
                gw = None
                for netuuid in router.get_networks():
                    db = router.lookup(netuuid)
                    if (local_type == db.get_val('nettype') and
                        local_cluster_id == db.get_val('clusterid')):
                        gw = db.get_val('nid')
                        break
                if gw:
                    debug("find_local_routes: gw is", gw)
                    for route in router.get_local_routes(local_type, gw):
                        local_routes.append(route)
    debug("find_local_routes:", local_routes)
def choose_local_server(srv_list):
    for srv in srv_list:
        if local_cluster(srv.net_type, srv.cluster_id):
            return srv
    return None

def local_cluster(net_type, cluster_id):
    for cluster in local_clusters:
        if net_type == cluster[0] and cluster_id == cluster[1]:
            return 1
    return 0

def local_interface(net_type, cluster_id, nid):
    for cluster in local_clusters:
        if (net_type == cluster[0] and cluster_id == cluster[1]
            and nid == cluster[2]):
            return 1
    return 0

def find_route(srv_list):
    frm_type = local_clusters[0][0]
    for srv in srv_list:
        debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
        to_type = srv.net_type
        to = srv.nid
        cluster_id = srv.cluster_id
        debug('looking for route to', to_type, to)
        for r in local_routes:
            debug("find_route: ", r)
            if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
                return srv, r
    return None, None
def get_active_target(db):
    target_uuid = db.getUUID()
    target_name = db.getName()
    node_name = get_select(target_name)
    if node_name:
        tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
    else:
        tgt_dev_uuid = db.get_first_ref('active')
    return tgt_dev_uuid
############################################################
# lconf level logic
# Start a service.
def newService(db):
    type = db.get_class()
    debug('Service:', type, db.getName(), db.getUUID())
    n = None
    if type == 'ldlm':
        n = LDLM(db)
    elif type == 'lov':
        n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID", '')
    elif type == 'network':
        n = Network(db)
    elif type == 'routetbl':
        n = RouteTable(db)
    elif type == 'osd':
        n = OSD(db)
    elif type == 'cobd':
        n = COBD(db)
    elif type == 'mdsdev':
        n = MDSDEV(db)
    elif type == 'mountpoint':
        n = Mountpoint(db)
    elif type == 'echoclient':
        n = ECHO_CLIENT(db)
    elif type == 'mgmt':
        n = Management(db)
    else:
        panic("unknown service type:", type)
    return n
#
# Prepare the system to run lustre using a particular profile
# in the configuration.
#  * load the modules
#  * setup networking for the current node
#  * make sure partitions are in place and prepared
#  * initialize devices with lctl
# Levels are important, and need to be enforced.
def for_each_profile(db, prof_list, operation):
    for prof_uuid in prof_list:
        prof_db = db.lookup(prof_uuid)
        if not prof_db:
            panic("profile:", prof_uuid, "not found.")
        services = getServices(prof_db)
        operation(services)

def doSetup(services):
    if config.nosetup:
        return
    for s in services:
        n = newService(s[1])
        n.prepare()

def doModules(services):
    if config.nomod:
        return
    for s in services:
        n = newService(s[1])
        n.load_module()

def doCleanup(services):
    if config.nosetup:
        return
    services.reverse()
    for s in services:
        n = newService(s[1])
        if n.safe_to_clean():
            n.cleanup()

def doUnloadModules(services):
    if config.nomod:
        return
    services.reverse()
    for s in services:
        n = newService(s[1])
        if n.safe_to_clean_modules():
            n.cleanup_module()
#
# Load profiles for the host and run the requested operation.
def doHost(lustreDB, hosts):
    global is_router
    node_db = None
    for h in hosts:
        node_db = lustreDB.lookup_name(h, 'node')
        if node_db:
            break
    if not node_db:
        print 'No host entry found.'
        return

    is_router = node_db.get_val_int('router', 0)
    lustre_upcall = node_db.get_val('lustreUpcall', '')
    portals_upcall = node_db.get_val('portalsUpcall', '')
    timeout = node_db.get_val_int('timeout', 0)

    find_local_clusters(node_db)
    if not is_router:
        find_local_routes(lustreDB)

    # Two step process: (1) load modules, (2) setup lustre
    # if not cleaning, load modules first.
    prof_list = node_db.get_refs('profile')

    if config.recover:
        if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
            raise Lustre.LconfError("--recovery requires --tgt_uuid <UUID> " +
                                    "--client_uuid <UUID> --conn_uuid <UUID>")
        doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
                   config.conn_uuid)
    elif config.cleanup:
        if config.force:
            # the command line can override this value
            timeout = 5
        # ugly hack, only need to run lctl commands for --dump
        if config.lctl_dump:
            for_each_profile(node_db, prof_list, doCleanup)
            return

        sys_set_timeout(timeout)
        sys_set_ptldebug()
        sys_set_subsystem()
        sys_set_lustre_upcall(lustre_upcall)
        sys_set_portals_upcall(portals_upcall)

        for_each_profile(node_db, prof_list, doCleanup)
        for_each_profile(node_db, prof_list, doUnloadModules)

    else:
        # ugly hack, only need to run lctl commands for --dump
        if config.lctl_dump:
            for_each_profile(node_db, prof_list, doSetup)
            return

        sys_make_devices()
        sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
        sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)

        for_each_profile(node_db, prof_list, doModules)

        sys_set_debug_path()
        sys_set_ptldebug()
        sys_set_subsystem()
        script = config.gdb_script
        run(lctl.lctl, ' modules >', script)
        if config.gdb:
            log("The GDB module script is in", script)
            # pause, so user has time to break and
            # load the script
            time.sleep(5)
        sys_set_timeout(timeout)
        sys_set_lustre_upcall(lustre_upcall)
        sys_set_portals_upcall(portals_upcall)

        for_each_profile(node_db, prof_list, doSetup)
def doRecovery(db, lctl, tgt_uuid, client_uuid, conn_uuid):
    tgt = db.lookup(tgt_uuid)
    if not tgt:
        raise Lustre.LconfError("doRecovery: " + tgt_uuid + " not found.")
    new_uuid = get_active_target(tgt)
    if not new_uuid:
        raise Lustre.LconfError("doRecovery: no active target found for: " +
                                tgt_uuid)
    net = choose_local_server(get_ost_net(db, new_uuid))
    if not net:
        raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
    # XXX, better to do a full disconnect here
    log("Reconnecting", tgt_uuid, " to ", net.uuid)
    lctl.del_uuid(conn_uuid)
    lctl.connect(net)
    lctl.recover(client_uuid, net.uuid)
def setupModulePath(cmd, portals_dir = PORTALS_DIR):
    base = os.path.dirname(cmd)
    if development_mode():
        if not config.lustre:
            config.lustre = (os.path.join(base, ".."))
        # normalize the portals dir, using command line arg if set
        if config.portals:
            portals_dir = config.portals
        dir = os.path.join(config.lustre, portals_dir)
        config.portals = dir
        debug('config.portals', config.portals)
    elif config.lustre and config.portals:
        # production mode
        # if --lustre and --portals, normalize portals
        # can ignore PORTALS_DIR here, since it is probably useless here
        config.portals = os.path.join(config.lustre, config.portals)
        debug('config.portals B', config.portals)
def sysctl(path, val):
    debug("+ sysctl", path, val)
    if config.noexec:
        return
    try:
        fp = open(os.path.join('/proc/sys', path), 'w')
        fp.write(str(val))
        fp.close()
    except IOError, e:
        log(e)

def sys_set_debug_path():
    sysctl('portals/debug_path', config.debug_path)

def sys_set_lustre_upcall(upcall):
    # the command line overrides the value in the node config
    if config.lustre_upcall:
        upcall = config.lustre_upcall
    elif config.upcall:
        upcall = config.upcall
    if upcall:
        sysctl('lustre/upcall', upcall)

def sys_set_portals_upcall(upcall):
    # the command line overrides the value in the node config
    if config.portals_upcall:
        upcall = config.portals_upcall
    elif config.upcall:
        upcall = config.upcall
    if upcall:
        sysctl('portals/upcall', upcall)
def sys_set_timeout(timeout):
    # the command line overrides the value in the node config
    if config.timeout > 0:
        timeout = config.timeout
    if timeout != None and timeout > 0:
        sysctl('lustre/timeout', timeout)

def sys_set_ptldebug():
    if config.ptldebug != None:
        try:
            val = eval(config.ptldebug, ptldebug_names)
            val = "0x%x" % (val,)
            sysctl('portals/debug', val)
        except NameError, e:
            panic(str(e))

def sys_set_subsystem():
    if config.subsystem != None:
        try:
            val = eval(config.subsystem, subsystem_names)
            val = "0x%x" % (val,)
            sysctl('portals/subsystem_debug', val)
        except NameError, e:
            panic(str(e))
def sys_set_netmem_max(path, max):
    debug("setting", path, "to at least", max)
    if config.noexec:
        return
    fp = open(path)
    cur = int(string.strip(fp.readline()))
    fp.close()
    if max > cur:
        fp = open(path, 'w')
        fp.write('%d\n' % (max))
        fp.close()

def sys_make_devices():
    if not os.access('/dev/portals', os.R_OK):
        run('mknod /dev/portals c 10 240')
    if not os.access('/dev/obd', os.R_OK):
        run('mknod /dev/obd c 10 241')
# Add dir to the global PATH, if not already there.
def add_to_path(new_dir):
    syspath = string.split(os.environ['PATH'], ':')
    if new_dir in syspath:
        return
    os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir

def default_debug_path():
    path = '/tmp/lustre-log'
    if os.path.isdir('/r'):
        return '/r' + path
    else:
        return path

def default_gdb_script():
    script = '/tmp/ogdb'
    if os.path.isdir('/r'):
        return '/r' + script
    else:
        return script

DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
# ensure basic elements are in the system path
def sanitise_path():
    for dir in DEFAULT_PATH:
        add_to_path(dir)
# global hack for the --select handling
tgt_select = {}
def init_select(arg):
    # arg = "service=nodeA,service2=nodeB"
    global tgt_select
    if arg:
        list = string.split(arg, ',')
        for entry in list:
            srv, node = string.split(entry, '=')
            tgt_select[srv] = node

def get_select(srv):
    if tgt_select.has_key(srv):
        return tgt_select[srv]
    return None
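# For example (a sketch):
#   init_select('mds1=nodeA,ost1=nodeB')
#   get_select('mds1')  -> 'nodeA'
#   get_select('ost2')  -> None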
############################################################
# Command line processing
#
PARAM = Lustre.Options.PARAM
INTPARAM = Lustre.Options.INTPARAM
lconf_options = [
    ('verbose,v', "Print system commands as they are run"),
    ('ldapurl', "LDAP server URL, eg. ldap://localhost", PARAM),
    ('config', "Cluster config name used for LDAP query", PARAM),
    ('select', "service=nodeA,service2=nodeB ", PARAM),
    ('node', "Load config for <nodename>", PARAM),
    ('cleanup,d', "Cleans up config. (Shutdown)"),
    ('force,f', "Forced unmounting and/or obd detach during cleanup",
     Lustre.Options.FLAG, 0),
    ('mds_ost_conn', "Open connections to OSTs on the MDS"),
    ('failover', """Used to shut down without saving state.
                  This will allow this node to "give up" a service to
                  another node for failover purposes. This will not
                  be a clean shutdown.""",
     Lustre.Options.FLAG, 0),
    ('gdb', """Prints message after creating gdb module script
               and sleeps for 5 seconds."""),
    ('noexec,n', """Prints the commands and steps that will be run for a
                  config without executing them. This can be used to check if a
                  config file is doing what it should be doing"""),
    ('nomod', "Skip load/unload module step."),
    ('nosetup', "Skip device setup/cleanup step."),
    ('reformat', "Reformat all devices (without question)"),
    ('dump', "Dump the kernel debug log to file before portals is unloaded",
     PARAM),
    ('minlevel', "Minimum level of services to configure/cleanup",
     INTPARAM, 0),
    ('maxlevel', """Maximum level of services to configure/cleanup
                  Levels are approximately like:
                          10 - network
                          20 - device, ldlm
                          30 - osd, cobd
                          40 - mdsdev
                          70 - mountpoint, echo_client, osc, mdc, lov""",
     INTPARAM, 100),
    ('lustre', """Base directory of lustre sources. This parameter will
                  cause lconf to load modules from a source tree.""", PARAM),
    ('portals', """Portals source directory.  If this is a relative path,
                  then it is assumed to be relative to lustre. """, PARAM),
    ('timeout', "Set recovery timeout", PARAM),
    ('upcall', "Set both portals and lustre upcall script", PARAM),
    ('lustre_upcall', "Set lustre upcall script", PARAM),
    ('portals_upcall', "Set portals upcall script", PARAM),
    ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
    ('ptldebug', "Set the portals debug level", PARAM),
    ('subsystem', "Set the portals debug subsystem", PARAM),
    ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
    ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
    # Client recovery options
    ('recover', "Recover a device"),
    ('group', "The group of devices to configure or cleanup", PARAM),
    ('tgt_uuid', "The failed target (required for recovery)", PARAM),
    ('client_uuid', "The failed client (required for recovery)", PARAM),
    ('conn_uuid', "The failed connection (required for recovery)", PARAM),
    ]
def main():
    global lctl, config, toplevel

    # in the upcall this is set to SIG_IGN
    signal.signal(signal.SIGCHLD, signal.SIG_DFL)

    cl = Lustre.Options("lconf", "config.xml", lconf_options)
    try:
        config, args = cl.parse(sys.argv[1:])
    except Lustre.OptionError, e:
        print e
        sys.exit(1)

    setupModulePath(sys.argv[0])

    host = socket.gethostname()

    # the PRNG is normally seeded with time(), which is not so good for starting
    # time-synchronized clusters
    input = open('/dev/urandom', 'r')
    if not input:
        print 'Unable to open /dev/urandom!'
        sys.exit(1)
    seed = input.read(32)
    input.close()
    random.seed(seed)

    sanitise_path()

    init_select(config.select)

    if len(args) > 0:
        if not os.access(args[0], os.R_OK):
            print 'File not found or readable:', args[0]
            sys.exit(1)
        try:
            dom = xml.dom.minidom.parse(args[0])
        except Exception:
            panic("%s does not appear to be a config file." % (args[0]))
            sys.exit(1) # make sure to die here, even in debug mode.
        db = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
        if not config.config:
            config.config = os.path.basename(args[0]) # use full path?
            if config.config[-4:] == '.xml':
                config.config = config.config[:-4]
    elif config.ldapurl:
        if not config.config:
            panic("--ldapurl requires --config name")
        dn = "config=%s,fs=lustre" % (config.config)
        db = Lustre.LustreDB_LDAP('', {}, base=dn, url=config.ldapurl)
    else:
        print 'Missing config file or ldap URL.'
        print 'see lconf --help for command summary'
        sys.exit(1)

    toplevel = db

    ver = db.get_version()
    if not ver:
        panic("No version found in config data, please recreate.")
    if ver != Lustre.CONFIG_VERSION:
        panic("Config version", ver, "does not match lconf version",
              Lustre.CONFIG_VERSION)

    node_list = []
    if config.node:
        node_list.append(config.node)
    else:
        if len(host) > 0:
            node_list.append(host)
        node_list.append('localhost')
    debug("configuring for host: ", node_list)

    if len(host) > 0:
        config.debug_path = config.debug_path + '-' + host
        config.gdb_script = config.gdb_script + '-' + host

    lctl = LCTLInterface('lctl')

    if config.lctl_dump:
        lctl.use_save_file(config.lctl_dump)

    doHost(db, node_list)

if __name__ == "__main__":
    try:
        main()
    except Lustre.LconfError, e:
        print e
#        traceback.print_exc(file=sys.stdout)
        sys.exit(1)
    except CommandError, e:
        e.dump()
        sys.exit(e.rc)

    if first_cleanup_error:
        sys.exit(first_cleanup_error)