3 # Copyright (C) 2002-2003 Cluster File Systems, Inc.
4 # Authors: Robert Read <rread@clusterfs.com>
5 # Mike Shaver <shaver@clusterfs.com>
6 # This file is part of Lustre, http://www.lustre.org.
8 # Lustre is free software; you can redistribute it and/or
9 # modify it under the terms of version 2 of the GNU General Public
10 # License as published by the Free Software Foundation.
12 # Lustre is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Lustre; if not, write to the Free Software
19 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 # lconf - lustre configuration tool
23 # lconf is the main driver script for starting and stopping
24 # lustre filesystem services.
26 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import sys, getopt, types
29 import string, os, stat, popen2, socket, time, random, fcntl, select
30 import re, exceptions, signal, traceback
31 import xml.dom.minidom
# The fcntl flag constants moved between modules across Python releases:
# Python 1.x exposes them in the FCNTL module, later versions in fcntl
# itself.  NOTE(review): the 'else:' line was lost in this listing and is
# restored here.
if sys.version[0] == '1':
    from FCNTL import F_GETFL, F_SETFL
else:
    from fcntl import F_GETFL, F_SETFL
38 PYMOD_DIR = "/usr/lib/lustre/python"
40 def development_mode():
41 base = os.path.dirname(sys.argv[0])
42 if os.access(base+"/Makefile.am", os.R_OK):
46 if not development_mode():
47 sys.path.append(PYMOD_DIR)
# Default TCP socket buffer size in bytes (1 MB); used as the fallback
# for a network's 'sendmem'/'recvmem' settings.
DEFAULT_TCPBUF = 1048576

# Maximum number of devices to search for.
# (the /dev/loop* nodes need to be created beforehand)
MAX_LOOP_DEVICES = 256

# Directory name of the portals tree.
# NOTE(review): presumably the default for config.portals -- confirm.
PORTALS_DIR = 'portals'
61 # Please keep these in sync with the values in portals/kp30.h
73 "warning" : (1 << 10),
77 "portals" : (1 << 14),
79 "dlmtrace" : (1 << 16),
83 "rpctrace" : (1 << 20),
84 "vfstrace" : (1 << 21),
88 "undefined" : (0 << 24),
97 "ext2obd" : (9 << 24),
98 "portals" : (10 << 24),
99 "socknal" : (11 << 24),
100 "qswnal" : (12 << 24),
101 "pinger" : (13 << 24),
102 "filter" : (14 << 24),
103 "trace" : (15 << 24),
107 "gmnal" : (19 << 24),
108 "ptlrouter" : (20 << 24),
110 "ptlbd" : (22 << 24),
# Exit status of the first cleanup command that failed (0 = none yet).
first_cleanup_error = 0

def cleanup_error(rc):
    """Record *rc* as the cleanup exit status unless one is already set.

    Only the first non-zero error is kept; later failures do not
    overwrite it.
    """
    global first_cleanup_error
    if first_cleanup_error == 0:
        first_cleanup_error = rc
122 # ============================================================
123 # debugging and error funcs
125 def fixme(msg = "this feature"):
126 raise Lustre.LconfError, msg + ' not implmemented yet.'
129 msg = string.join(map(str,args))
130 if not config.noexec:
131 raise Lustre.LconfError(msg)
136 msg = string.join(map(str,args))
141 print string.strip(s)
145 msg = string.join(map(str,args))
149 # ack, python's builtin int() does not support '0x123' syntax.
150 # eval can do it, although what a hack!
154 return eval(s, {}, {})
157 except SyntaxError, e:
158 raise ValueError("not a number")
160 raise ValueError("not a number")
162 # ============================================================
163 # locally defined exceptions
164 class CommandError (exceptions.Exception):
165 def __init__(self, cmd_name, cmd_err, rc=None):
166 self.cmd_name = cmd_name
167 self.cmd_err = cmd_err
172 if type(self.cmd_err) == types.StringType:
174 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
176 print "! %s: %s" % (self.cmd_name, self.cmd_err)
177 elif type(self.cmd_err) == types.ListType:
179 print "! %s (error %d):" % (self.cmd_name, self.rc)
181 print "! %s:" % (self.cmd_name)
182 for s in self.cmd_err:
183 print "> %s" %(string.strip(s))
188 # ============================================================
189 # handle daemons, like the acceptor
191 """ Manage starting and stopping a daemon. Assumes daemon manages
192 it's own pid file. """
194 def __init__(self, cmd):
200 log(self.command, "already running.")
202 self.path = find_prog(self.command)
204 panic(self.command, "not found.")
205 ret, out = runcmd(self.path +' '+ self.command_line())
207 raise CommandError(self.path, out, ret)
211 pid = self.read_pidfile()
213 log ("killing process", pid)
215 #time.sleep(1) # let daemon die
217 log("unable to kill", self.command, e)
219 log("unable to kill", self.command)
222 pid = self.read_pidfile()
232 def read_pidfile(self):
234 fp = open(self.pidfile(), 'r')
241 def clean_pidfile(self):
242 """ Remove a stale pidfile """
243 log("removing stale pidfile:", self.pidfile())
245 os.unlink(self.pidfile())
247 log(self.pidfile(), e)
249 class AcceptorHandler(DaemonHandler):
250 def __init__(self, port, net_type, send_mem, recv_mem, irq_aff, nid_xchg):
251 DaemonHandler.__init__(self, "acceptor")
254 self.send_mem = send_mem
255 self.recv_mem = recv_mem
257 if net_type == 'toe':
258 self.flags = self.flags + ' -N 4'
260 self.flags = self.flags + ' -i'
262 self.flags = self.flags + ' -x'
265 return "/var/run/%s-%d.pid" % (self.command, self.port)
def command_line(self):
    """Assemble the acceptor daemon's argument string from the configured
    send/receive buffer sizes, extra flags, and port number."""
    args = ('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port)
    return string.join(map(str, args))
272 # start the acceptors
274 for port in acceptors.keys():
275 daemon = acceptors[port]
276 if not daemon.running():
279 def run_one_acceptor(port):
280 if acceptors.has_key(port):
281 daemon = acceptors[port]
282 if not daemon.running():
285 panic("run_one_acceptor: No acceptor defined for port:", port)
287 def stop_acceptor(port):
288 if acceptors.has_key(port):
289 daemon = acceptors[port]
294 # ============================================================
295 # handle lctl interface
298 Manage communication with lctl
301 def __init__(self, cmd):
303 Initialize close by finding the lctl binary.
305 self.lctl = find_prog(cmd)
309 debug('! lctl not found')
312 raise CommandError('lctl', "unable to find lctl binary.")
def use_save_file(self, file):
    """Set a dump file path; when present, run() prefixes each lctl
    command batch with a 'dump <file>' directive."""
    self.save_file = file
def set_nonblock(self, fd):
    """Switch the file descriptor *fd* into non-blocking (O_NDELAY) mode,
    preserving its other status flags."""
    flags = fcntl.fcntl(fd, F_GETFL)
    fcntl.fcntl(fd, F_SETFL, flags | os.O_NDELAY)
324 the cmds are written to stdin of lctl
325 lctl doesn't return errors when run in script mode, so
327 should modify command line to accept multiple commands, or
328 create complex command line options
332 cmds = '\n dump ' + self.save_file + cmds
334 debug("+", cmd_line, cmds)
335 if config.noexec: return (0, [])
337 child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
338 child.tochild.write(cmds + "\n")
339 child.tochild.close()
341 # From "Python Cookbook" from O'Reilly
342 outfile = child.fromchild
343 outfd = outfile.fileno()
344 self.set_nonblock(outfd)
345 errfile = child.childerr
346 errfd = errfile.fileno()
347 self.set_nonblock(errfd)
349 outdata = errdata = ''
352 ready = select.select([outfd,errfd],[],[]) # Wait for input
353 if outfd in ready[0]:
354 outchunk = outfile.read()
355 if outchunk == '': outeof = 1
356 outdata = outdata + outchunk
357 if errfd in ready[0]:
358 errchunk = errfile.read()
359 if errchunk == '': erreof = 1
360 errdata = errdata + errchunk
361 if outeof and erreof: break
362 # end of "borrowed" code
365 if os.WIFEXITED(ret):
366 rc = os.WEXITSTATUS(ret)
369 if rc or len(errdata):
370 raise CommandError(self.lctl, errdata, rc)
373 def runcmd(self, *args):
375 run lctl using the command line
377 cmd = string.join(map(str,args))
378 debug("+", self.lctl, cmd)
379 rc, out = run(self.lctl, cmd)
381 raise CommandError(self.lctl, out, rc)
385 def network(self, net, nid):
386 """ initialized network and add "self" """
390 quit """ % (net, nid)
393 # create a new connection
394 def connect(self, srv):
395 cmds = "\n add_uuid %s %s %s" % (srv.uuid, srv.nid, srv.net_type)
396 if srv.net_type in ('tcp', 'toe') and not config.lctl_dump:
406 add_autoconn %s %s %d %s""" % (cmds, srv.net_type,
409 srv.nid, srv.hostaddr, srv.port, flags )
411 cmds = cmds + "\n quit"
415 def recover(self, dev_name, new_conn):
419 recover %s""" %(dev_name, new_conn)
422 # add a route to a range
423 def add_route(self, net, gw, lo, hi):
431 except CommandError, e:
435 def del_route(self, net, gw, lo, hi):
443 # add a route to a host
444 def add_route_host(self, net, uuid, gw, tgt):
454 except CommandError, e:
458 # add a route to a range
459 def del_route_host(self, net, uuid, gw, tgt):
465 quit """ % (net, uuid, tgt)
468 # disconnect one connection
469 def disconnect(self, srv):
470 cmds = " ignore_errors\n del_uuid %s" % (srv.uuid)
471 if srv.net_type in ('tcp', 'toe') and not config.lctl_dump:
474 del_autoconn %s %s s""" % (cmds,
476 srv.nid, srv.hostaddr)
477 cmds = cmds + "\n quit"
481 def del_uuid(self, servuuid):
485 quit""" % (servuuid,)
489 def disconnectAll(self, net):
497 # create a new device with lctl
498 def newdev(self, attach, setup = ""):
503 quit""" % (attach, setup)
507 def cleanup(self, name, uuid, force, failover = 0):
508 if failover: force = 1
514 quit""" % (name, ('', 'force')[force],
515 ('', 'failover')[failover])
519 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off,
524 lov_setconfig %s %d %d %d %s %s
525 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
529 def dump(self, dump_file):
532 quit""" % (dump_file)
535 # get list of devices
536 def device_list(self):
538 rc, out = self.runcmd('device_list')
539 except CommandError, e:
547 def lustre_version(self):
548 rc, out = self.runcmd('version')
552 def mount_option(self, option):
557 # ============================================================
558 # Various system-level functions
559 # (ideally moved to their own module)
561 # Run a command and return the output and status.
562 # stderr is sent to /dev/null, could use popen3 to
563 # save it if necessary
566 if config.noexec: return (0, [])
567 f = os.popen(cmd + ' 2>&1')
577 cmd = string.join(map(str,args))
580 # Run a command in the background.
581 def run_daemon(*args):
582 cmd = string.join(map(str,args))
584 if config.noexec: return 0
585 f = os.popen(cmd + ' 2>&1')
593 # Determine full path to use for an external command
594 # searches dirname(argv[0]) first, then PATH
596 syspath = string.split(os.environ['PATH'], ':')
597 cmdpath = os.path.dirname(sys.argv[0])
598 syspath.insert(0, cmdpath);
600 syspath.insert(0, os.path.join(config.portals, 'utils/'))
602 prog = os.path.join(d,cmd)
603 if os.access(prog, os.X_OK):
607 # Recursively look for file starting at base dir
608 def do_find_file(base, mod):
609 fullname = os.path.join(base, mod)
610 if os.access(fullname, os.R_OK):
612 for d in os.listdir(base):
613 dir = os.path.join(base,d)
614 if os.path.isdir(dir):
615 module = do_find_file(dir, mod)
619 def find_module(src_dir, dev_dir, modname):
620 mod = '%s.o' % (modname)
621 module = src_dir +'/'+ dev_dir +'/'+ mod
623 if os.access(module, os.R_OK):
629 # is the path a block device?
636 return stat.S_ISBLK(s[stat.ST_MODE])
638 # build fs according to type
640 def mkfs(dev, devsize, fstype,jsize):
645 panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
647 # devsize is in 1k, and fs block count is in 4k
648 block_cnt = devsize/4
650 if fstype in ('ext3', 'extN'):
651 # ext3 journal size is in megabytes
652 if jsize: jopt = "-J size=%d" %(jsize,)
653 mkfs = 'mkfs.ext2 -j -b 4096 -F '
654 elif fstype == 'reiserfs':
655 # reiserfs journal size is in blocks
656 if jsize: jopt = "--journal_size %d" %(jsize,)
657 mkfs = 'mkreiserfs -ff'
659 print 'unsupported fs type: ', fstype
661 (ret, out) = run (mkfs, jopt, dev, block_cnt)
663 panic("Unable to build fs:", dev, string.join(out))
664 # enable hash tree indexing on the fs
665 if fstype in ('ext3', 'extN'):
666 htree = 'echo "feature FEATURE_C5" | debugfs -w'
667 (ret, out) = run (htree, dev)
669 panic("Unable to enable htree:", dev)
671 # some systems use /dev/loopN, some /dev/loop/N
675 if not os.access(loop + str(0), os.R_OK):
677 if not os.access(loop + str(0), os.R_OK):
678 panic ("can't access loop devices")
681 # find the loop device assigned to the file
684 for n in xrange(0, MAX_LOOP_DEVICES):
686 if os.access(dev, os.R_OK):
687 (stat, out) = run('losetup', dev)
688 if out and stat == 0:
689 m = re.search(r'\((.*)\)', out[0])
690 if m and file == m.group(1):
696 # create file if necessary and assign the first free loop device
697 def init_loop(file, size, fstype, journal_size):
698 dev = find_loop(file)
700 print 'WARNING file:', file, 'already mapped to', dev
702 if config.reformat or not os.access(file, os.R_OK | os.W_OK):
704 panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (file,size))
705 (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size,
708 panic("Unable to create backing store:", file)
709 mkfs(file, size, fstype, journal_size)
712 # find next free loop
713 for n in xrange(0, MAX_LOOP_DEVICES):
715 if os.access(dev, os.R_OK):
716 (stat, out) = run('losetup', dev)
718 run('losetup', dev, file)
721 print "out of loop devices"
723 print "out of loop devices"
726 # undo loop assignment
727 def clean_loop(file):
728 dev = find_loop(file)
730 ret, out = run('losetup -d', dev)
732 log('unable to clean loop device:', dev, 'for file:', file)
735 # determine if dev is formatted as a <fstype> filesystem
736 def need_format(fstype, dev):
737 # FIXME don't know how to implement this
740 # initialize a block device if needed
741 def block_dev(dev, size, fstype, format, journal_size):
742 if config.noexec: return dev
743 if not is_block(dev):
744 dev = init_loop(dev, size, fstype, journal_size)
745 elif config.reformat or (need_format(fstype, dev) and format == 'yes'):
746 mkfs(dev, size, fstype, journal_size)
749 # panic("device:", dev,
750 # "not prepared, and autoformat is not set.\n",
751 # "Rerun with --reformat option to format ALL filesystems")
756 """lookup IP address for an interface"""
757 rc, out = run("/sbin/ifconfig", iface)
760 addr = string.split(out[1])[1]
761 ip = string.split(addr, ':')[1]
764 def get_local_nid(net_type, wildcard):
765 """Return the local nid."""
767 if os.access('/proc/elan/device0/position', os.R_OK):
768 local = get_local_address('elan', '*')
770 local = get_local_address(net_type, wildcard)
773 def get_local_address(net_type, wildcard):
774 """Return the local address for the network type."""
776 if net_type in ('tcp', 'toe'):
778 iface, star = string.split(wildcard, ':')
779 local = if2addr(iface)
781 panic ("unable to determine ip for:", wildcard)
783 host = socket.gethostname()
784 local = socket.gethostbyname(host)
785 elif net_type == 'elan':
786 # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
788 fp = open('/proc/elan/device0/position', 'r')
789 lines = fp.readlines()
798 elif net_type == 'gm':
799 fixme("automatic local address for GM")
800 elif net_type == 'scimac':
801 scinode="/opt/scali/sbin/scinode"
802 if os.path.exists(scinode):
803 (rc,local) = run(scinode)
805 panic (scinode, " not found on node with scimac networking")
807 panic (scinode, " failed")
808 local=string.rstrip(local[0])
812 # XXX: instead of device_list, ask for $name and see what we get
813 def is_prepared(name):
814 """Return true if a device exists for the name"""
817 if config.noexec and config.cleanup:
820 # expect this format:
821 # 1 UP ldlm ldlm ldlm_UUID 2
822 out = lctl.device_list()
824 if name == string.split(s)[3]:
826 except CommandError, e:
830 def is_network_prepared():
831 """If the LDLM device exists, then assume that all networking
832 has been configured"""
833 return is_prepared('ldlm')
835 def fs_is_mounted(path):
836 """Return true if path is a mounted lustre filesystem"""
838 fp = open('/proc/mounts')
839 lines = fp.readlines()
843 if a[1] == path and a[2] == 'lustre_lite':
850 # ============================================================
851 # Classes to prepare and cleanup the various objects
854 """ Base class for the rest of the modules. The default cleanup method is
855 defined here, as well as some utility funcs.
857 def __init__(self, module_name, db):
859 self.module_name = module_name
860 self.name = self.db.getName()
861 self.uuid = self.db.getUUID()
862 self.kmodule_list = []
def info(self, *args):
    """Print a one-line status message tagged with this module's type,
    device name, and uuid (e.g. "NETWORK: name uuid msg")."""
    msg = string.join(map(str,args))
    print self.module_name + ":", self.name, self.uuid, msg
871 """ default cleanup, used for most modules """
874 lctl.cleanup(self.name, self.uuid, config.force)
875 except CommandError, e:
876 log(self.module_name, "cleanup failed: ", self.name)
880 def add_portals_module(self, dev_dir, modname):
881 """Append a module to list of modules to load."""
882 self.kmodule_list.append((config.portals, dev_dir, modname))
884 def add_lustre_module(self, dev_dir, modname):
885 """Append a module to list of modules to load."""
886 self.kmodule_list.append((config.lustre, dev_dir, modname))
888 def mod_loaded(self, modname):
889 """Check if a module is already loaded. Look in /proc/modules for it."""
890 fp = open('/proc/modules')
891 lines = fp.readlines()
893 # please forgive my tired fingers for this one
894 ret = filter(lambda word, mod=modname: word == mod,
895 map(lambda line: string.split(line)[0], lines))
898 def load_module(self):
899 """Load all the modules in the list in the order they appear."""
900 for src_dir, dev_dir, mod in self.kmodule_list:
901 # (rc, out) = run ('/sbin/lsmod | grep -s', mod)
902 if self.mod_loaded(mod) and not config.noexec:
904 log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
906 module = find_module(src_dir, dev_dir, mod)
908 panic('module not found:', mod)
909 (rc, out) = run('/sbin/insmod', module)
911 raise CommandError('insmod', out, rc)
913 (rc, out) = run('/sbin/modprobe', mod)
915 raise CommandError('modprobe', out, rc)
917 def cleanup_module(self):
918 """Unload the modules in the list in reverse order."""
919 if not self.safe_to_clean():
921 rev = self.kmodule_list
923 for src_dir, dev_dir, mod in rev:
924 if not self.mod_loaded(mod) and not config.noexec:
927 if mod == 'portals' and config.dump:
928 lctl.dump(config.dump)
929 log('unloading module:', mod)
930 (rc, out) = run('/sbin/rmmod', mod)
932 log('! unable to unload module:', mod)
935 def safe_to_clean(self):
938 def safe_to_clean_modules(self):
939 return self.safe_to_clean()
941 class Network(Module):
942 def __init__(self,db):
943 Module.__init__(self, 'NETWORK', db)
944 self.net_type = self.db.get_val('nettype')
945 self.nid = self.db.get_val('nid', '*')
946 self.cluster_id = self.db.get_val('clusterid', "0")
947 self.port = self.db.get_val_int('port', 0)
948 self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
949 self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
950 self.irq_affinity = self.db.get_val_int('irqaffinity', 0)
951 self.nid_exchange = self.db.get_val_int('nidexchange', 0)
954 if self.nid_exchange:
955 self.nid = get_local_nid(self.net_type, self.nid)
957 self.nid = get_local_address(self.net_type, self.nid)
959 panic("unable to set nid for", self.net_type, self.nid)
960 debug("nid:", self.nid)
962 self.hostaddr = self.db.get_val('hostaddr', self.nid)
963 if '*' in self.hostaddr:
964 self.hostaddr = get_local_address(self.net_type, self.hostaddr)
966 panic("unable to set nid for", self.net_type, self.hostaddr)
967 debug("hostaddr:", self.hostaddr)
969 self.add_portals_module("libcfs", 'portals')
970 if node_needs_router():
971 self.add_portals_module("router", 'kptlrouter')
972 if self.net_type == 'tcp':
973 self.add_portals_module("knals/socknal", 'ksocknal')
974 if self.net_type == 'toe':
975 self.add_portals_module("knals/toenal", 'ktoenal')
976 if self.net_type == 'elan':
977 self.add_portals_module("knals/qswnal", 'kqswnal')
978 if self.net_type == 'gm':
979 self.add_portals_module("knals/gmnal", 'kgmnal')
980 if self.net_type == 'scimac':
981 self.add_portals_module("knals/scimacnal", 'kscimacnal')
984 if is_network_prepared():
986 self.info(self.net_type, self.nid, self.port)
987 lctl.network(self.net_type, self.nid)
988 if self.port and node_is_router():
989 run_one_acceptor(self.port)
990 self.connect_peer_gateways()
992 def connect_peer_gateways(self):
993 for router in self.db.lookup_class('node'):
994 if router.get_val_int('router', 0):
995 for netuuid in router.get_networks():
996 net = self.db.lookup(netuuid)
998 if (gw.cluster_id == self.cluster_id and
999 gw.net_type == self.net_type):
1000 # hack: compare as numbers if possible, this should all
1001 # go away once autoconnect is done.
1002 # This also conveniently prevents us from connecting to ourself.
1004 gw_nid = my_int(gw.nid)
1005 self_nid = my_int(self.nid)
1006 except ValueError, e:
1009 if gw_nid != self_nid:
1012 def disconnect_peer_gateways(self):
1013 for router in self.db.lookup_class('node'):
1014 if router.get_val_int('router', 0):
1015 for netuuid in router.get_networks():
1016 net = self.db.lookup(netuuid)
1018 if (gw.cluster_id == self.cluster_id and
1019 gw.net_type == self.net_type):
1020 # hack: compare as numbers if possible, this should all
1021 # go away once autoconnect is done.
1022 # This also conveniently prevents us from connecting to ourself.
1024 gw_nid = my_int(gw.nid)
1025 self_nid = my_int(self.nid)
1026 except ValueError, e:
1029 if gw_nid != self_nid:
1032 except CommandError, e:
1033 print "disconnect failed: ", self.name
1037 def safe_to_clean(self):
1038 return not is_network_prepared()
1041 self.info(self.net_type, self.nid, self.port)
1043 stop_acceptor(self.port)
1044 if node_is_router():
1045 self.disconnect_peer_gateways()
1047 # This commented out so connections not created by this
1048 # config are not disturbed
1051 # lctl.disconnectAll(self.net_type)
1052 # except CommandError, e:
1053 # print "disconnectAll failed: ", self.name
1055 # cleanup_error(e.rc)
1057 class RouteTable(Module):
1058 def __init__(self,db):
1059 Module.__init__(self, 'ROUTES', db)
1061 if is_network_prepared():
1064 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1065 lctl.add_route(net_type, gw, lo, hi)
1066 if net_type in ('tcp', 'toe') and local_net_type(net_type, tgt_cluster_id) and lo == hi:
1067 srvdb = self.db.nid2server(lo, net_type)
1069 panic("no server for nid", lo)
1071 srv = Network(srvdb)
1074 def safe_to_clean(self):
1075 return not is_network_prepared()
1078 if is_network_prepared():
1079 # the network is still being used, don't clean it up
1081 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1082 if net_type in ('tcp', 'toe') and local_net_type(net_type, tgt_cluster_id) and lo == hi:
1083 srvdb = self.db.nid2server(lo, net_type)
1085 panic("no server for nid", lo)
1087 srv = Network(srvdb)
1089 lctl.disconnect(srv)
1090 except CommandError, e:
1091 print "disconnect failed: ", self.name
1095 lctl.del_route(net_type, gw, lo, hi)
1096 except CommandError, e:
1097 print "del_route failed: ", self.name
1101 class Management(Module):
1102 def __init__(self, db):
1103 Module.__init__(self, 'MGMT', db)
1104 self.add_lustre_module('obdclass', 'obdclass')
1105 self.add_lustre_module('ptlrpc', 'ptlrpc')
1106 self.add_lustre_module('ldlm', 'ldlm')
1107 self.add_lustre_module('mgmt', 'mgmt_svc')
1110 if is_prepared(self.name):
1113 lctl.newdev(attach="mgmt %s %s" % (self.name, self.uuid))
1115 def safe_to_clean(self):
1119 if is_prepared(self.name):
1120 Module.cleanup(self)
1123 def __init__(self,db):
1124 Module.__init__(self, 'LDLM', db)
1125 self.add_lustre_module('obdclass', 'obdclass')
1126 self.add_lustre_module('ptlrpc', 'ptlrpc')
1127 self.add_lustre_module('ldlm', 'ldlm')
1130 if is_prepared(self.name):
1133 lctl.newdev(attach="ldlm %s %s" % ('ldlm', 'ldlm_UUID'))
def safe_to_clean(self):
    """LDLM is only safe to tear down when it is the last entry left in
    lctl's device list (i.e. no other configured device remains)."""
    out = lctl.device_list()
    return len(out) <= 1
1140 if is_prepared(self.name):
1141 Module.cleanup(self)
1144 def __init__(self, db, uuid, fs_name):
1145 Module.__init__(self, 'LOV', db)
1146 self.add_lustre_module('mdc', 'mdc')
1147 self.add_lustre_module('lov', 'lov')
1148 self.mds_uuid = self.db.get_first_ref('mds')
1149 mds= self.db.lookup(self.mds_uuid)
1150 self.mds_name = mds.getName()
1151 self.stripe_sz = self.db.get_val_int('stripesize', 65536)
1152 self.stripe_off = self.db.get_val_int('stripeoffset', 0)
1153 self.pattern = self.db.get_val_int('stripepattern', 0)
1154 self.devlist = self.db.get_refs('obd')
1155 self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
1157 self.client_uuid = generate_client_uuid(self.name)
1158 self.fs_name = fs_name
1160 self.mdc = get_mdc(db, self.client_uuid, fs_name, self.mds_uuid)
1161 for obd_uuid in self.devlist:
1162 obd = self.db.lookup(obd_uuid)
1163 osc = get_osc(obd, self.client_uuid, fs_name)
1165 self.osclist.append(osc)
1167 panic('osc not found:', obd_uuid)
1170 if is_prepared(self.name):
1172 for osc in self.osclist:
1174 # Only ignore connect failures with --force, which
1175 # isn't implemented here yet.
1176 osc.prepare(ignore_connect_failure=0)
1177 except CommandError, e:
1178 print "Error preparing OSC %s\n" % osc.uuid
1181 self.mdc_name = self.mdc.name
1182 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
1183 self.stripe_off, self.pattern, self.devlist, self.mds_name)
1184 lctl.newdev(attach="lov %s %s" % (self.name, self.uuid),
1185 setup ="%s" % (self.mdc_name))
1188 if is_prepared(self.name):
1189 Module.cleanup(self)
1190 for osc in self.osclist:
1192 mdc = get_mdc(self.db, self.client_uuid, self.fs_name, self.mds_uuid)
1195 def load_module(self):
1196 for osc in self.osclist:
1199 Module.load_module(self)
1201 def cleanup_module(self):
1202 Module.cleanup_module(self)
1203 for osc in self.osclist:
1204 osc.cleanup_module()
1207 class LOVConfig(Module):
1208 def __init__(self, db):
1209 Module.__init__(self, 'LOVConfig', db)
1211 self.lov_uuid = self.db.get_first_ref('lov')
1212 l = self.db.lookup(self.lov_uuid)
1213 self.lov = LOV(l, "YOU_SHOULD_NEVER_SEE_THIS_UUID", '')
1217 self.info(lov.mds_uuid, lov.stripe_cnt, lov.stripe_sz, lov.stripe_off,
1218 lov.pattern, lov.devlist, lov.mds_name)
1219 lctl.lov_setconfig(lov.uuid, lov.mds_name, lov.stripe_cnt,
1220 lov.stripe_sz, lov.stripe_off, lov.pattern,
1221 string.join(lov.devlist))
1227 class MDSDEV(Module):
1228 def __init__(self,db):
1229 Module.__init__(self, 'MDSDEV', db)
1230 self.devpath = self.db.get_val('devpath','')
1231 self.size = self.db.get_val_int('devsize', 0)
1232 self.journal_size = self.db.get_val_int('journalsize', 0)
1233 self.fstype = self.db.get_val('fstype', '')
1234 self.nspath = self.db.get_val('nspath', '')
1235 # overwrite the original MDSDEV name and uuid with the MDS name and uuid
1236 target_uuid = self.db.get_first_ref('target')
1237 mds = self.db.lookup(target_uuid)
1238 self.name = mds.getName()
1239 self.lovconfig_uuids = mds.get_refs('lovconfig')
1240 self.filesystem_uuids = mds.get_refs('filesystem')
1241 # FIXME: if fstype not set, then determine based on kernel version
1242 self.format = self.db.get_val('autoformat', "no")
1243 if mds.get_val('failover', 0):
1244 self.failover_mds = 'f'
1246 self.failover_mds = 'n'
1247 active_uuid = get_active_target(mds)
1249 panic("No target device found:", target_uuid)
1250 if active_uuid == self.uuid:
1254 if self.active and config.group and config.group != ost.get_val('group'):
1257 self.target_dev_uuid = self.uuid
1258 self.uuid = target_uuid
1260 self.add_lustre_module('mds', 'mds')
1262 self.add_lustre_module('obdclass', 'fsfilt_%s' % (self.fstype))
1264 def load_module(self):
1266 Module.load_module(self)
1269 if is_prepared(self.name):
1272 debug(self.uuid, "not active")
1274 self.info(self.devpath, self.fstype, self.format)
1276 blkdev = block_dev(self.devpath, self.size, self.fstype, self.format,
1278 if not is_prepared('MDT'):
1279 lctl.newdev(attach="mdt %s %s" % ('MDT', 'MDT_UUID'),
1282 run ("mkdir", self.nspath)
1283 lctl.newdev(attach="mds %s %s" % (self.name, self.uuid),
1284 setup ="%s %s %s" %(blkdev, self.fstype, self.nspath))
1285 for uuid in self.lovconfig_uuids:
1286 db = self.db.lookup(uuid)
1287 lovconfig = LOVConfig(db)
1289 if config.mds_ost_conn:
1290 for uuid in self.filesystem_uuids:
1291 log("open clients for filesystem:", uuid)
1292 fs = self.db.lookup(uuid)
1293 obd_uuid = fs.get_first_ref('obd')
1294 client_uuid = generate_client_uuid(self.name)
1295 client = VOSC(client_uuid, self.db.lookup(obd_uuid), self.name)
1299 def msd_remaining(self):
1300 out = lctl.device_list()
1302 if string.split(s)[2] in ('mds',):
1305 def safe_to_clean(self):
1308 def safe_to_clean_modules(self):
1309 return not self.msd_remaining()
1313 debug(self.uuid, "not active")
1315 if is_prepared(self.name):
1318 lctl.cleanup(self.name, self.uuid, config.force,
1320 except CommandError, e:
1321 log(self.module_name, "cleanup failed: ", self.name)
1324 Module.cleanup(self)
1325 if config.mds_ost_conn:
1326 for uuid in self.filesystem_uuids:
1327 log("clean clients for filesystem:", uuid)
1328 log("open clients for filesystem:", uuid)
1329 fs = self.db.lookup(uuid)
1330 obd_uuid = fs.get_first_ref('obd')
1331 client = VOSC(self.db.lookup(obd_uuid), self.name)
1333 if not self.msd_remaining() and is_prepared('MDT'):
1335 lctl.cleanup("MDT", "MDT_UUID", config.force,
1337 except CommandError, e:
1338 print "cleanup failed: ", self.name
1341 clean_loop(self.devpath)
1344 def __init__(self, db):
1345 Module.__init__(self, 'OSD', db)
1346 self.osdtype = self.db.get_val('osdtype')
1347 self.devpath = self.db.get_val('devpath', '')
1348 self.size = self.db.get_val_int('devsize', 0)
1349 self.journal_size = self.db.get_val_int('journalsize', 0)
1350 self.fstype = self.db.get_val('fstype', '')
1351 self.nspath = self.db.get_val('nspath', '')
1352 target_uuid = self.db.get_first_ref('target')
1353 ost = self.db.lookup(target_uuid)
1354 self.name = ost.getName()
1355 self.format = self.db.get_val('autoformat', 'yes')
1356 if ost.get_val('failover', 0):
1357 self.failover_ost = 'f'
1359 self.failover_ost = 'n'
1361 active_uuid = get_active_target(ost)
1363 panic("No target device found:", target_uuid)
1364 if active_uuid == self.uuid:
1368 if self.active and config.group and config.group != ost.get_val('group'):
1371 self.target_dev_uuid = self.uuid
1372 self.uuid = target_uuid
1374 self.add_lustre_module('ost', 'ost')
1375 # FIXME: should we default to ext3 here?
1377 self.add_lustre_module('obdclass' , 'fsfilt_%s' % (self.fstype))
1378 self.add_lustre_module(self.osdtype, self.osdtype)
1380 def load_module(self):
1382 Module.load_module(self)
1384 # need to check /proc/mounts and /etc/mtab before
1385 # formatting anything.
1386 # FIXME: check if device is already formatted.
1388 if is_prepared(self.name):
1391 debug(self.uuid, "not active")
1393 self.info(self.osdtype, self.devpath, self.size, self.fstype,
1394 self.format, self.journal_size)
1396 if self.osdtype == 'obdecho':
1399 blkdev = block_dev(self.devpath, self.size, self.fstype,
1400 self.format, self.journal_size)
1402 run ("mkdir", self.nspath)
1403 lctl.newdev(attach="%s %s %s" % (self.osdtype, self.name, self.uuid),
1404 setup ="%s %s %s %s" %(blkdev, self.fstype,
1405 self.failover_ost, self.nspath))
1406 if not is_prepared('OSS'):
1407 lctl.newdev(attach="ost %s %s" % ('OSS', 'OSS_UUID'),
1410 def osd_remaining(self):
1411 out = lctl.device_list()
1413 if string.split(s)[2] in ('obdfilter', 'obdecho'):
1416 def safe_to_clean(self):
1419 def safe_to_clean_modules(self):
1420 return not self.osd_remaining()
1424 debug(self.uuid, "not active")
1426 if is_prepared(self.name):
1429 lctl.cleanup(self.name, self.uuid, config.force,
1431 except CommandError, e:
1432 log(self.module_name, "cleanup failed: ", self.name)
1435 if not self.osd_remaining() and is_prepared('OSS'):
1437 lctl.cleanup("OSS", "OSS_UUID", config.force,
1439 except CommandError, e:
1440 print "cleanup failed: ", self.name
1443 if not self.osdtype == 'obdecho':
1444 clean_loop(self.devpath)
1446 def mgmt_uuid_for_fs(mtpt_name):
1449 mtpt_db = toplevel.lookup_name(mtpt_name)
1450 fs_uuid = mtpt_db.get_first_ref('filesystem')
1451 fs = toplevel.lookup(fs_uuid)
1454 return fs.get_first_ref('mgmt')
1456 # Generic client module, used by OSC and MDC
# NOTE(review): mangled extract (stray line numbers, stripped indents,
# missing interior lines).  Client is the generic client-side device
# used by OSC and MDC: it resolves the active target device, derives a
# device name, and connects to a server either directly or via a route.
1457 class Client(Module):
# __init__ (fragment): record target identity, pick the active target
# device, build a host-qualified device name unless self_name overrides
# it, then resolve server networks and the management-client name.
1458 def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
1460 self.target_name = tgtdb.getName()
1461 self.target_uuid = tgtdb.getUUID()
1464 self.tgt_dev_uuid = get_active_target(tgtdb)
1465 if not self.tgt_dev_uuid:
1466 panic("No target device found for target:", self.target_name)
1468 self.kmodule_list = []
1472 self.module = module
1473 self.module_name = string.upper(module)
# Default device name: MODULE_host_target_fs; self_name wins if given.
1475 self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
1476 self.target_name, fs_name)
1478 self.name = self_name
1480 self.lookup_server(self.tgt_dev_uuid)
1481 mgmt_uuid = mgmt_uuid_for_fs(fs_name)
1483 self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
1486 self.fs_name = fs_name
1489 self.add_lustre_module(module_dir, module)
# lookup_server: cache the server networks for the target device.
1491 def lookup_server(self, srv_uuid):
1492 """ Lookup a server's network information """
1493 self._server_nets = get_ost_net(self.db, srv_uuid)
1494 if len(self._server_nets) == 0:
1495 panic ("Unable to find a server for:", srv_uuid)
1497 def get_servers(self):
1498 return self._server_nets
# prepare (fragment): connect to a local server if one exists, else add
# a routed connection; then attach/setup the client device via lctl.
# Connect errors are tolerated when ignore_connect_failure is set.
1500 def prepare(self, ignore_connect_failure = 0):
1501 self.info(self.target_uuid)
1502 if is_prepared(self.name):
1505 srv = choose_local_server(self.get_servers())
1509 srv, r = find_route(self.get_servers())
1511 lctl.add_route_host(r[0], srv.uuid, r[1], r[3])
1513 panic ("no route to", self.target_uuid)
1514 except CommandError, e:
1515 if not ignore_connect_failure:
1518 lctl.newdev(attach="%s %s %s" % (self.module, self.name, self.uuid),
1519 setup ="%s %s %s" % (self.target_uuid, srv.uuid,
# cleanup (fragment; 'def' line not visible): mirror of prepare --
# detach the device, then disconnect or remove the route.
1523 if is_prepared(self.name):
1524 Module.cleanup(self)
1526 srv = choose_local_server(self.get_servers())
1528 lctl.disconnect(srv)
1530 srv, r = find_route(self.get_servers())
1532 lctl.del_route_host(r[0], srv.uuid, r[1], r[3])
1533 except CommandError, e:
1534 log(self.module_name, "cleanup failed: ", self.name)
def __init__(self, db, uuid, fs_name):
    """Metadata client: a thin Client specialised for the 'mdc' module."""
    Client.__init__(self, db, uuid, 'mdc', fs_name)
def __init__(self, db, uuid, fs_name):
    """Object storage client: a thin Client specialised for 'osc'."""
    Client.__init__(self, db, uuid, 'osc', fs_name)
def mgmtcli_name_for_uuid(uuid):
    """Derive the canonical management-client device name for *uuid*."""
    return 'MGMTCLI_%s' % (uuid,)
class ManagementClient(Client):
    """Client for the management service ('mgmt_cli' module).

    Unlike OSC/MDC it uses a deterministic device name derived from the
    management service UUID rather than the default per-fs client name.
    """
    def __init__(self, db, uuid):
        Client.__init__(self, db, uuid, 'mgmt_cli', '',
                        self_name=mgmtcli_name_for_uuid(db.getUUID()),
                        module_dir='mgmt')
def __init__(self, db):
    """Caching OBD: pairs a cache device in front of a real device."""
    Module.__init__(self, 'COBD', db)
    # UUID references to the backing ("real") and caching devices.
    self.real_uuid = self.db.get_first_ref('realobd')
    self.cache_uuid = self.db.get_first_ref('cacheobd')
    self.add_lustre_module('cobd', 'cobd')
# COBD.prepare (fragment; 'def' line not visible): attach and set up
# the cobd device over its real/cache device pair via lctl.
1564 # need to check /proc/mounts and /etc/mtab before
1565 # formatting anything.
1566 # FIXME: check if device is already formatted.
1568 if is_prepared(self.name):
1570 self.info(self.real_uuid, self.cache_uuid)
1571 lctl.newdev(attach="cobd %s %s" % (self.name, self.uuid),
1572 setup ="%s %s" %(self.real_uuid, self.cache_uuid))
1575 # virtual interface for OSC and LOV
# VOSC (fragment; the 'class' line is not visible): wraps either an LOV
# or a plain OSC behind one interface, chosen by the db class.
1577 def __init__(self, db, uuid, fs_name):
1578 Module.__init__(self, 'VOSC', db)
1579 if db.get_class() == 'lov':
1580 self.osc = LOV(db, uuid, fs_name)
1582 self.osc = get_osc(db, uuid, fs_name)
# Accessors below simply forward to the wrapped osc/lov object
# (their 'def' lines are missing from this extract).
1584 return self.osc.uuid
1586 return self.osc.name
1591 def load_module(self):
1592 self.osc.load_module()
1593 def cleanup_module(self):
1594 self.osc.cleanup_module()
# need_mdc: an LOV carries its own MDC, a plain OSC does not.
1596 return self.db.get_class() != 'lov'
1597 def get_mdc_name(self):
1598 if self.db.get_class() == 'lov':
1599 return self.osc.mdc_name
# ECHO_CLIENT: test client that attaches an echo_client device on top
# of a VOSC-wrapped obd (fragment -- interior lines are missing).
1603 class ECHO_CLIENT(Module):
1604 def __init__(self,db):
1605 Module.__init__(self, 'ECHO_CLIENT', db)
1606 self.add_lustre_module('obdecho', 'obdecho')
1607 self.obd_uuid = self.db.get_first_ref('obd')
1608 obd = self.db.lookup(self.obd_uuid)
1609 self.osc = VOSC(obd, self.uuid, self.name)
# prepare (fragment; 'def' line not visible): set up the underlying
# osc first, then attach the echo_client device against it.
1612 if is_prepared(self.name):
1614 self.osc.prepare() # XXX This is so cheating. -p
1615 self.info(self.obd_uuid)
1617 lctl.newdev(attach="echo_client %s %s" % (self.name, self.uuid),
1618 setup = self.osc.get_name())
# cleanup (fragment): detach this device; osc teardown lines missing.
1621 if is_prepared(self.name):
1622 Module.cleanup(self)
# Module load/unload mirror each other: osc first on load, last on
# unload.
1625 def load_module(self):
1626 self.osc.load_module()
1627 Module.load_module(self)
1629 def cleanup_module(self):
1630 Module.cleanup_module(self)
1631 self.osc.cleanup_module()
def generate_client_uuid(name):
    """Generate a pseudo-random client UUID of at most 36 characters.

    The UUID embeds (a copy of) *name*, truncated to 19 characters,
    between random hex fields so client devices are distinguishable.
    NOTE: random.random() is not cryptographically secure; these are
    identifiers, not secrets.
    """
    # BUG FIX: the format string has four conversions but the visible
    # original supplied only three values ('name' was missing), which
    # would raise TypeError.  Restore 'name' as the %.19s field.
    client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
                                           name,
                                           int(random.random() * 1048576),
                                           int(random.random() * 1048576))
    # Hard cap at the conventional 36-character UUID length.
    return client_uuid[:36]
# Mountpoint: the client-side filesystem mount.  Builds the osc/mdc
# client stack and runs the lustre_lite mount (fragment -- interior
# lines are missing throughout; comments cover only visible code).
1642 class Mountpoint(Module):
1643 def __init__(self,db):
1644 Module.__init__(self, 'MTPT', db)
1645 self.path = self.db.get_val('path')
1646 self.fs_uuid = self.db.get_first_ref('filesystem')
1647 fs = self.db.lookup(self.fs_uuid)
1648 self.mds_uuid = fs.get_first_ref('mds')
1649 self.obd_uuid = fs.get_first_ref('obd')
1650 self.mgmt_uuid = fs.get_first_ref('mgmt')
1651 obd = self.db.lookup(self.obd_uuid)
1652 client_uuid = generate_client_uuid(self.name)
1653 self.vosc = VOSC(obd, client_uuid, self.name)
# A separate MDC is only needed when the VOSC wraps a plain OSC;
# an LOV already carries its own MDC.
1654 if self.vosc.need_mdc():
1655 self.add_lustre_module('mdc', 'mdc')
1656 self.mdc = get_mdc(db, client_uuid, self.name, self.mds_uuid)
1657 self.add_lustre_module('llite', 'llite')
1659 self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
# prepare (fragment; 'def' line not visible): set up the client stack,
# then either record the mount options (--lctl_dump) or run mount.
1665 if fs_is_mounted(self.path):
1666 log(self.path, "already mounted.")
1669 self.mgmtcli.prepare()
1671 if self.vosc.need_mdc():
1673 mdc_name = self.mdc.name
1675 mdc_name = self.vosc.get_mdc_name()
1678 panic("Unable to determine MDC name. Probably need to cleanup before re-mounting.")
1679 self.info(self.path, self.mds_uuid, self.obd_uuid)
1680 if config.lctl_dump:
1681 cmd = "osc=%s,mdc=%s" % (self.vosc.get_name(), mdc_name)
1682 lctl.mount_option(cmd)
1684 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s %s %s" % \
1685 (self.vosc.get_name(), mdc_name, config.config, self.path)
1686 run("mkdir", self.path)
1690 if self.vosc.need_mdc():
1692 panic("mount failed:", self.path, ":", string.join(val))
# cleanup (fragment): force-umount falls back from "umount -f" to a
# plain umount, then verifies the fs really went away.
1695 self.info(self.path, self.mds_uuid,self.obd_uuid)
1696 if fs_is_mounted(self.path):
1698 (rc, out) = run("umount", "-f", self.path)
1700 (rc, out) = run("umount", self.path)
1702 raise CommandError('umount', out, rc)
1704 if fs_is_mounted(self.path):
1705 panic("fs is still mounted:", self.path)
1708 if self.vosc.need_mdc():
1711 self.mgmtcli.cleanup()
# Module load/unload: mgmtcli and vosc are loaded before, and unloaded
# after, this module itself.
1713 def load_module(self):
1715 self.mgmtcli.load_module()
1716 self.vosc.load_module()
1717 Module.load_module(self)
1719 def cleanup_module(self):
1720 Module.cleanup_module(self)
1721 self.vosc.cleanup_module()
1723 self.mgmtcli.cleanup_module()
1726 # ============================================================
1727 # misc query functions
# get_ost_net: collect Network objects for every network of the node
# hosting the given osd (fragment -- the srv_list initialisation and
# return lines are not visible in this extract).
1729 def get_ost_net(self, osd_uuid):
1733 osd = self.lookup(osd_uuid)
1734 node_uuid = osd.get_first_ref('node')
1735 node = self.lookup(node_uuid)
1737 panic("unable to find node for osd_uuid:", osd_uuid,
1738 " node_ref:", node_uuid)
1739 for net_uuid in node.get_networks():
1740 db = node.lookup(net_uuid)
1741 srv_list.append(Network(db))
1745 # the order of initialization is based on level.
# getServiceLevel: map a service class to its startup level; lower
# levels start first (fragment -- the 'ret =' assignments between the
# elif arms are not visible).  Out-of-range levels are filtered by the
# --minlevel/--maxlevel options.
1746 def getServiceLevel(self):
1747 type = self.get_class()
1749 if type in ('network',):
1751 elif type in ('routetbl',):
1753 elif type in ('ldlm',):
1755 elif type in ('mgmt',):
1757 elif type in ('osd', 'cobd'):
1759 elif type in ('mdsdev',):
1761 elif type in ('mountpoint', 'echoclient'):
1764 panic("Unknown type: ", type)
1766 if ret < config.minlevel or ret > config.maxlevel:
1771 # return list of services in a profile. list is a list of tuples
1772 # [(level, db_object),]
# getServices (fragment): resolve each profile reference, compute its
# level and collect (level, db) pairs; missing refs are fatal.
1773 def getServices(self):
1775 for ref_class, ref_uuid in self.get_all_refs():
1776 servdb = self.lookup(ref_uuid)
1778 level = getServiceLevel(servdb)
1780 list.append((level, servdb))
1782 panic('service not found: ' + ref_uuid)
1788 ############################################################
1790 # FIXME: clean this mess up!
1792 # OSC is no longer in the xml, so we have to fake it.
1793 # this is getting ugly and begging for another refactoring
# get_osc / get_mdc: factory helpers synthesising client objects that
# no longer appear in the XML (fragments -- return lines not visible).
1794 def get_osc(ost_db, uuid, fs_name):
1795 osc = OSC(ost_db, uuid, fs_name)
1798 def get_mdc(db, uuid, fs_name, mds_uuid):
1799 mds_db = db.lookup(mds_uuid);
1801 panic("no mds:", mds_uuid)
1802 mdc = MDC(mds_db, uuid, fs_name)
1805 ############################################################
1806 # routing ("rooting")
1808 # list of (nettype, cluster_id)
# find_local_clusters (fragment): record every (nettype, cluster_id)
# this node belongs to, and create an AcceptorHandler per listening
# port -- duplicate ports are a configuration error.
1811 def find_local_clusters(node_db):
1812 global local_clusters
1813 for netuuid in node_db.get_networks():
1814 net = node_db.lookup(netuuid)
1816 debug("add_local", netuuid)
1817 local_clusters.append((srv.net_type, srv.cluster_id))
1819 if acceptors.has_key(srv.port):
1820 panic("duplicate port:", srv.port)
1821 acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
1822 srv.send_mem, srv.recv_mem,
1826 # This node is a gateway.
# node_is_router (fragment -- its 'return' body is not visible here).
1828 def node_is_router():
1831 # If there are any routers found in the config, then this will be true
1832 # and all nodes will load kptlrouter.
def node_needs_router():
    """Whether this node must load kptlrouter: either the config
    contains routers it must reach through, or it is one itself."""
    if needs_router:
        return needs_router
    return is_router
1837 # list of (nettype, gw, tgt_cluster_id, lo, hi)
1838 # Currently, these local routes are only added to kptlrouter route
1839 # table if they are needed to connect to a specific server. This
1840 # should be changed so all available routes are loaded, and the
1841 # ptlrouter can make all the decisions.
# find_local_routes (fragment): for every router node in the config
# that shares a (nettype, cluster_id) with this node, collect the
# routes reachable through its gateway nid into local_routes.
1844 def find_local_routes(lustre):
1845 """ Scan the lustre config looking for routers . Build list of
1847 global local_routes, needs_router
1849 list = lustre.lookup_class('node')
1851 if router.get_val_int('router', 0):
1853 for (local_type, local_cluster_id) in local_clusters:
1855 for netuuid in router.get_networks():
1856 db = router.lookup(netuuid)
1857 if (local_type == db.get_val('nettype') and
1858 local_cluster_id == db.get_val('clusterid')):
1859 gw = db.get_val('nid')
1862 debug("find_local_routes: gw is", gw)
1863 for route in router.get_local_routes(local_type, gw):
1864 local_routes.append(route)
1865 debug("find_local_routes:", local_routes)
# choose_local_server (fragment): first server on a directly reachable
# network, if any (its 'return' lines are not visible).
1868 def choose_local_server(srv_list):
1869 for srv in srv_list:
1870 if local_net_type(srv.net_type, srv.cluster_id):
# local_net_type (fragment): membership test against local_clusters.
1873 def local_net_type(net_type, cluster_id):
1874 for cluster in local_clusters:
1875 if net_type == cluster[0] and cluster_id == cluster[1]:
# find_route (fragment): match each candidate server against the
# local route table; r is (nettype, gw, tgt_cluster_id, lo, hi) and a
# hit requires the address in [lo, hi] and a matching cluster id.
1879 def find_route(srv_list):
1880 frm_type = local_clusters[0][0]
1881 for srv in srv_list:
1882 debug("find_route: srv:", srv.hostaddr, "type: ", srv.net_type)
1883 to_type = srv.net_type
1884 to = srv.hostaddr # XXX should this be hostaddr, or nid?
1885 cluster_id = srv.cluster_id
1886 debug ('looking for route to', to_type, to)
1887 for r in local_routes:
1888 debug("find_route: ", r)
1889 if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
# get_active_target (fragment): honour a --select override for this
# target, otherwise fall back to the 'active' reference in the config.
1893 def get_active_target(db):
1894 target_uuid = db.getUUID()
1895 target_name = db.getName()
1896 node_name = get_select(target_name)
1898 tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
1900 tgt_dev_uuid = db.get_first_ref('active')
1904 ############################################################
# newService (fragment; its 'def' line is not visible): factory mapping
# a config db class name to the matching Module subclass instance.
# Most of the constructor arms are missing from this extract.
1908 type = db.get_class()
1909 debug('Service:', type, db.getName(), db.getUUID())
1914 n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
1915 elif type == 'network':
1917 elif type == 'routetbl':
1921 elif type == 'cobd':
1923 elif type == 'mdsdev':
1925 elif type == 'mountpoint':
1927 elif type == 'echoclient':
1929 elif type == 'mgmt':
1932 panic ("unknown service type:", type)
1936 # Prepare the system to run lustre using a particular profile
1937 # in a the configuration.
1938 # * load & the modules
1939 # * setup networking for the current node
1940 # * make sure partitions are in place and prepared
1941 # * initialize devices with lctl
1942 # Levels is important, and needs to be enforced.
# for_each_profile (fragment): resolve each profile and hand its
# (level, db) service list to the given operation.  NOTE(review): the
# panic references 'profile', not 'prof_uuid' -- presumably assigned on
# a missing line; verify against upstream.
1943 def for_each_profile(db, prof_list, operation):
1944 for prof_uuid in prof_list:
1945 prof_db = db.lookup(prof_uuid)
1947 panic("profile:", profile, "not found.")
1948 services = getServices(prof_db)
# doSetup/doModules (fragments): instantiate each service and run its
# prepare/load step (iteration and ordering lines are not visible).
1951 def doSetup(services):
1955 n = newService(s[1])
1958 def doModules(services):
1962 n = newService(s[1])
# doCleanup/doUnloadModules (fragments): reverse of the above, guarded
# by the per-service safe_to_clean checks.
1965 def doCleanup(services):
1970 n = newService(s[1])
1971 if n.safe_to_clean():
1974 def doUnloadModules(services):
1979 n = newService(s[1])
1980 if n.safe_to_clean_modules():
# doHost (fragment): top-level driver for one node.  Finds the node's
# config entry, then runs recovery, cleanup or setup over its profiles.
# Many interior lines (loop headers, else branches) are not visible.
1985 def doHost(lustreDB, hosts):
1989 node_db = lustreDB.lookup_name(h, 'node')
1993 print 'No host entry found.'
# Node-level settings; command-line options may override these later
# (see sys_set_* helpers).
1996 is_router = node_db.get_val_int('router', 0)
1997 lustre_upcall = node_db.get_val('lustreUpcall', '')
1998 portals_upcall = node_db.get_val('portalsUpcall', '')
1999 timeout = node_db.get_val_int('timeout', 0)
2001 find_local_clusters(node_db)
2003 find_local_routes(lustreDB)
2005 # Two step process: (1) load modules, (2) setup lustre
2006 # if not cleaning, load modules first.
2007 prof_list = node_db.get_refs('profile')
# --recovery path: requires all three UUIDs naming the failed pieces.
2010 if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
2011 raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
2012 "--client_uuid <UUID> --conn_uuid <UUID>")
2013 doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
# --cleanup path: tear down devices, then unload modules.
2015 elif config.cleanup:
2017 # the command line can override this value
2019 # ugly hack, only need to run lctl commands for --dump
2020 if config.lctl_dump:
2021 for_each_profile(node_db, prof_list, doCleanup)
2024 sys_set_timeout(timeout)
2027 sys_set_lustre_upcall(lustre_upcall)
2028 sys_set_portals_upcall(portals_upcall)
2030 for_each_profile(node_db, prof_list, doCleanup)
2031 for_each_profile(node_db, prof_list, doUnloadModules)
# Default (setup) path: load modules first, then set up devices.
2034 # ugly hack, only need to run lctl commands for --dump
2035 if config.lctl_dump:
2036 for_each_profile(node_db, prof_list, doSetup)
2040 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
2041 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
2043 for_each_profile(node_db, prof_list, doModules)
2045 sys_set_debug_path()
# --gdb support: write a module script and pause so the user can
# attach gdb before devices come up.
2048 script = config.gdb_script
2049 run(lctl.lctl, ' modules >', script)
2051 log ("The GDB module script is in", script)
2052 # pause, so user has time to break and
2055 sys_set_timeout(timeout)
2056 sys_set_lustre_upcall(lustre_upcall)
2057 sys_set_portals_upcall(portals_upcall)
2059 for_each_profile(node_db, prof_list, doSetup)
# doRecovery (fragment): reconnect a failed client to the currently
# active target device -- drop the old connection uuid and tell lctl
# to recover against the new server's network.
2061 def doRecovery(db, lctl, tgt_uuid, client_uuid, conn_uuid):
2062 tgt = db.lookup(tgt_uuid)
2064 raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
2065 new_uuid = get_active_target(tgt)
2067 raise Lustre.LconfError("doRecovery: no active target found for: " +
2069 net = choose_local_server(get_ost_net(db, new_uuid))
2071 raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
2072 # XXX, better to do a full disconnect here
2073 log("Reconnecting", tgt_uuid, " to ", net.uuid);
2074 lctl.del_uuid(conn_uuid)
2076 lctl.recover(client_uuid, net.uuid)
# setupModulePath (fragment): derive config.lustre/config.portals.  In
# development mode (running from a source tree) both default relative
# to the lconf binary; otherwise only normalize an explicit --portals
# against --lustre.
2079 def setupModulePath(cmd, portals_dir = PORTALS_DIR):
2080 base = os.path.dirname(cmd)
2081 if development_mode():
2082 if not config.lustre:
2083 config.lustre = (os.path.join(base, ".."))
2084 # normalize the portals dir, using command line arg if set
2086 portals_dir = config.portals
2087 dir = os.path.join(config.lustre, portals_dir)
2088 config.portals = dir
2089 debug('config.portals', config.portals)
2090 elif config.lustre and config.portals:
2092 # if --lustre and --portals, normalize portals
2093 # can ignore PORTALS_DIR here, since it is probably useless here
2094 config.portals = os.path.join(config.lustre, config.portals)
2095 debug('config.portals B', config.portals)
# sysctl (fragment): write val to /proc/sys/<path>; the noexec guard
# and the write/close lines are not visible in this extract.
2097 def sysctl(path, val):
2098 debug("+ sysctl", path, val)
2102 fp = open(os.path.join('/proc/sys', path), 'w')
def sys_set_debug_path():
    """Tell portals where to dump the kernel debug log."""
    path = config.debug_path
    sysctl('portals/debug_path', path)
# sys_set_lustre_upcall (fragment): push the lustre upcall script path
# into /proc; --lustre_upcall wins over the node config, and the
# generic --upcall fallback line (elif arm) is not visible here.
2112 def sys_set_lustre_upcall(upcall):
2113 # the command overrides the value in the node config
2114 if config.lustre_upcall:
2115 upcall = config.lustre_upcall
2117 upcall = config.upcall
2119 sysctl('lustre/upcall', upcall)
# sys_set_portals_upcall (fragment): same pattern for portals.
2121 def sys_set_portals_upcall(upcall):
2122 # the command overrides the value in the node config
2123 if config.portals_upcall:
2124 upcall = config.portals_upcall
2126 upcall = config.upcall
2128 sysctl('portals/upcall', upcall)
def sys_set_timeout(timeout):
    """Write the lustre recovery timeout sysctl.

    The --timeout command-line option overrides the node-config value
    passed in; the sysctl is only written for a positive timeout.
    """
    # the command overrides the value in the node config
    if config.timeout > 0:
        timeout = config.timeout
    # 'is not None' rather than '!= None': identity is the correct
    # (and idiomatic) way to test for None.
    if timeout is not None and timeout > 0:
        sysctl('lustre/timeout', timeout)
# sys_set_ptldebug (fragment): evaluate the --ptldebug expression
# against the ptldebug_names mask table and write the resulting bitmask.
# NOTE(review): eval() on a command-line string -- acceptable only
# because lconf is a local admin tool, never fed untrusted input.
2137 def sys_set_ptldebug():
2138 if config.ptldebug != None:
2140 val = eval(config.ptldebug, ptldebug_names)
2141 val = "0x%x" % (val,)
2142 sysctl('portals/debug', val)
2143 except NameError, e:
# sys_set_subsystem (fragment): same pattern for the subsystem mask.
2146 def sys_set_subsystem():
2147 if config.subsystem != None:
2149 val = eval(config.subsystem, subsystem_names)
2150 val = "0x%x" % (val,)
2151 sysctl('portals/subsystem_debug', val)
2152 except NameError, e:
# sys_set_netmem_max (fragment): raise a net-core buffer limit to at
# least 'max'; the read-current-value lines are not visible here.
2155 def sys_set_netmem_max(path, max):
2156 debug("setting", path, "to at least", max)
2164 fp = open(path, 'w')
2165 fp.write('%d\n' %(max))
def sys_make_devices():
    """Create the portals and obd character device nodes when missing."""
    wanted = (('/dev/portals', 'mknod /dev/portals c 10 240'),
              ('/dev/obd', 'mknod /dev/obd c 10 241'))
    for node, mknod_cmd in wanted:
        if not os.access(node, os.R_OK):
            run(mknod_cmd)
2176 # Add dir to the global PATH, if not already there.
# add_to_path (fragment): the early 'return' line for the
# already-present case is not visible in this extract.
2177 def add_to_path(new_dir):
2178 syspath = string.split(os.environ['PATH'], ':')
2179 if new_dir in syspath:
2181 os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
# default_debug_path / default_gdb_script (fragments): prefer a
# ramdisk location under /r when present; the return lines for both
# branches are not fully visible.
2183 def default_debug_path():
2184 path = '/tmp/lustre-log'
2185 if os.path.isdir('/r'):
2190 def default_gdb_script():
2191 script = '/tmp/ogdb'
2192 if os.path.isdir('/r'):
2193 return '/r' + script
2198 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
2199 # ensure basic elements are in the system path
# sanitise_path (fragment): presumably calls add_to_path for each
# default dir -- the loop body is not visible.
2200 def sanitise_path():
2201 for dir in DEFAULT_PATH:
2204 # global hack for the --select handling
# init_select (fragment): parse "service=node,..." into the global
# tgt_select map consulted by get_select/get_active_target.
2206 def init_select(arg):
2207 # arg = "service=nodeA,service2=nodeB"
2210 list = string.split(arg, ',')
2212 srv, node = string.split(entry, '=')
2213 tgt_select[srv] = node
# get_select (fragment): the no-match return line is not visible.
2215 def get_select(srv):
2216 if tgt_select.has_key(srv):
2217 return tgt_select[srv]
# Command-line option table consumed by Lustre.Options (fragment --
# the opening of the option list and several entries are missing).
# Entries are (name, help[, type[, default]]) tuples; option help text
# below is runtime output and is reproduced unchanged.
2221 PARAM = Lustre.Options.PARAM
2222 INTPARAM = Lustre.Options.INTPARAM
2224 ('verbose,v', "Print system commands as they are run"),
2225 ('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM),
2226 ('config', "Cluster config name used for LDAP query", PARAM),
2227 ('select', "service=nodeA,service2=nodeB ", PARAM),
2228 ('node', "Load config for <nodename>", PARAM),
2229 ('cleanup,d', "Cleans up config. (Shutdown)"),
2230 ('force,f', "Forced unmounting and/or obd detach during cleanup",
2231 Lustre.Options.FLAG, 0),
2232 ('mds_ost_conn', "Open connections to OSTs on the MDS"),
2233 ('failover',"""Used to shut down without saving state.
2234 This will allow this node to "give up" a service to a
2235 another node for failover purposes. This will not
2236 be a clean shutdown.""",
2237 Lustre.Options.FLAG, 0),
2238 ('gdb', """Prints message after creating gdb module script
2239 and sleeps for 5 seconds."""),
2240 ('noexec,n', """Prints the commands and steps that will be run for a
2241 config without executing them. This can used to check if a
2242 config file is doing what it should be doing"""),
2243 ('nomod', "Skip load/unload module step."),
2244 ('nosetup', "Skip device setup/cleanup step."),
2245 ('reformat', "Reformat all devices (without question)"),
2246 ('dump', "Dump the kernel debug log to file before portals is unloaded",
2248 ('minlevel', "Minimum level of services to configure/cleanup",
2250 ('maxlevel', """Maximum level of services to configure/cleanup
2251 Levels are aproximatly like:
2256 70 - mountpoint, echo_client, osc, mdc, lov""",
2258 ('lustre', """Base directory of lustre sources. This parameter will
2259 cause lconf to load modules from a source tree.""", PARAM),
2260 ('portals', """Portals source directory. If this is a relative path,
2261 then it is assumed to be relative to lustre. """, PARAM),
2262 ('timeout', "Set recovery timeout", PARAM),
2263 ('upcall', "Set both portals and lustre upcall script", PARAM),
2264 ('lustre_upcall', "Set lustre upcall script", PARAM),
2265 ('portals_upcall', "Set portals upcall script", PARAM),
2266 ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
2267 ('ptldebug', "Set the portals debug level", PARAM),
2268 ('subsystem', "Set the portals debug subsystem", PARAM),
2269 ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
2270 ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
2271 # Client recovery options
2272 ('recover', "Recover a device"),
2273 ('group', "The group of devices to configure or cleanup", PARAM),
2274 ('tgt_uuid', "The failed target (required for recovery)", PARAM),
2275 ('client_uuid', "The failed client (required for recovery)", PARAM),
2276 ('conn_uuid', "The failed connection (required for recovery)", PARAM),
# main (fragment; its 'def' line is not visible): parse options, seed
# the PRNG from /dev/urandom, load the config from XML or LDAP,
# validate its version, build the node list and drive doHost().
2280 global lctl, config, toplevel
2282 # in the upcall this is set to SIG_IGN
2283 signal.signal(signal.SIGCHLD, signal.SIG_DFL)
2285 cl = Lustre.Options("lconf", "config.xml", lconf_options)
2287 config, args = cl.parse(sys.argv[1:])
2288 except Lustre.OptionError, e:
2292 setupModulePath(sys.argv[0])
2294 host = socket.gethostname()
2296 # the PRNG is normally seeded with time(), which is not so good for starting
2297 # time-synchronized clusters
2298 input = open('/dev/urandom', 'r')
2300 print 'Unable to open /dev/urandom!'
2302 seed = input.read(32)
2308 init_select(config.select)
# Config source 1: an XML file given as the positional argument.
2311 if not os.access(args[0], os.R_OK):
2312 print 'File not found or readable:', args[0]
2315 dom = xml.dom.minidom.parse(args[0])
2317 panic("%s does not appear to be a config file." % (args[0]))
2318 sys.exit(1) # make sure to die here, even in debug mode.
2319 db = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
2320 if not config.config:
2321 config.config = os.path.basename(args[0])# use full path?
2322 if config.config[-4:] == '.xml':
2323 config.config = config.config[:-4]
# Config source 2: LDAP, which requires an explicit --config name.
2324 elif config.ldapurl:
2325 if not config.config:
2326 panic("--ldapurl requires --config name")
2327 dn = "config=%s,fs=lustre" % (config.config)
2328 db = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
2330 print 'Missing config file or ldap URL.'
2331 print 'see lconf --help for command summary'
2336 ver = db.get_version()
2338 panic("No version found in config data, please recreate.")
2339 if ver != Lustre.CONFIG_VERSION:
2340 panic("Config version", ver, "does not match lconf version",
2341 Lustre.CONFIG_VERSION)
# Node list: explicit --node, else hostname plus 'localhost' fallback.
2345 node_list.append(config.node)
2348 node_list.append(host)
2349 node_list.append('localhost')
2351 debug("configuring for host: ", node_list)
2354 config.debug_path = config.debug_path + '-' + host
2355 config.gdb_script = config.gdb_script + '-' + host
2357 lctl = LCTLInterface('lctl')
2359 if config.lctl_dump:
2360 lctl.use_save_file(config.lctl_dump)
2362 doHost(db, node_list)
# Script entry point (fragment): run main, translating LconfError and
# CommandError into exit status; a deferred cleanup error becomes the
# final exit code.
2364 if __name__ == "__main__":
2367 except Lustre.LconfError, e:
2369 # traceback.print_exc(file=sys.stdout)
2371 except CommandError, e:
2375 if first_cleanup_error:
2376 sys.exit(first_cleanup_error)