# Copyright (C) 2002-2003 Cluster File Systems, Inc.
# Authors: Robert Read <rread@clusterfs.com>
#          Mike Shaver <shaver@clusterfs.com>
#
# This file is part of Lustre, http://www.lustre.org.
#
# Lustre is free software; you can redistribute it and/or
# modify it under the terms of version 2 of the GNU General Public
# License as published by the Free Software Foundation.
#
# Lustre is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Lustre; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
# lconf - lustre configuration tool
#
# lconf is the main driver script for starting and stopping
# lustre filesystem services.
#
# Based in part on the XML obdctl modifications done by Brian Behlendorf
import sys, getopt, types
import string, os, stat, popen2, socket, time, random, fcntl, select
import re, exceptions, signal, traceback
import xml.dom.minidom

if sys.version[0] == '1':
    from FCNTL import F_GETFL, F_SETFL
else:
    from fcntl import F_GETFL, F_SETFL

PYMOD_DIR = "/usr/lib/lustre/python"

def development_mode():
    base = os.path.dirname(sys.argv[0])
    if os.access(base+"/Makefile.am", os.R_OK):
        return 1
    return 0

if not development_mode():
    sys.path.append(PYMOD_DIR)

import Lustre
MAXTCPBUF = 16777216
DEFAULT_TCPBUF = 8388608

# Maximum number of devices to search for.
# (the /dev/loop* nodes need to be created beforehand)
MAX_LOOP_DEVICES = 256
PORTALS_DIR = 'portals'
# Needed to call lconf --record
CONFIG_FILE = ""

# Please keep these in sync with the values in portals/kp30.h
ptldebug_names = {
    "warning" :    (1 << 10),
    "portals" :    (1 << 14),
    "dlmtrace" :   (1 << 16),
    "rpctrace" :   (1 << 20),
    "vfstrace" :   (1 << 21),
    }

subsystem_names = {
    "undefined" :  (0 << 24),
    "ext2obd" :    (9 << 24),
    "portals" :    (10 << 24),
    "socknal" :    (11 << 24),
    "qswnal" :     (12 << 24),
    "pinger" :     (13 << 24),
    "filter" :     (14 << 24),
    "trace" :      (15 << 24),
    "gmnal" :      (19 << 24),
    "ptlrouter" :  (20 << 24),
    "ptlbd" :      (22 << 24),
    }
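# Illustrative example (not from the original source): --ptldebug takes a
# Python expression over the ptldebug_names keys, which sys_set_ptldebug()
# below evaluates and writes to /proc/sys/portals/debug, e.g.
#     lconf --ptldebug 'warning|dlmtrace|rpctrace' config.xml
# evaluates to 0x400 | 0x10000 | 0x100000 = 0x110400.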
first_cleanup_error = 0
def cleanup_error(rc):
    global first_cleanup_error
    if not first_cleanup_error:
        first_cleanup_error = rc
# ============================================================
# debugging and error funcs

def fixme(msg = "this feature"):
    raise Lustre.LconfError, msg + ' not implemented yet.'

def panic(*args):
    msg = string.join(map(str,args))
    if not config.noexec:
        raise Lustre.LconfError(msg)
    else:
        print "! " + msg

def log(*args):
    msg = string.join(map(str,args))
    print msg

def logall(msgs):
    for s in msgs:
        print string.strip(s)

def debug(*args):
    msg = string.join(map(str,args))
    if config.verbose:
        print msg

# ack, python's builtin int() does not support '0x123' syntax.
# eval can do it, although what a hack!
def my_int(s):
    try:
        return eval(s, {}, {})
    except SyntaxError, e:
        raise ValueError("not a number")
    except NameError, e:
        raise ValueError("not a number")
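# Illustrative example (not from the original source): my_int() accepts
# decimal and hex strings alike, mapping anything else to ValueError:
#     my_int('123')   ->  123
#     my_int('0x7b')  ->  123
#     my_int('mds1')  ->  raises ValueError("not a number")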
# ============================================================
# locally defined exceptions
class CommandError (exceptions.Exception):
    def __init__(self, cmd_name, cmd_err, rc=None):
        self.cmd_name = cmd_name
        self.cmd_err = cmd_err
        self.rc = rc

    def dump(self):
        if type(self.cmd_err) == types.StringType:
            if self.rc:
                print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
            else:
                print "! %s: %s" % (self.cmd_name, self.cmd_err)
        elif type(self.cmd_err) == types.ListType:
            if self.rc:
                print "! %s (error %d):" % (self.cmd_name, self.rc)
            else:
                print "! %s:" % (self.cmd_name)
            for s in self.cmd_err:
                print "> %s" %(string.strip(s))
        else:
            print self.cmd_err
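# Sketch of the usual call pattern (assembled from uses elsewhere in this
# file, not new behavior): failed lctl/shell commands raise CommandError,
# and cleanup paths catch and report it rather than aborting:
#     try:
#         lctl.cleanup(name, uuid, config.force)
#     except CommandError, e:
#         e.dump()
#         cleanup_error(e.rc)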
# ============================================================
# handle daemons, like the acceptor
class DaemonHandler:
    """ Manage starting and stopping a daemon. Assumes daemon manages
    its own pid file. """

    def __init__(self, cmd):
        self.command = cmd
        self.path = ""

    def start(self):
        if self.running():
            log(self.command, "already running.")
            return
        self.path = find_prog(self.command)
        if not self.path:
            panic(self.command, "not found.")
        ret, out = runcmd(self.path +' '+ self.command_line())
        if ret:
            raise CommandError(self.path, out, ret)

    def stop(self):
        if self.running():
            pid = self.read_pidfile()
            try:
                log ("killing process", pid)
                os.kill(pid, 15)
                #time.sleep(1) # let daemon die
            except OSError, e:
                log("unable to kill", self.command, e)
            if self.running():
                log("unable to kill", self.command)

    def running(self):
        pid = self.read_pidfile()
        if pid:
            try:
                # probe with signal 0: checks liveness without killing
                os.kill(pid, 0)
                return 1
            except OSError:
                self.clean_pidfile()
        return 0

    def read_pidfile(self):
        try:
            fp = open(self.pidfile(), 'r')
            pid = int(fp.read())
            fp.close()
            return pid
        except IOError:
            return 0

    def clean_pidfile(self):
        """ Remove a stale pidfile """
        log("removing stale pidfile:", self.pidfile())
        try:
            os.unlink(self.pidfile())
        except OSError, e:
            log(self.pidfile(), e)
class AcceptorHandler(DaemonHandler):
    def __init__(self, port, net_type, send_mem, recv_mem, irq_aff):
        DaemonHandler.__init__(self, "acceptor")
        self.port = port
        self.flags = ''
        self.send_mem = send_mem
        self.recv_mem = recv_mem

        if net_type == 'toe':
            self.flags = self.flags + ' -N 4'
        if irq_aff:
            self.flags = self.flags + ' -i'

    def pidfile(self):
        return "/var/run/%s-%d.pid" % (self.command, self.port)

    def command_line(self):
        return string.join(map(str,('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port)))
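# Illustrative example (not from the original source): for a tcp network on
# port 988 with irq affinity enabled and the default buffer sizes, start()
# ends up invoking roughly:
#     acceptor -s 8388608 -r 8388608  -i 988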
acceptors = {}

# start the acceptors
def run_acceptors():
    if config.lctl_dump or config.record:
        return
    for port in acceptors.keys():
        daemon = acceptors[port]
        if not daemon.running():
            daemon.start()

def run_one_acceptor(port):
    if config.lctl_dump or config.record:
        return
    if acceptors.has_key(port):
        daemon = acceptors[port]
        if not daemon.running():
            daemon.start()
    else:
        panic("run_one_acceptor: No acceptor defined for port:", port)

def stop_acceptor(port):
    if acceptors.has_key(port):
        daemon = acceptors[port]
        if daemon.running():
            daemon.stop()
# ============================================================
# handle lctl interface
class LCTLInterface:
    """
    Manage communication with lctl
    """

    def __init__(self, cmd):
        """
        Initialize by finding the lctl binary.
        """
        self.lctl = find_prog(cmd)
        self.save_file = ''
        self.record_device = ''
        self.record_log = ''
        if not self.lctl:
            if config.noexec:
                debug('! lctl not found')
                self.lctl = 'lctl'
            else:
                raise CommandError('lctl', "unable to find lctl binary.")
    def use_save_file(self, file):
        self.save_file = file

    def record(self, dev_name, logname):
        log("Recording log", logname, "on", dev_name)
        self.record_device = dev_name
        self.record_log = logname

    def end_record(self):
        log("End recording log", self.record_log, "on", self.record_device)
        self.record_device = None
        self.record_log = None

    def set_nonblock(self, fd):
        fl = fcntl.fcntl(fd, F_GETFL)
        fcntl.fcntl(fd, F_SETFL, fl | os.O_NDELAY)
    def run(self, cmds):
        """
        run lctl
        the cmds are written to stdin of lctl
        lctl doesn't return errors when run in script mode, so
        stderr is checked for any errors
        should modify command line to accept multiple commands, or
        create complex command line options
        """
        cmd_line = self.lctl
        if self.save_file:
            cmds = '\n  dump ' + self.save_file + '\n' + cmds
        elif self.record_device:
            cmds = """
    device $%s
    record %s
    %s""" % (self.record_device, self.record_log, cmds)

        debug("+", cmd_line, cmds)
        if config.noexec: return (0, [])

        child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
        child.tochild.write(cmds + "\n")
        child.tochild.close()

        # From "Python Cookbook" from O'Reilly
        outfile = child.fromchild
        outfd = outfile.fileno()
        self.set_nonblock(outfd)
        errfile = child.childerr
        errfd = errfile.fileno()
        self.set_nonblock(errfd)

        outdata = errdata = ''
        outeof = erreof = 0
        while 1:
            ready = select.select([outfd,errfd],[],[]) # Wait for input
            if outfd in ready[0]:
                outchunk = outfile.read()
                if outchunk == '': outeof = 1
                outdata = outdata + outchunk
            if errfd in ready[0]:
                errchunk = errfile.read()
                if errchunk == '': erreof = 1
                errdata = errdata + errchunk
            if outeof and erreof: break
        # end of "borrowed" code

        ret = child.wait()
        if os.WIFEXITED(ret):
            rc = os.WEXITSTATUS(ret)
        else:
            rc = 0
        if rc or len(errdata):
            raise CommandError(self.lctl, errdata, rc)
        return (rc, outdata)
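    # Illustrative example (not from the original source): run() is fed small
    # lctl scripts, e.g. the one network() below builds:
    #     network tcp
    #     mynid nodeA
    #     quit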
    def runcmd(self, *args):
        """
        run lctl using the command line
        """
        cmd = string.join(map(str,args))
        debug("+", self.lctl, cmd)
        rc, out = run(self.lctl, cmd)
        if rc:
            raise CommandError(self.lctl, out, rc)
        return (rc, out)
    def network(self, net, nid):
        """ set mynid """
        cmds = """
  network %s
  mynid %s
  quit """ % (net, nid)
        self.run(cmds)
    # create a new connection
    def add_uuid(self, net_type, uuid, nid):
        cmds = "\n  add_uuid %s %s %s" %(uuid, nid, net_type)
        self.run(cmds)
    def add_autoconn(self, net_type, send_mem, recv_mem, nid, hostaddr,
                     port, flags):
        if net_type in ('tcp', 'toe') and not config.lctl_dump:
            cmds = """
  network %s
  send_mem %d
  recv_mem %d
  add_autoconn %s %s %d %s
  quit""" % (net_type,
             send_mem,
             recv_mem,
             nid, hostaddr, port, flags )
            self.run(cmds)
    def connect(self, srv):
        self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
        if srv.net_type in ('tcp', 'toe') and not config.lctl_dump:
            flags = 's'
            if srv.irq_affinity:
                flags = flags + 'i'
            self.add_autoconn(srv.net_type, srv.send_mem, srv.recv_mem,
                              srv.nid, srv.hostaddr, srv.port, flags)
    # recover a device
    def recover(self, dev_name, new_conn):
        cmds = """
    device $%s
    recover %s""" %(dev_name, new_conn)
        self.run(cmds)
    # add a route to a range
    def add_route(self, net, gw, lo, hi):
        cmds = """
  network %s
  add_route %s %s %s
  quit  """ % (net, gw, lo, hi)
        try:
            self.run(cmds)
        except CommandError, e:
            log ("ignore: ")
            e.dump()

    def del_route(self, net, gw, lo, hi):
        cmds = """
  ignore_errors
  network %s
  del_route %s %s %s
  quit """ % (net, gw, lo, hi)
        self.run(cmds)
    # add a route to a host
    def add_route_host(self, net, uuid, gw, tgt):
        self.add_uuid(net, uuid, tgt)
        cmds = """
  network %s
  add_route %s %s
  quit """ % (net, gw, tgt)
        try:
            self.run(cmds)
        except CommandError, e:
            log ("ignore: ")
            e.dump()

    # delete a route to a host
    def del_route_host(self, net, uuid, gw, tgt):
        self.del_uuid(uuid)
        cmds = """
  ignore_errors
  network %s
  del_route %s %s
  quit """ % (net, gw, tgt)
        self.run(cmds)
    def del_autoconn(self, net_type, nid, hostaddr):
        if net_type in ('tcp', 'toe') and not config.lctl_dump:
            cmds = """
  ignore_errors
  network %s
  del_autoconn %s %s s
  quit""" % (net_type, nid, hostaddr)
            self.run(cmds)
    # disconnect one connection
    def disconnect(self, srv):
        self.del_uuid(srv.nid_uuid)
        if srv.net_type in ('tcp', 'toe') and not config.lctl_dump:
            self.del_autoconn(srv.net_type, srv.nid, srv.hostaddr)
    def del_uuid(self, uuid):
        cmds = """
  ignore_errors
  del_uuid %s
  quit""" % (uuid,)
        self.run(cmds)

    def disconnectAll(self, net):
        cmds = """
  ignore_errors
  network %s
  disconn
  quit""" % (net)
        self.run(cmds)
    def attach(self, type, name, uuid):
        cmds = """
  attach %s %s %s
  quit""" % (type, name, uuid)
        self.run(cmds)

    def setup(self, name, setup = ""):
        cmds = """
  cfg_device %s
  setup %s
  quit""" % (name, setup)
        self.run(cmds)
    # create a new device with lctl
    def newdev(self, type, name, uuid, setup = ""):
        self.attach(type, name, uuid)
        try:
            self.setup(name, setup)
        except CommandError, e:
            self.cleanup(name, uuid, 0)
            raise e
    # cleanup a device
    def cleanup(self, name, uuid, force, failover = 0):
        if failover: force = 1
        cmds = """
  ignore_errors
  cfg_device $%s
  cleanup %s %s
  detach
  quit""" % (name, ('', 'force')[force],
             ('', 'failover')[failover])
        self.run(cmds)
    # create an lov
    def lov_setup(self, name, uuid, desc_uuid, mdsuuid, stripe_cnt,
                  stripe_sz, stripe_off,
                  pattern, devlist):
        cmds = """
  attach lov %s %s
  lov_setup %s %d %d %d %s %s
  quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off,
             pattern, devlist)
        self.run(cmds)

    # configure an lov on an mds
    def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off,
                      pattern, devlist):
        cmds = """
  cfg_device $%s
  lov_setconfig %s %d %d %d %s %s
  quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
        self.run(cmds)
    # dump the kernel debug log
    def dump(self, dump_file):
        cmds = """
  debug_kernel %s 1
  quit""" % (dump_file)
        self.run(cmds)
    # get list of devices
    def device_list(self):
        devices = '/proc/fs/lustre/devices'
        ret = []
        if os.access(devices, os.R_OK):
            try:
                fp = open(devices, 'r')
                ret = fp.readlines()
                fp.close()
            except IOError, e:
                log(e)
        return ret

    # get lustre version
    def lustre_version(self):
        rc, out = self.runcmd('version')
        return out
    # set lustre mount options for a profile
    def mount_option(self, profile, osc, mdc):
        cmds = """
  mount_option %s %s %s
  quit""" % (profile, osc, mdc)
        self.run(cmds)

    # delete mount options
    def del_mount_option(self, profile):
        cmds = """
  del_mount_option %s
  quit""" % (profile,)
        self.run(cmds)

    def set_timeout(self, timeout):
        cmds = """
  set_timeout %s
  quit""" % (timeout,)
        self.run(cmds)

    # set the lustre upcall script
    def set_lustre_upcall(self, upcall):
        cmds = """
  set_lustre_upcall %s
  quit""" % (upcall,)
        self.run(cmds)
# ============================================================
# Various system-level functions
# (ideally moved to their own module)

# Run a command and return the output and status.
# stderr is merged into stdout here; could use popen3 to
# keep it separate if necessary
def runcmd(cmd):
    debug ("+", cmd)
    if config.noexec: return (0, [])
    f = os.popen(cmd + ' 2>&1')
    out = f.readlines()
    ret = f.close()
    if ret:
        ret = ret >> 8
    else:
        ret = 0
    return (ret, out)

def run(*args):
    cmd = string.join(map(str,args))
    return runcmd(cmd)
# Run a command in the background.
def run_daemon(*args):
    cmd = string.join(map(str,args))
    debug ("+", cmd)
    if config.noexec: return 0
    f = os.popen(cmd + ' 2>&1')
    ret = f.close()
    if ret:
        ret = ret >> 8
    else:
        ret = 0
    return ret
# Determine full path to use for an external command
# searches dirname(argv[0]) first, then PATH
def find_prog(cmd):
    syspath = string.split(os.environ['PATH'], ':')
    cmdpath = os.path.dirname(sys.argv[0])
    syspath.insert(0, cmdpath)
    if config.portals:
        syspath.insert(0, os.path.join(config.portals, 'utils/'))
    for d in syspath:
        prog = os.path.join(d,cmd)
        if os.access(prog, os.X_OK):
            return prog
    return ''
# Recursively look for file starting at base dir
def do_find_file(base, mod):
    fullname = os.path.join(base, mod)
    if os.access(fullname, os.R_OK):
        return fullname
    for d in os.listdir(base):
        dir = os.path.join(base,d)
        if os.path.isdir(dir):
            module = do_find_file(dir, mod)
            if module:
                return module
def find_module(src_dir, dev_dir, modname):
    mod = '%s.o' % (modname)
    module = src_dir +'/'+ dev_dir +'/'+ mod
    if os.access(module, os.R_OK):
        return module
    return None

# is the path a block device?
def is_block(path):
    s = ()
    try:
        s = os.stat(path)
    except OSError:
        return 0
    return stat.S_ISBLK(s[stat.ST_MODE])
# build fs according to type
def mkfs(dev, devsize, fstype, jsize, mkfsoptions, isblock=1):
    block_cnt = ''
    jopt = ''
    if devsize:
        if devsize < 8000:
            panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
                  (dev, devsize))
        # devsize is in 1k, and fs block count is in 4k
        block_cnt = devsize/4

    if fstype in ('ext3', 'extN'):
        # ext3 journal size is in megabytes
        if jsize: jopt = "-J size=%d" %(jsize,)
        mkfs = 'mkfs.ext2 -j -b 4096 '
        if not isblock or config.force:
            mkfs = mkfs + ' -F '
    elif fstype == 'reiserfs':
        # reiserfs journal size is in blocks
        if jsize: jopt = "--journal_size %d" %(jsize,)
        mkfs = 'mkreiserfs -ff'
    else:
        panic('unsupported fs type: ', fstype)

    if config.mkfsoptions != None:
        mkfs = mkfs + ' ' + config.mkfsoptions
    if mkfsoptions != None:
        mkfs = mkfs + ' ' + mkfsoptions
    (ret, out) = run (mkfs, jopt, dev, block_cnt)
    if ret:
        panic("Unable to build fs:", dev, string.join(out))
    # enable hash tree indexing on the fs
    if fstype in ('ext3', 'extN'):
        htree = 'echo "feature FEATURE_C5" | debugfs -w'
        (ret, out) = run (htree, dev)
        if ret:
            panic("Unable to enable htree:", dev)
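# Illustrative example (not from the original source): for a 1GB ext3 target
# (devsize is given in 1k blocks, so 1048576) with a 64MB journal, mkfs()
# runs roughly:
#     mkfs.ext2 -j -b 4096  -J size=64 /dev/sda1 262144
# followed by the debugfs htree tweak above. The device name is hypothetical.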
# some systems use /dev/loopN, some /dev/loop/N
def loop_base():
    loop = '/dev/loop'
    if not os.access(loop + str(0), os.R_OK):
        loop = loop + '/'
        if not os.access(loop + str(0), os.R_OK):
            panic ("can't access loop devices")
    return loop
# find loop device assigned to the file
def find_loop(file):
    loop = loop_base()
    for n in xrange(0, MAX_LOOP_DEVICES):
        dev = loop + str(n)
        if os.access(dev, os.R_OK):
            (stat, out) = run('losetup', dev)
            if out and stat == 0:
                m = re.search(r'\((.*)\)', out[0])
                if m and file == m.group(1):
                    return dev
        else:
            break
    return ''
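# Illustrative example (not from the original source): with 2.4-era
# util-linux, "losetup /dev/loop0" prints something like
#     /dev/loop0: [0302]:12 (/tmp/ost1-data)
# and the regexp above extracts "/tmp/ost1-data" for comparison with file.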
# create file if necessary and assign the first free loop device
def init_loop(file, size, fstype, journal_size, mkfsoptions, reformat):
    dev = find_loop(file)
    if dev:
        print 'WARNING file:', file, 'already mapped to', dev
        return dev
    if reformat or not os.access(file, os.R_OK | os.W_OK):
        if size < 8000:
            panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (file,size))
        (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size,
                                                                        file))
        if ret:
            panic("Unable to create backing store:", file)
        mkfs(file, size, fstype, journal_size, mkfsoptions, isblock=0)

    loop = loop_base()
    # find next free loop
    for n in xrange(0, MAX_LOOP_DEVICES):
        dev = loop + str(n)
        if os.access(dev, os.R_OK):
            (stat, out) = run('losetup', dev)
            if stat:
                run('losetup', dev, file)
                return dev
        else:
            print "out of loop devices"
            return ''
    print "out of loop devices"
    return ''
# undo loop assignment
def clean_loop(file):
    dev = find_loop(file)
    if dev:
        ret, out = run('losetup -d', dev)
        if ret:
            log('unable to clean loop device:', dev, 'for file:', file)
            logall(out)
# determine if dev is formatted as a <fstype> filesystem
def need_format(fstype, dev):
    # FIXME don't know how to implement this
    return 0
# initialize a block device if needed
def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
              mkfsoptions):
    if config.noexec: return dev
    if not is_block(dev):
        dev = init_loop(dev, size, fstype, journal_size, mkfsoptions, reformat)
    elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
        mkfs(dev, size, fstype, journal_size, mkfsoptions, isblock=0)
    return dev
#    else:
#        panic("device:", dev,
#              "not prepared, and autoformat is not set.\n",
#              "Rerun with --reformat option to format ALL filesystems")

def if2addr(iface):
    """lookup IP address for an interface"""
    rc, out = run("/sbin/ifconfig", iface)
    if rc or not out:
        return None
    addr = string.split(out[1])[1]
    ip = string.split(addr, ':')[1]
    return ip
def sys_get_local_nid(net_type, wildcard, cluster_id):
    """Return the local nid."""
    local = ""
    if os.access('/proc/elan/device0/position', os.R_OK):
        local = sys_get_local_address('elan', '*', cluster_id)
    else:
        local = sys_get_local_address(net_type, wildcard, cluster_id)
    return local
def sys_get_local_address(net_type, wildcard, cluster_id):
    """Return the local address for the network type."""
    local = ""
    if net_type in ('tcp', 'toe'):
        if ':' in wildcard:
            iface, star = string.split(wildcard, ':')
            local = if2addr(iface)
            if not local:
                panic ("unable to determine ip for:", wildcard)
        else:
            host = socket.gethostname()
            local = socket.gethostbyname(host)
    elif net_type == 'elan':
        # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
        try:
            fp = open('/proc/elan/device0/position', 'r')
            lines = fp.readlines()
            fp.close()
            for l in lines:
                a = string.split(l)
                if a[0] == 'NodeId':
                    elan_id = a[1]
                    break
            try:
                nid = my_int(cluster_id) + my_int(elan_id)
                local = "%d" % (nid)
            except ValueError, e:
                local = elan_id
        except IOError, e:
            log(e)
    elif net_type == 'gm':
        fixme("automatic local address for GM")
    elif net_type == 'scimac':
        scinode = "/opt/scali/sbin/scinode"
        if os.path.exists(scinode):
            (rc, local) = run(scinode)
        else:
            panic (scinode, " not found on node with scimac networking")
        if rc:
            panic (scinode, " failed")
        local = string.rstrip(local[0])
    return local
def mod_loaded(modname):
    """Check if a module is already loaded. Look in /proc/modules for it."""
    try:
        fp = open('/proc/modules')
        lines = fp.readlines()
        fp.close()
        # please forgive my tired fingers for this one
        ret = filter(lambda word, mod=modname: word == mod,
                     map(lambda line: string.split(line)[0], lines))
        return ret
    except Exception, e:
        return 0
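# Illustrative example (not from the original source): a /proc/modules line
# looks like "ksocknal 84000 0 (unused)"; only the first token (the module
# name) is compared, so mod_loaded('ksocknal') returns a non-empty list when
# the socknal module is resident.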
# XXX: instead of device_list, ask for $name and see what we get
def is_prepared(name):
    """Return true if a device exists for the name"""
    if config.lctl_dump:
        return 0
    if (config.noexec or config.record) and config.cleanup:
        return 1
    try:
        # expect this format:
        # 1 UP ldlm ldlm ldlm_UUID 2
        out = lctl.device_list()
        for s in out:
            if name == string.split(s)[3]:
                return 1
    except CommandError, e:
        e.dump()
    return 0
def is_network_prepared():
    """If any device exists, then assume that all networking
       has been configured"""
    out = lctl.device_list()
    return len(out) > 0
def fs_is_mounted(path):
    """Return true if path is a mounted lustre filesystem"""
    try:
        fp = open('/proc/mounts')
        lines = fp.readlines()
        fp.close()
        for l in lines:
            a = string.split(l)
            if a[1] == path and a[2] == 'lustre_lite':
                return 1
    except IOError, e:
        log(e)
    return 0


class kmod:
    """Manage kernel modules"""
    def __init__(self, lustre_dir, portals_dir):
        self.lustre_dir = lustre_dir
        self.portals_dir = portals_dir
        self.kmodule_list = []

    def add_portals_module(self, dev_dir, modname):
        """Append a module to list of modules to load."""
        self.kmodule_list.append((self.portals_dir, dev_dir, modname))

    def add_lustre_module(self, dev_dir, modname):
        """Append a module to list of modules to load."""
        self.kmodule_list.append((self.lustre_dir, dev_dir, modname))

    def load_module(self):
        """Load all the modules in the list in the order they appear."""
        for src_dir, dev_dir, mod in self.kmodule_list:
            if mod_loaded(mod) and not config.noexec:
                continue
            log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
            if src_dir:
                module = find_module(src_dir, dev_dir, mod)
                if not module:
                    panic('module not found:', mod)
                (rc, out) = run('/sbin/insmod', module)
                if rc:
                    raise CommandError('insmod', out, rc)
            else:
                (rc, out) = run('/sbin/modprobe', mod)
                if rc:
                    raise CommandError('modprobe', out, rc)

    def cleanup_module(self):
        """Unload the modules in the list in reverse order."""
        rev = self.kmodule_list
        rev.reverse()
        for src_dir, dev_dir, mod in rev:
            if not mod_loaded(mod) and not config.noexec:
                continue
            # debug hack: dump the kernel log before portals goes away
            if mod == 'portals' and config.dump:
                lctl.dump(config.dump)
            log('unloading module:', mod)
            (rc, out) = run('/sbin/rmmod', mod)
            if rc:
                log('! unable to unload module:', mod)
                logall(out)
# ============================================================
# Classes to prepare and cleanup the various objects

class Module:
    """ Base class for the rest of the modules. The default cleanup method is
    defined here, as well as some utility funcs.
    """
    def __init__(self, module_name, db):
        self.db = db
        self.module_name = module_name
        self.name = self.db.getName()
        self.uuid = self.db.getUUID()
        self.kmod = kmod(config.lustre, config.portals)

    def info(self, *args):
        msg = string.join(map(str,args))
        print self.module_name + ":", self.name, self.uuid, msg

    def cleanup(self):
        """ default cleanup, used for most modules """
        self.info()
        try:
            lctl.cleanup(self.name, self.uuid, config.force)
        except CommandError, e:
            log(self.module_name, "cleanup failed: ", self.name)
            e.dump()
            cleanup_error(e.rc)

    def add_portals_module(self, dev_dir, modname):
        """Append a module to list of modules to load."""
        self.kmod.add_portals_module(dev_dir, modname)

    def add_lustre_module(self, dev_dir, modname):
        """Append a module to list of modules to load."""
        self.kmod.add_lustre_module(dev_dir, modname)

    def load_module(self):
        """Load all the modules in the list in the order they appear."""
        self.kmod.load_module()

    def cleanup_module(self):
        """Unload the modules in the list in reverse order."""
        if self.safe_to_clean():
            self.kmod.cleanup_module()

    def safe_to_clean(self):
        return 1

    def safe_to_clean_modules(self):
        return self.safe_to_clean()
class Network(Module):
    def __init__(self,db):
        Module.__init__(self, 'NETWORK', db)
        self.net_type = self.db.get_val('nettype')
        self.nid = self.db.get_val('nid', '*')
        self.cluster_id = self.db.get_val('clusterid', "0")
        self.port = self.db.get_val_int('port', 0)
        self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
        self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
        self.irq_affinity = self.db.get_val_int('irqaffinity', 0)

        if '*' in self.nid:
            self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
            if not self.nid:
                panic("unable to set nid for", self.net_type, self.nid, self.cluster_id)
            self.generic_nid = 1
            debug("nid:", self.nid)
        else:
            self.generic_nid = 0

        self.nid_uuid = self.nid_to_uuid(self.nid)

        self.hostaddr = self.db.get_val('hostaddr', self.nid)
        if '*' in self.hostaddr:
            self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id)
            if not self.hostaddr:
                panic("unable to set hostaddr for", self.net_type, self.hostaddr, self.cluster_id)
            debug("hostaddr:", self.hostaddr)

        self.add_portals_module("libcfs", 'portals')
        if node_needs_router():
            self.add_portals_module("router", 'kptlrouter')
        if self.net_type == 'tcp':
            self.add_portals_module("knals/socknal", 'ksocknal')
        if self.net_type == 'toe':
            self.add_portals_module("knals/toenal", 'ktoenal')
        if self.net_type == 'elan':
            self.add_portals_module("knals/qswnal", 'kqswnal')
        if self.net_type == 'gm':
            self.add_portals_module("knals/gmnal", 'kgmnal')
        if self.net_type == 'scimac':
            self.add_portals_module("knals/scimacnal", 'kscimacnal')
    def nid_to_uuid(self, nid):
        return "NID_%s_UUID" %(nid,)

    def prepare(self):
        if is_network_prepared():
            return
        self.info(self.net_type, self.nid, self.port)
        if not (config.record and self.generic_nid):
            lctl.network(self.net_type, self.nid)
        if self.net_type == 'tcp':
            sys_tweak_socknal()
        if self.net_type == 'elan':
            sys_optimize_elan()
        if self.port and node_is_router():
            run_one_acceptor(self.port)
            self.connect_peer_gateways()
    def connect_peer_gateways(self):
        for router in self.db.lookup_class('node'):
            if router.get_val_int('router', 0):
                for netuuid in router.get_networks():
                    net = self.db.lookup(netuuid)
                    gw = Network(net)
                    if (gw.cluster_id == self.cluster_id and
                        gw.net_type == self.net_type):
                        if gw.nid != self.nid:
                            lctl.connect(gw)
    def disconnect_peer_gateways(self):
        for router in self.db.lookup_class('node'):
            if router.get_val_int('router', 0):
                for netuuid in router.get_networks():
                    net = self.db.lookup(netuuid)
                    gw = Network(net)
                    if (gw.cluster_id == self.cluster_id and
                        gw.net_type == self.net_type):
                        if gw.nid != self.nid:
                            try:
                                lctl.disconnect(gw)
                            except CommandError, e:
                                print "disconnect failed: ", self.name
                                e.dump()
                                cleanup_error(e.rc)

    def safe_to_clean(self):
        return not is_network_prepared()

    def cleanup(self):
        self.info(self.net_type, self.nid, self.port)
        if self.port:
            stop_acceptor(self.port)
        if node_is_router():
            self.disconnect_peer_gateways()
class RouteTable(Module):
    def __init__(self,db):
        Module.__init__(self, 'ROUTES', db)

    def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
                         lo, hi):
        # only setup connections for tcp NALs
        srvdb = None
        if not net_type in ('tcp', 'toe'):
            return None

        # connect to target if route is to single node and this node is the gw
        if lo == hi and local_interface(net_type, gw_cluster_id, gw):
            if not local_cluster(net_type, tgt_cluster_id):
                panic("target", lo, "not on the local cluster")
            srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
        # connect to gateway if this node is not the gw
        elif (local_cluster(net_type, gw_cluster_id)
              and not local_interface(net_type, gw_cluster_id, gw)):
            srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
        else:
            return None

        if not srvdb:
            panic("no server for nid", lo)
            return None

        return Network(srvdb)
    def prepare(self):
        if is_network_prepared():
            return
        self.info()
        for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
            lctl.add_route(net_type, gw, lo, hi)
            srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
            if srv:
                lctl.connect(srv)

    def safe_to_clean(self):
        return not is_network_prepared()

    def cleanup(self):
        if is_network_prepared():
            # the network is still being used, don't clean it up
            return
        for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
            srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
            if srv:
                try:
                    lctl.disconnect(srv)
                except CommandError, e:
                    print "disconnect failed: ", self.name
                    e.dump()
                    cleanup_error(e.rc)
            try:
                lctl.del_route(net_type, gw, lo, hi)
            except CommandError, e:
                print "del_route failed: ", self.name
                e.dump()
                cleanup_error(e.rc)
class Management(Module):
    def __init__(self, db):
        Module.__init__(self, 'MGMT', db)
        self.add_lustre_module('lvfs', 'lvfs')
        self.add_lustre_module('obdclass', 'obdclass')
        self.add_lustre_module('ptlrpc', 'ptlrpc')
        self.add_lustre_module('mgmt', 'mgmt_svc')

    def prepare(self):
        if is_prepared(self.name):
            return
        self.info()
        lctl.newdev("mgmt", self.name, self.uuid)

    def safe_to_clean(self):
        return 1

    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
# This is only needed to load the modules; the LDLM device
# is now created automatically.
class LDLM(Module):
    def __init__(self,db):
        Module.__init__(self, 'LDLM', db)
        self.add_lustre_module('lvfs', 'lvfs')
        self.add_lustre_module('obdclass', 'obdclass')
        self.add_lustre_module('ptlrpc', 'ptlrpc')

    def prepare(self):
        return

    def cleanup(self):
        return
class LOV(Module):
    def __init__(self, db, uuid, fs_name, name_override = None):
        Module.__init__(self, 'LOV', db)
        if name_override != None:
            self.name = "lov_%s" % name_override
        self.add_lustre_module('lov', 'lov')
        self.mds_uuid = self.db.get_first_ref('mds')
        mds = self.db.lookup(self.mds_uuid)
        self.mds_name = mds.getName()
        self.stripe_sz = self.db.get_val_int('stripesize', 65536)
        self.stripe_off = self.db.get_val_int('stripeoffset', 0)
        self.pattern = self.db.get_val_int('stripepattern', 0)
        self.devlist = self.db.get_refs('obd')
        self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
        self.osclist = []
        self.desc_uuid = self.uuid
        self.uuid = generate_client_uuid(self.name)
        self.fs_name = fs_name
        for obd_uuid in self.devlist:
            obd = self.db.lookup(obd_uuid)
            osc = get_osc(obd, self.uuid, fs_name)
            if osc:
                self.osclist.append(osc)
            else:
                panic('osc not found:', obd_uuid)
    def prepare(self):
        if is_prepared(self.name):
            return
        for osc in self.osclist:
            try:
                # Only ignore connect failures with --force, which
                # isn't implemented here yet.
                osc.prepare(ignore_connect_failure=0)
            except CommandError, e:
                print "Error preparing OSC %s\n" % osc.uuid
                raise e
        self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
                  self.stripe_off, self.pattern, self.devlist, self.mds_name)
        lctl.lov_setup(self.name, self.uuid,
                       self.desc_uuid, self.mds_name, self.stripe_cnt,
                       self.stripe_sz, self.stripe_off, self.pattern,
                       string.join(self.devlist))

    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
        for osc in self.osclist:
            osc.cleanup()

    def load_module(self):
        for osc in self.osclist:
            osc.load_module()
            break
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        for osc in self.osclist:
            osc.cleanup_module()
            break
class MDSDEV(Module):
    def __init__(self,db):
        Module.__init__(self, 'MDSDEV', db)
        self.devpath = self.db.get_val('devpath','')
        self.size = self.db.get_val_int('devsize', 0)
        self.journal_size = self.db.get_val_int('journalsize', 0)
        self.fstype = self.db.get_val('fstype', '')
        self.nspath = self.db.get_val('nspath', '')
        self.mkfsoptions = self.db.get_val('mkfsoptions', '')
        # overwrite the original MDSDEV name and uuid with the MDS name and uuid
        target_uuid = self.db.get_first_ref('target')
        mds = self.db.lookup(target_uuid)
        self.name = mds.getName()
        self.filesystem_uuids = mds.get_refs('filesystem')
        # FIXME: if fstype not set, then determine based on kernel version
        self.format = self.db.get_val('autoformat', "no")
        if mds.get_val('failover', 0):
            self.failover_mds = 'f'
        else:
            self.failover_mds = 'n'
        active_uuid = get_active_target(mds)
        if not active_uuid:
            panic("No target device found:", target_uuid)
        if active_uuid == self.uuid:
            self.active = 1
        else:
            self.active = 0
        if self.active and config.group and config.group != mds.get_val('group'):
            self.active = 0

        self.target_dev_uuid = self.uuid
        self.uuid = target_uuid
        # modules
        self.add_lustre_module('mdc', 'mdc')
        self.add_lustre_module('osc', 'osc')
        self.add_lustre_module('lov', 'lov')
        self.add_lustre_module('mds', 'mds')
        if self.fstype:
            self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
    def load_module(self):
        if self.active:
            Module.load_module(self)

    def prepare(self):
        if is_prepared(self.name):
            return
        if not self.active:
            debug(self.uuid, "not active")
            return
        if config.reformat:
            # run write_conf automatically, if --reformat used
            self.write_conf()
        self.info(self.devpath, self.fstype, self.size, self.format)
        run_acceptors()
        # never reformat here
        blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
                           self.format, self.journal_size, self.mkfsoptions)
        if not is_prepared('MDT'):
            lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
        try:
            lctl.newdev("mds", self.name, self.uuid,
                        setup ="%s %s %s" %(blkdev, self.fstype, self.name))
        except CommandError, e:
            if e.rc == 2:
                panic("MDS is missing the config log. Need to run " +
                      "lconf --write_conf.")
            else:
                raise e
    def write_conf(self):
        if is_prepared(self.name):
            return
        self.info(self.devpath, self.fstype, self.format)
        blkdev = block_dev(self.devpath, self.size, self.fstype,
                           config.reformat, self.format, self.journal_size,
                           self.mkfsoptions)
        lctl.newdev("mds", self.name, self.uuid,
                    setup ="%s %s" %(blkdev, self.fstype))

        # record logs for the MDS lov
        for uuid in self.filesystem_uuids:
            log("recording clients for filesystem:", uuid)
            fs = self.db.lookup(uuid)
            obd_uuid = fs.get_first_ref('obd')
            client_uuid = generate_client_uuid(self.name)
            client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
                          self.name)
            config.record = 1
            lctl.record(self.name, self.name)
            client.prepare()
            lctl.mount_option(self.name, client.get_name(), "")
            lctl.end_record()

            config.cleanup = 1
            lctl.record(self.name, self.name + '-clean')
            client.cleanup()
            lctl.del_mount_option(self.name)
            lctl.end_record()
            config.cleanup = 0
            config.record = 0
        # record logs for each client
        if config.ldapurl:
            config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
        else:
            config_options = CONFIG_FILE
        for node_db in self.db.lookup_class('node'):
            client_name = node_db.getName()
            for prof_uuid in node_db.get_refs('profile'):
                prof_db = node_db.lookup(prof_uuid)
                # refactor this into a function to test "clientness"
                # of a node.
                for ref_class, ref_uuid in prof_db.get_all_refs():
                    if ref_class in ('mountpoint','echoclient'):
                        debug("recording", client_name)
                        old_noexec = config.noexec
                        config.noexec = 0
                        noexec_opt = ('', '-n')
                        ret, out = run (sys.argv[0],
                                        noexec_opt[old_noexec == 1],
                                        " -v --record --nomod",
                                        "--record_log", client_name,
                                        "--record_device", self.name,
                                        "--node", client_name,
                                        config_options)
                        if config.verbose:
                            for s in out: log("record> ", string.strip(s))
                        ret, out = run (sys.argv[0],
                                        noexec_opt[old_noexec == 1],
                                        "--cleanup -v --record --nomod",
                                        "--record_log", client_name + "-clean",
                                        "--record_device", self.name,
                                        "--node", client_name,
                                        config_options)
                        if config.verbose:
                            for s in out: log("record> ", string.strip(s))
                        config.noexec = old_noexec
        try:
            lctl.cleanup(self.name, self.uuid, 0, 0)
        except CommandError, e:
            log(self.module_name, "cleanup failed: ", self.name)
            e.dump()
            cleanup_error(e.rc)
        Module.cleanup(self)
        clean_loop(self.devpath)
    def msd_remaining(self):
        out = lctl.device_list()
        for s in out:
            if string.split(s)[2] in ('mds',):
                return 1

    def safe_to_clean(self):
        return self.active

    def safe_to_clean_modules(self):
        return not self.msd_remaining()

    def cleanup(self):
        if not self.active:
            debug(self.uuid, "not active")
            return
        self.info()
        if is_prepared(self.name):
            try:
                lctl.cleanup(self.name, self.uuid, config.force,
                             config.failover)
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
                e.dump()
                cleanup_error(e.rc)
            Module.cleanup(self)
        if not self.msd_remaining() and is_prepared('MDT'):
            try:
                lctl.cleanup("MDT", "MDT_UUID", config.force,
                             config.failover)
            except CommandError, e:
                print "cleanup failed: ", self.name
                e.dump()
                cleanup_error(e.rc)
        clean_loop(self.devpath)
class OSD(Module):
    def __init__(self, db):
        Module.__init__(self, 'OSD', db)
        self.osdtype = self.db.get_val('osdtype')
        self.devpath = self.db.get_val('devpath', '')
        self.size = self.db.get_val_int('devsize', 0)
        self.journal_size = self.db.get_val_int('journalsize', 0)
        self.mkfsoptions = self.db.get_val('mkfsoptions', '')
        self.fstype = self.db.get_val('fstype', '')
        self.nspath = self.db.get_val('nspath', '')
        target_uuid = self.db.get_first_ref('target')
        ost = self.db.lookup(target_uuid)
        self.name = ost.getName()
        self.format = self.db.get_val('autoformat', 'yes')
        if ost.get_val('failover', 0):
            self.failover_ost = 'f'
        else:
            self.failover_ost = 'n'

        active_uuid = get_active_target(ost)
        if not active_uuid:
            panic("No target device found:", target_uuid)
        if active_uuid == self.uuid:
            self.active = 1
        else:
            self.active = 0
        if self.active and config.group and config.group != ost.get_val('group'):
            self.active = 0

        self.target_dev_uuid = self.uuid
        self.uuid = target_uuid
        # modules
        self.add_lustre_module('ost', 'ost')
        # FIXME: should we default to ext3 here?
        if self.fstype:
            self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
        self.add_lustre_module(self.osdtype, self.osdtype)
    def load_module(self):
        if self.active:
            Module.load_module(self)

    # need to check /proc/mounts and /etc/mtab before
    # formatting anything.
    # FIXME: check if device is already formatted.
    def prepare(self):
        if is_prepared(self.name):
            return
        if not self.active:
            debug(self.uuid, "not active")
            return
        self.info(self.osdtype, self.devpath, self.size, self.fstype,
                  self.format, self.journal_size)
        run_acceptors()
        if self.osdtype == 'obdecho':
            blkdev = ''
        else:
            blkdev = block_dev(self.devpath, self.size, self.fstype,
                               config.reformat, self.format, self.journal_size,
                               self.mkfsoptions)
        lctl.newdev(self.osdtype, self.name, self.uuid,
                    setup ="%s %s %s" %(blkdev, self.fstype,
                                        self.failover_ost))
        if not is_prepared('OSS'):
            lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
    def osd_remaining(self):
        out = lctl.device_list()
        for s in out:
            if string.split(s)[2] in ('obdfilter', 'obdecho'):
                return 1

    def safe_to_clean(self):
        return self.active

    def safe_to_clean_modules(self):
        return not self.osd_remaining()

    def cleanup(self):
        if not self.active:
            debug(self.uuid, "not active")
            return
        if is_prepared(self.name):
            self.info()
            try:
                lctl.cleanup(self.name, self.uuid, config.force,
                             config.failover)
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
                e.dump()
                cleanup_error(e.rc)
        if not self.osd_remaining() and is_prepared('OSS'):
            try:
                lctl.cleanup("OSS", "OSS_UUID", config.force,
                             config.failover)
            except CommandError, e:
                print "cleanup failed: ", self.name
                e.dump()
                cleanup_error(e.rc)
        if not self.osdtype == 'obdecho':
            clean_loop(self.devpath)
def mgmt_uuid_for_fs(mtpt_name):
    if not mtpt_name:
        return ''
    mtpt_db = toplevel.lookup_name(mtpt_name)
    fs_uuid = mtpt_db.get_first_ref('filesystem')
    fs = toplevel.lookup(fs_uuid)
    if not fs:
        return ''
    return fs.get_first_ref('mgmt')
# Generic client module, used by OSC and MDC
class Client(Module):
    def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
                 module_dir=None):
        self.target_name = tgtdb.getName()
        self.target_uuid = tgtdb.getUUID()
        self.db = tgtdb

        self.tgt_dev_uuid = get_active_target(tgtdb)
        if not self.tgt_dev_uuid:
            panic("No target device found for target:", self.target_name)

        self.kmod = kmod(config.lustre, config.portals)

        self.module = module
        self.module_name = string.upper(module)
        if not self_name:
            self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
                                         self.target_name, fs_name)
        else:
            self.name = self_name
        self.uuid = uuid
        self.lookup_server(self.tgt_dev_uuid)
        mgmt_uuid = mgmt_uuid_for_fs(fs_name)
        if mgmt_uuid:
            self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
        else:
            self.mgmt_name = ''
        self.fs_name = fs_name
        if not module_dir:
            module_dir = module
        self.add_lustre_module(module_dir, module)
    def lookup_server(self, srv_uuid):
        """ Lookup a server's network information """
        self._server_nets = get_ost_net(self.db, srv_uuid)
        if len(self._server_nets) == 0:
            panic ("Unable to find a server for:", srv_uuid)

    def get_servers(self):
        return self._server_nets
    def prepare(self, ignore_connect_failure = 0):
        self.info(self.target_uuid)
        if is_prepared(self.name):
            return
        try:
            srv = choose_local_server(self.get_servers())
            if srv:
                lctl.connect(srv)
            else:
                routes = find_route(self.get_servers())
                if len(routes) == 0:
                    panic ("no route to", self.target_uuid)
                for (srv, r) in routes:
                    lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
        except CommandError, e:
            if not ignore_connect_failure:
                raise e
        if srv:
            if self.target_uuid in config.inactive and self.permits_inactive():
                debug("%s inactive" % self.target_uuid)
                inactive_p = "inactive"
            else:
                debug("%s active" % self.target_uuid)
                inactive_p = ""
            lctl.newdev(self.module, self.name, self.uuid,
                        setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
                                                inactive_p, self.mgmt_name))
    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
            try:
                srv = choose_local_server(self.get_servers())
                if srv:
                    lctl.disconnect(srv)
                else:
                    for (srv, r) in find_route(self.get_servers()):
                        lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
                e.dump()
                cleanup_error(e.rc)
class MDC(Client):
    def __init__(self, db, uuid, fs_name):
        Client.__init__(self, db, uuid, 'mdc', fs_name)

    def permits_inactive(self):
        return 0

class OSC(Client):
    def __init__(self, db, uuid, fs_name):
        Client.__init__(self, db, uuid, 'osc', fs_name)

    def permits_inactive(self):
        return 1

def mgmtcli_name_for_uuid(uuid):
    return 'MGMTCLI_%s' % uuid

class ManagementClient(Client):
    def __init__(self, db, uuid):
        Client.__init__(self, db, uuid, 'mgmt_cli', '',
                        self_name = mgmtcli_name_for_uuid(db.getUUID()),
                        module_dir = 'mgmt')
class COBD(Module):
    def __init__(self, db):
        Module.__init__(self, 'COBD', db)
        self.real_uuid = self.db.get_first_ref('realobd')
        self.cache_uuid = self.db.get_first_ref('cacheobd')
        self.add_lustre_module('cobd' , 'cobd')

    # need to check /proc/mounts and /etc/mtab before
    # formatting anything.
    # FIXME: check if device is already formatted.
    def prepare(self):
        if is_prepared(self.name):
            return
        self.info(self.real_uuid, self.cache_uuid)
        lctl.newdev("cobd", self.name, self.uuid,
                    setup ="%s %s" %(self.real_uuid, self.cache_uuid))
# virtual interface for OSC and LOV
class VOSC(Module):
    def __init__(self, db, uuid, fs_name, name_override = None):
        Module.__init__(self, 'VOSC', db)
        if db.get_class() == 'lov':
            self.osc = LOV(db, uuid, fs_name, name_override)
        else:
            self.osc = get_osc(db, uuid, fs_name)
    def get_uuid(self):
        return self.osc.uuid
    def get_name(self):
        return self.osc.name
    def prepare(self):
        self.osc.prepare()
    def cleanup(self):
        self.osc.cleanup()
    def load_module(self):
        self.osc.load_module()
    def cleanup_module(self):
        self.osc.cleanup_module()
class ECHO_CLIENT(Module):
    def __init__(self,db):
        Module.__init__(self, 'ECHO_CLIENT', db)
        self.add_lustre_module('obdecho', 'obdecho')
        self.obd_uuid = self.db.get_first_ref('obd')
        obd = self.db.lookup(self.obd_uuid)
        self.uuid = generate_client_uuid(self.name)
        self.osc = VOSC(obd, self.uuid, self.name)

    def prepare(self):
        if is_prepared(self.name):
            return
        run_acceptors()
        self.osc.prepare() # XXX This is so cheating. -p
        self.info(self.obd_uuid)

        lctl.newdev("echo_client", self.name, self.uuid,
                    setup = self.osc.get_name())

    def cleanup(self):
        if is_prepared(self.name):
            Module.cleanup(self)
        self.osc.cleanup()

    def load_module(self):
        self.osc.load_module()
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        self.osc.cleanup_module()
def generate_client_uuid(name):
    client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
                                           name,
                                           int(random.random() * 1048576),
                                           int(random.random() * 1048576))
    return client_uuid[:36]
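# Illustrative example (not from the original source): for name 'mds1' this
# yields something like '0a3f2_mds1_04c1d09b7e' -- three 5-digit hex random
# fields around the name (itself truncated to 19 chars), with the whole
# string clipped to the 36-character UUID field width.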
class Mountpoint(Module):
    def __init__(self,db):
        Module.__init__(self, 'MTPT', db)
        self.path = self.db.get_val('path')
        self.fs_uuid = self.db.get_first_ref('filesystem')
        fs = self.db.lookup(self.fs_uuid)
        self.mds_uuid = fs.get_first_ref('mds')
        self.obd_uuid = fs.get_first_ref('obd')
        self.mgmt_uuid = fs.get_first_ref('mgmt')
        obd = self.db.lookup(self.obd_uuid)
        client_uuid = generate_client_uuid(self.name)
        self.vosc = VOSC(obd, client_uuid, self.name)
        self.mdc = get_mdc(db, client_uuid, self.name, self.mds_uuid)

        self.add_lustre_module('mdc', 'mdc')
        self.add_lustre_module('llite', 'llite')
        if self.mgmt_uuid:
            self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
                                            client_uuid)
        else:
            self.mgmtcli = None
    def prepare(self):
        if fs_is_mounted(self.path):
            log(self.path, "already mounted.")
            return
        if self.mgmtcli:
            self.mgmtcli.prepare()
        self.vosc.prepare()
        self.mdc.prepare()
        mdc_name = self.mdc.name

        self.info(self.path, self.mds_uuid, self.obd_uuid)
        if config.record or config.lctl_dump:
            lctl.mount_option(local_node_name, self.vosc.get_name(), mdc_name)
            return
        cmd = "mount -t lustre_lite -o osc=%s,mdc=%s %s %s" % \
              (self.vosc.get_name(), mdc_name, config.config, self.path)
        run("mkdir", self.path)
        ret, val = run(cmd)
        if ret:
            self.mdc.cleanup()
            self.vosc.cleanup()
            panic("mount failed:", self.path, ":", string.join(val))
    def cleanup(self):
        self.info(self.path, self.mds_uuid, self.obd_uuid)

        if config.record or config.lctl_dump:
            lctl.del_mount_option(local_node_name)
        else:
            if fs_is_mounted(self.path):
                if config.force:
                    (rc, out) = run("umount", "-f", self.path)
                else:
                    (rc, out) = run("umount", self.path)
                if rc:
                    raise CommandError('umount', out, rc)

            if fs_is_mounted(self.path):
                panic("fs is still mounted:", self.path)

        self.mdc.cleanup()
        self.vosc.cleanup()
        if self.mgmtcli:
            self.mgmtcli.cleanup()

    def load_module(self):
        if self.mgmtcli:
            self.mgmtcli.load_module()
        self.vosc.load_module()
        Module.load_module(self)

    def cleanup_module(self):
        Module.cleanup_module(self)
        self.vosc.cleanup_module()
        if self.mgmtcli:
            self.mgmtcli.cleanup_module()
# ============================================================
# misc query functions

def get_ost_net(self, osd_uuid):
    srv_list = []
    if not osd_uuid:
        return srv_list
    osd = self.lookup(osd_uuid)
    node_uuid = osd.get_first_ref('node')
    node = self.lookup(node_uuid)
    if not node:
        panic("unable to find node for osd_uuid:", osd_uuid,
              " node_ref:", node_uuid)
    for net_uuid in node.get_networks():
        db = node.lookup(net_uuid)
        srv_list.append(Network(db))
    return srv_list
# the order of initialization is based on level.
def getServiceLevel(self):
    type = self.get_class()
    ret = 0
    if type in ('network',):
        ret = 5
    elif type in ('routetbl',):
        ret = 6
    elif type in ('ldlm',):
        ret = 20
    elif type in ('mgmt',):
        ret = 25
    elif type in ('osd', 'cobd'):
        ret = 30
    elif type in ('mdsdev',):
        ret = 40
    elif type in ('mountpoint', 'echoclient'):
        ret = 70
    else:
        panic("Unknown type: ", type)

    if ret < config.minlevel or ret > config.maxlevel:
        ret = 0
    return ret
#
# return list of services in a profile. list is a list of tuples
# [(level, db_object),]
def getServices(self):
    list = []
    for ref_class, ref_uuid in self.get_all_refs():
        servdb = self.lookup(ref_uuid)
        if servdb:
            level = getServiceLevel(servdb)
            if level > 0:
                list.append((level, servdb))
        else:
            panic('service not found: ' + ref_uuid)

    list.sort()
    return list
############################################################
# FIXME: clean this mess up!
#
# OSC is no longer in the xml, so we have to fake it.
# this is getting ugly and begging for another refactoring
def get_osc(ost_db, uuid, fs_name):
    osc = OSC(ost_db, uuid, fs_name)
    return osc

def get_mdc(db, uuid, fs_name, mds_uuid):
    mds_db = db.lookup(mds_uuid)
    if not mds_db:
        panic("no mds:", mds_uuid)
    mdc = MDC(mds_db, uuid, fs_name)
    return mdc
############################################################
# routing ("rooting")

# list of (nettype, cluster_id, nid)
local_clusters = []

def find_local_clusters(node_db):
    global local_clusters
    for netuuid in node_db.get_networks():
        net = node_db.lookup(netuuid)
        srv = Network(net)
        debug("add_local", netuuid)
        local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
        if srv.port > 0:
            if acceptors.has_key(srv.port):
                panic("duplicate port:", srv.port)
            acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
                                                  srv.send_mem, srv.recv_mem,
                                                  srv.irq_affinity)
1985 def node_is_router():
1988 # If there are any routers found in the config, then this will be true
1989 # and all nodes will load kptlrouter.
1991 def node_needs_router():
1992 return needs_router or is_router
# list of (nettype, gw, tgt_cluster_id, lo, hi)
# Currently, these local routes are only added to the kptlrouter route
# table if they are needed to connect to a specific server.  This
# should be changed so all available routes are loaded, and the
# ptlrouter can make all the decisions.
local_routes = []

def find_local_routes(lustre):
    """ Scan the lustre config looking for routers.  Build list of
    routes. """
    global local_routes, needs_router
    local_routes = []
    list = lustre.lookup_class('node')
    for router in list:
        if router.get_val_int('router', 0):
            needs_router = 1
            for (local_type, local_cluster_id, local_nid) in local_clusters:
                gw = None
                for netuuid in router.get_networks():
                    db = router.lookup(netuuid)
                    if (local_type == db.get_val('nettype') and
                        local_cluster_id == db.get_val('clusterid')):
                        gw = db.get_val('nid')
                        break
                if gw:
                    debug("find_local_routes: gw is", gw)
                    for route in router.get_local_routes(local_type, gw):
                        local_routes.append(route)
    debug("find_local_routes:", local_routes)
def choose_local_server(srv_list):
    for srv in srv_list:
        if local_cluster(srv.net_type, srv.cluster_id):
            return srv
    return None

def local_cluster(net_type, cluster_id):
    for cluster in local_clusters:
        if net_type == cluster[0] and cluster_id == cluster[1]:
            return 1
    return 0

def local_interface(net_type, cluster_id, nid):
    for cluster in local_clusters:
        if (net_type == cluster[0] and cluster_id == cluster[1]
            and nid == cluster[2]):
            return 1
    return 0
def find_route(srv_list):
    result = []
    frm_type = local_clusters[0][0]
    for srv in srv_list:
        debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
        to_type = srv.net_type
        to = srv.nid
        cluster_id = srv.cluster_id
        debug ('looking for route to', to_type, to)
        for r in local_routes:
            debug("find_route: ", r)
            if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
                result.append((srv, r))
    return result
def get_active_target(db):
    target_uuid = db.getUUID()
    target_name = db.getName()
    node_name = get_select(target_name)
    if node_name:
        tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
    else:
        tgt_dev_uuid = db.get_first_ref('active')
    return tgt_dev_uuid

def get_server_by_nid_uuid(db, nid_uuid):
    for n in db.lookup_class("network"):
        net = Network(n)
        if net.nid_uuid == nid_uuid:
            return net
    return None
############################################################
# lconf level logic
# Start a service.
def newService(db):
    type = db.get_class()
    debug('Service:', type, db.getName(), db.getUUID())
    n = None
    if type == 'ldlm':
        n = LDLM(db)
    elif type == 'lov':
        n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
    elif type == 'network':
        n = Network(db)
    elif type == 'routetbl':
        n = RouteTable(db)
    elif type == 'osd':
        n = OSD(db)
    elif type == 'cobd':
        n = COBD(db)
    elif type == 'mdsdev':
        n = MDSDEV(db)
    elif type == 'mountpoint':
        n = Mountpoint(db)
    elif type == 'echoclient':
        n = ECHO_CLIENT(db)
    elif type == 'mgmt':
        n = Management(db)
    else:
        panic ("unknown service type:", type)
    return n
#
# Prepare the system to run lustre using a particular profile
# in the configuration.
#  * load the modules
#  * setup networking for the current node
#  * make sure partitions are in place and prepared
#  * initialize devices with lctl
# Levels are important, and need to be enforced.
def for_each_profile(db, prof_list, operation):
    for prof_uuid in prof_list:
        prof_db = db.lookup(prof_uuid)
        if not prof_db:
            panic("profile:", prof_uuid, "not found.")
        services = getServices(prof_db)
        operation(services)
def doSetup(services):
    if config.nosetup:
        return
    for s in services:
        n = newService(s[1])
        n.prepare()

def doModules(services):
    if config.nomod:
        return
    for s in services:
        n = newService(s[1])
        n.load_module()

def doCleanup(services):
    if config.nosetup:
        return
    services.reverse()
    for s in services:
        n = newService(s[1])
        if n.safe_to_clean():
            n.cleanup()

def doUnloadModules(services):
    if config.nomod:
        return
    services.reverse()
    for s in services:
        n = newService(s[1])
        if n.safe_to_clean_modules():
            n.cleanup_module()
def doHost(lustreDB, hosts):
    global is_router, local_node_name
    node_db = None
    for h in hosts:
        node_db = lustreDB.lookup_name(h, 'node')
        if node_db:
            break
    if not node_db:
        print 'No host entry found.'
        return

    local_node_name = node_db.get_val('name', 0)
    is_router = node_db.get_val_int('router', 0)
    lustre_upcall = node_db.get_val('lustreUpcall', '')
    portals_upcall = node_db.get_val('portalsUpcall', '')
    timeout = node_db.get_val_int('timeout', 0)

    find_local_clusters(node_db)
    if not is_router:
        find_local_routes(lustreDB)
    # Two step process: (1) load modules, (2) setup lustre
    # if not cleaning, load modules first.
    prof_list = node_db.get_refs('profile')

    if config.write_conf:
        for_each_profile(node_db, prof_list, doModules)
        sys_make_devices()
        for node_db in lustreDB.lookup_class('mdsdev'):
            mds = MDSDEV(node_db)
            mds.write_conf()
        for_each_profile(node_db, prof_list, doUnloadModules)

    elif config.recover:
        if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
            raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
                                     "--client_uuid <UUID> --conn_uuid <UUID>")
        doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
                   config.conn_uuid)
    elif config.cleanup:
        if config.force:
            # the command line can override this value
            timeout = 5
        # ugly hack, only need to run lctl commands for --dump
        if config.lctl_dump or config.record:
            for_each_profile(node_db, prof_list, doCleanup)
            return

        sys_set_timeout(timeout)
        sys_set_ptldebug()
        sys_set_subsystem()
        sys_set_lustre_upcall(lustre_upcall)
        sys_set_portals_upcall(portals_upcall)

        for_each_profile(node_db, prof_list, doCleanup)
        for_each_profile(node_db, prof_list, doUnloadModules)

    else:
        # ugly hack, only need to run lctl commands for --dump
        if config.lctl_dump or config.record:
            sys_set_timeout(timeout)
            sys_set_lustre_upcall(lustre_upcall)
            for_each_profile(node_db, prof_list, doSetup)
            return

        sys_make_devices()
        sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
        sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)

        for_each_profile(node_db, prof_list, doModules)

        sys_set_debug_path()
        sys_set_ptldebug()
        sys_set_subsystem()
        script = config.gdb_script
        run(lctl.lctl, ' modules >', script)
        if config.gdb:
            log ("The GDB module script is in", script)
            # pause, so user has time to break and
            # load the script
            time.sleep(5)
        sys_set_timeout(timeout)
        sys_set_lustre_upcall(lustre_upcall)
        sys_set_portals_upcall(portals_upcall)

        for_each_profile(node_db, prof_list, doSetup)
def doRecovery(db, lctl, tgt_uuid, client_uuid, nid_uuid):
    tgt = db.lookup(tgt_uuid)
    if not tgt:
        raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
    new_uuid = get_active_target(tgt)
    if not new_uuid:
        raise Lustre.LconfError("doRecovery: no active target found for: " +
                                tgt_uuid)
    net = choose_local_server(get_ost_net(db, new_uuid))
    if not net:
        raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)

    log("Reconnecting", tgt_uuid, " to ", net.nid_uuid)
    try:
        oldnet = get_server_by_nid_uuid(db, nid_uuid)
        if oldnet:
            lctl.disconnect(oldnet)
    except CommandError, e:
        log("recover: disconnect", nid_uuid, "failed: ")
        e.dump()

    try:
        lctl.connect(net)
    except CommandError, e:
        log("recover: connect failed")
        e.dump()

    lctl.recover(client_uuid, net.nid_uuid)
def setupModulePath(cmd, portals_dir = PORTALS_DIR):
    base = os.path.dirname(cmd)
    if development_mode():
        if not config.lustre:
            config.lustre = (os.path.join(base, ".."))
        # normalize the portals dir, using command line arg if set
        if config.portals:
            portals_dir = config.portals
        dir = os.path.join(config.lustre, portals_dir)
        config.portals = dir
        debug('config.portals', config.portals)
    elif config.lustre and config.portals:
        # production mode
        # if --lustre and --portals, normalize portals
        # can ignore PORTALS_DIR here, since it is probably useless here
        config.portals = os.path.join(config.lustre, config.portals)
        debug('config.portals B', config.portals)
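# Illustrative example (not from the original source): when run from a source
# tree as /src/lustre/utils/lconf with no --lustre/--portals options,
# development_mode() finds /src/lustre/utils/Makefile.am, so config.lustre
# becomes /src/lustre/utils/.. and config.portals becomes
# /src/lustre/utils/../portals. The /src path is hypothetical.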
def sysctl(path, val):
    debug("+ sysctl", path, val)
    if config.noexec:
        return
    try:
        fp = open(os.path.join('/proc/sys', path), 'w')
        fp.write(str(val))
        fp.close()
    except IOError, e:
        panic(str(e))

def sys_set_debug_path():
    sysctl('portals/debug_path', config.debug_path)
def sys_set_lustre_upcall(upcall):
    # the command line overrides the value in the node config
    if config.lustre_upcall:
        upcall = config.lustre_upcall
    elif config.upcall:
        upcall = config.upcall
    if upcall:
        lctl.set_lustre_upcall(upcall)

def sys_set_portals_upcall(upcall):
    # the command line overrides the value in the node config
    if config.portals_upcall:
        upcall = config.portals_upcall
    elif config.upcall:
        upcall = config.upcall
    if upcall:
        sysctl('portals/upcall', upcall)
def sys_set_timeout(timeout):
    # the command line overrides the value in the node config
    if config.timeout and config.timeout > 0:
        timeout = config.timeout
    if timeout != None and timeout > 0:
        lctl.set_timeout(timeout)

def sys_tweak_socknal ():
    if config.single_socket:
        sysctl("socknal/typed", 0)

def sys_optimize_elan ():
    run ("echo 0 > /proc/elan/config/eventint_punt_loops")
def sys_set_ptldebug():
    if config.ptldebug != None:
        try:
            val = eval(config.ptldebug, ptldebug_names)
            val = "0x%x" % (val,)
            sysctl('portals/debug', val)
        except NameError, e:
            panic(str(e))

def sys_set_subsystem():
    if config.subsystem != None:
        try:
            val = eval(config.subsystem, subsystem_names)
            val = "0x%x" % (val,)
            sysctl('portals/subsystem_debug', val)
        except NameError, e:
            panic(str(e))
def sys_set_netmem_max(path, max):
    debug("setting", path, "to at least", max)
    if config.noexec:
        return
    fp = open(path)
    str = fp.readline()
    fp.close()
    cur = int(str)
    if max > cur:
        fp = open(path, 'w')
        fp.write('%d\n' %(max))
        fp.close()
def sys_make_devices():
    if not os.access('/dev/portals', os.R_OK):
        run('mknod /dev/portals c 10 240')
    if not os.access('/dev/obd', os.R_OK):
        run('mknod /dev/obd c 10 241')
# Add dir to the global PATH, if not already there.
def add_to_path(new_dir):
    syspath = string.split(os.environ['PATH'], ':')
    if new_dir in syspath:
        return
    os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
def default_debug_path():
    path = '/tmp/lustre-log'
    if os.path.isdir('/r'):
        return '/r' + path
    else:
        return path

def default_gdb_script():
    script = '/tmp/ogdb'
    if os.path.isdir('/r'):
        return '/r' + script
    else:
        return script
DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
# ensure basic elements are in the system path
def sanitise_path():
    for dir in DEFAULT_PATH:
        add_to_path(dir)
# global hack for the --select handling
tgt_select = {}
def init_select(args):
    # args = [service=nodeA,service2=nodeB service3=nodeC]
    global tgt_select
    for arg in args:
        list = string.split(arg, ',')
        for entry in list:
            srv, node = string.split(entry, '=')
            tgt_select[srv] = node
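# Illustrative example (not from the original source):
#     lconf --select mds1=nodeA,ost1=nodeB config.xml
# leaves tgt_select == {'mds1': 'nodeA', 'ost1': 'nodeB'}, so
# get_select('mds1') below returns 'nodeA'.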
def get_select(srv):
    if tgt_select.has_key(srv):
        return tgt_select[srv]
    return None
FLAG = Lustre.Options.FLAG
PARAM = Lustre.Options.PARAM
INTPARAM = Lustre.Options.INTPARAM
PARAMLIST = Lustre.Options.PARAMLIST
lconf_options = [
    ('verbose,v', "Print system commands as they are run"),
    ('ldapurl', "LDAP server URL, eg. ldap://localhost", PARAM),
    ('config', "Cluster config name used for LDAP query", PARAM),
    ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
    ('node', "Load config for <nodename>", PARAM),
    ('cleanup,d', "Cleans up config. (Shutdown)"),
    ('force,f', "Forced unmounting and/or obd detach during cleanup",
     FLAG, 0),
    ('single_socket', "socknal option: only use one socket instead of bundle",
     FLAG, 0),
    ('failover', """Used to shut down without saving state.
                    This will allow this node to "give up" a service to
                    another node for failover purposes. This will not
                    be a clean shutdown.""",
     FLAG, 0),
    ('gdb', """Prints message after creating gdb module script
                    and sleeps for 5 seconds."""),
    ('noexec,n', """Prints the commands and steps that will be run for a
                    config without executing them. This can be used to check if a
                    config file is doing what it should be doing"""),
    ('nomod', "Skip load/unload module step."),
    ('nosetup', "Skip device setup/cleanup step."),
    ('reformat', "Reformat all devices (without question)"),
    ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
    ('dump', "Dump the kernel debug log to file before portals is unloaded",
     PARAM),
    ('write_conf', "Save all the client config information on mds."),
    ('record', "Write config information on mds."),
    ('record_log', "Name of config record log.", PARAM),
    ('record_device', "MDS device name that will record the config commands",
     PARAM),
    ('minlevel', "Minimum level of services to configure/cleanup",
     INTPARAM, 0),
    ('maxlevel', """Maximum level of services to configure/cleanup
                    Levels are approximately like:
                            70 - mountpoint, echo_client, osc, mdc, lov""",
     INTPARAM, 100),
    ('lustre', """Base directory of lustre sources. This parameter will
                    cause lconf to load modules from a source tree.""", PARAM),
    ('portals', """Portals source directory.  If this is a relative path,
                    then it is assumed to be relative to lustre. """, PARAM),
    ('timeout', "Set recovery timeout", INTPARAM),
    ('upcall', "Set both portals and lustre upcall script", PARAM),
    ('lustre_upcall', "Set lustre upcall script", PARAM),
    ('portals_upcall', "Set portals upcall script", PARAM),
    ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
    ('ptldebug', "Set the portals debug level", PARAM),
    ('subsystem', "Set the portals debug subsystem", PARAM),
    ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
    ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
    # Client recovery options
    ('recover', "Recover a device"),
    ('group', "The group of devices to configure or cleanup", PARAM),
    ('tgt_uuid', "The failed target (required for recovery)", PARAM),
    ('client_uuid', "The failed client (required for recovery)", PARAM),
    ('conn_uuid', "The failed connection (required for recovery)", PARAM),
    ('inactive', """The name of an inactive service, to be ignored during
                    mounting (currently OST-only). Can be repeated.""",
     PARAMLIST),
    ]
def main():
    global lctl, config, toplevel, CONFIG_FILE

    # in the upcall this is set to SIG_IGN
    signal.signal(signal.SIGCHLD, signal.SIG_DFL)

    cl = Lustre.Options("lconf", "config.xml", lconf_options)
    try:
        config, args = cl.parse(sys.argv[1:])
    except Lustre.OptionError, e:
        print e
        sys.exit(1)

    setupModulePath(sys.argv[0])

    host = socket.gethostname()

    # the PRNG is normally seeded with time(), which is not so good for starting
    # time-synchronized clusters
    input = open('/dev/urandom', 'r')
    if not input:
        print 'Unable to open /dev/urandom!'
        sys.exit(1)
    seed = input.read(32)
    input.close()
    random.seed(seed)

    sanitise_path()

    init_select(config.select)

    if len(args) > 0:
        if not os.access(args[0], os.R_OK):
            print 'File not found or readable:', args[0]
            sys.exit(1)
        try:
            dom = xml.dom.minidom.parse(args[0])
        except Exception:
            panic("%s does not appear to be a config file." % (args[0]))
            sys.exit(1) # make sure to die here, even in debug mode.
        CONFIG_FILE = args[0]
        db = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
        if not config.config:
            config.config = os.path.basename(args[0])# use full path?
            if config.config[-4:] == '.xml':
                config.config = config.config[:-4]
    elif config.ldapurl:
        if not config.config:
            panic("--ldapurl requires --config name")
        dn = "config=%s,fs=lustre" % (config.config)
        db = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
    else:
        print 'Missing config file or ldap URL.'
        print 'see lconf --help for command summary'
        sys.exit(1)

    toplevel = db

    ver = db.get_version()
    if not ver:
        panic("No version found in config data, please recreate.")
    if ver != Lustre.CONFIG_VERSION:
        panic("Config version", ver, "does not match lconf version",
              Lustre.CONFIG_VERSION)

    node_list = []
    if config.node:
        node_list.append(config.node)
    else:
        if len(host) > 0:
            node_list.append(host)
        node_list.append('localhost')

    debug("configuring for host: ", node_list)

    if len(host) > 0:
        config.debug_path = config.debug_path + '-' + host
        config.gdb_script = config.gdb_script + '-' + host

    lctl = LCTLInterface('lctl')

    if config.lctl_dump:
        lctl.use_save_file(config.lctl_dump)

    if config.record:
        if not (config.record_device and config.record_log):
            panic("When recording, both --record_log and --record_device must be specified.")
        lctl.record(config.record_device, config.record_log)

    doHost(db, node_list)

    if config.record:
        lctl.end_record()

if __name__ == "__main__":
    try:
        main()
    except Lustre.LconfError, e:
        print e
#        traceback.print_exc(file=sys.stdout)
        sys.exit(1)
    except CommandError, e:
        e.dump()
        sys.exit(e.rc)

    if first_cleanup_error:
        sys.exit(first_cleanup_error)