3 # Copyright (C) 2002-2003 Cluster File Systems, Inc.
4 # Authors: Robert Read <rread@clusterfs.com>
5 # Mike Shaver <shaver@clusterfs.com>
6 # This file is part of Lustre, http://www.lustre.org.
8 # Lustre is free software; you can redistribute it and/or
9 # modify it under the terms of version 2 of the GNU General Public
10 # License as published by the Free Software Foundation.
12 # Lustre is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Lustre; if not, write to the Free Software
19 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 # lconf - lustre configuration tool
23 # lconf is the main driver script for starting and stopping
24 # lustre filesystem services.
26 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import sys, getopt, types
29 import string, os, stat, popen2, socket, time, random, fcntl, select
30 import re, exceptions, signal, traceback
31 import xml.dom.minidom
33 if sys.version[0] == '1':
34 from FCNTL import F_GETFL, F_SETFL
36 from fcntl import F_GETFL, F_SETFL
38 PYMOD_DIR = "/usr/lib/lustre/python"
40 def development_mode():
41 base = os.path.dirname(sys.argv[0])
42 if os.access(base+"/Makefile", os.R_OK):
46 if development_mode():
47 sys.path.append('../utils')
49 sys.path.append(PYMOD_DIR)
55 DEFAULT_TCPBUF = 8388608
58 # Maximum number of devices to search for.
59 # (the /dev/loop* nodes need to be created beforehand)
60 MAX_LOOP_DEVICES = 256
61 PORTALS_DIR = 'portals'
63 # Needed to call lconf --record
66 # Please keep these in sync with the values in portals/kp30.h
78 "warning" : (1 << 10),
82 "portals" : (1 << 14),
84 "dlmtrace" : (1 << 16),
88 "rpctrace" : (1 << 20),
89 "vfstrace" : (1 << 21),
94 "undefined" : (1 << 0),
104 "portals" : (1 << 10),
105 "socknal" : (1 << 11),
106 "qswnal" : (1 << 12),
107 "pinger" : (1 << 13),
108 "filter" : (1 << 14),
114 "ptlrouter" : (1 << 20),
# Overall cleanup status: remembers the first non-zero rc reported.
first_cleanup_error = 0

def cleanup_error(rc):
    """Record rc as the run's cleanup status unless an error is already noted."""
    global first_cleanup_error
    if not first_cleanup_error:
        first_cleanup_error = rc
126 # ============================================================
127 # debugging and error funcs
def fixme(msg = "this feature"):
    """Abort via LconfError, noting that msg is not implemented yet."""
    # Call-form raise for consistency with the other LconfError raise sites,
    # and fix the misspelled "implmemented" in the user-visible message.
    raise Lustre.LconfError(msg + ' not implemented yet.')
133 msg = string.join(map(str,args))
134 if not config.noexec:
135 raise Lustre.LconfError(msg)
140 msg = string.join(map(str,args))
145 print string.strip(s)
149 msg = string.join(map(str,args))
152 # ack, python's builtin int() does not support '0x123' syntax.
153 # eval can do it, although what a hack!
157 return eval(s, {}, {})
160 except SyntaxError, e:
161 raise ValueError("not a number")
163 raise ValueError("not a number")
165 # ============================================================
166 # locally defined exceptions
167 class CommandError (exceptions.Exception):
168 def __init__(self, cmd_name, cmd_err, rc=None):
169 self.cmd_name = cmd_name
170 self.cmd_err = cmd_err
175 if type(self.cmd_err) == types.StringType:
177 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
179 print "! %s: %s" % (self.cmd_name, self.cmd_err)
180 elif type(self.cmd_err) == types.ListType:
182 print "! %s (error %d):" % (self.cmd_name, self.rc)
184 print "! %s:" % (self.cmd_name)
185 for s in self.cmd_err:
186 print "> %s" %(string.strip(s))
191 # ============================================================
192 # handle daemons, like the acceptor
194 """ Manage starting and stopping a daemon. Assumes daemon manages
195 its own pid file. """
197 def __init__(self, cmd):
203 log(self.command, "already running.")
205 self.path = find_prog(self.command)
207 panic(self.command, "not found.")
208 ret, out = runcmd(self.path +' '+ self.command_line())
210 raise CommandError(self.path, out, ret)
214 pid = self.read_pidfile()
216 log ("killing process", pid)
218 #time.sleep(1) # let daemon die
220 log("unable to kill", self.command, e)
222 log("unable to kill", self.command)
225 pid = self.read_pidfile()
235 def read_pidfile(self):
237 fp = open(self.pidfile(), 'r')
244 def clean_pidfile(self):
245 """ Remove a stale pidfile """
246 log("removing stale pidfile:", self.pidfile())
248 os.unlink(self.pidfile())
250 log(self.pidfile(), e)
252 class AcceptorHandler(DaemonHandler):
253 def __init__(self, port, net_type, send_mem, recv_mem, irq_aff):
254 DaemonHandler.__init__(self, "acceptor")
257 self.send_mem = send_mem
258 self.recv_mem = recv_mem
261 self.flags = self.flags + ' -i'
264 return "/var/run/%s-%d.pid" % (self.command, self.port)
def command_line(self):
    """Build the acceptor argument string: buffer sizes, extra flags and port."""
    # str.join with a single space matches string.join's default separator.
    opts = ('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port)
    return ' '.join(map(str, opts))
271 # start the acceptors
273 if config.lctl_dump or config.record:
275 for port in acceptors.keys():
276 daemon = acceptors[port]
277 if not daemon.running():
280 def run_one_acceptor(port):
281 if config.lctl_dump or config.record:
283 if acceptors.has_key(port):
284 daemon = acceptors[port]
285 if not daemon.running():
288 panic("run_one_acceptor: No acceptor defined for port:", port)
290 def stop_acceptor(port):
291 if acceptors.has_key(port):
292 daemon = acceptors[port]
297 # ============================================================
298 # handle lctl interface
301 Manage communication with lctl
304 def __init__(self, cmd):
306 Initialize close by finding the lctl binary.
308 self.lctl = find_prog(cmd)
310 self.record_device = ''
313 debug('! lctl not found')
316 raise CommandError('lctl', "unable to find lctl binary.")
318 def use_save_file(self, file):
319 self.save_file = file
def record(self, dev_name, logname):
    """Begin directing subsequent commands into config log logname on dev_name."""
    log("Recording log", logname, "on", dev_name)
    self.record_log = logname
    self.record_device = dev_name
def end_record(self):
    """Finish recording: announce it, then clear the record target state."""
    log("End recording log", self.record_log, "on", self.record_device)
    self.record_log = None
    self.record_device = None
def set_nonblock(self, fd):
    """Switch fd to non-blocking mode while preserving its other status flags."""
    flags = fcntl.fcntl(fd, F_GETFL)
    fcntl.fcntl(fd, F_SETFL, flags | os.O_NDELAY)
338 the cmds are written to stdin of lctl
339 lctl doesn't return errors when run in script mode, so
341 should modify command line to accept multiple commands, or
342 create complex command line options
346 cmds = '\n dump ' + self.save_file + '\n' + cmds
347 elif self.record_device:
351 %s""" % (self.record_device, self.record_log, cmds)
353 debug("+", cmd_line, cmds)
354 if config.noexec: return (0, [])
356 child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
357 child.tochild.write(cmds + "\n")
358 child.tochild.close()
360 # From "Python Cookbook" from O'Reilly
361 outfile = child.fromchild
362 outfd = outfile.fileno()
363 self.set_nonblock(outfd)
364 errfile = child.childerr
365 errfd = errfile.fileno()
366 self.set_nonblock(errfd)
368 outdata = errdata = ''
371 ready = select.select([outfd,errfd],[],[]) # Wait for input
372 if outfd in ready[0]:
373 outchunk = outfile.read()
374 if outchunk == '': outeof = 1
375 outdata = outdata + outchunk
376 if errfd in ready[0]:
377 errchunk = errfile.read()
378 if errchunk == '': erreof = 1
379 errdata = errdata + errchunk
380 if outeof and erreof: break
381 # end of "borrowed" code
384 if os.WIFEXITED(ret):
385 rc = os.WEXITSTATUS(ret)
388 if rc or len(errdata):
389 raise CommandError(self.lctl, errdata, rc)
392 def runcmd(self, *args):
394 run lctl using the command line
396 cmd = string.join(map(str,args))
397 debug("+", self.lctl, cmd)
398 rc, out = run(self.lctl, cmd)
400 raise CommandError(self.lctl, out, rc)
404 def clear_log(self, dev, log):
405 """ clear an existing log """
410 quit """ % (dev, log)
413 def network(self, net, nid):
418 quit """ % (net, nid)
421 # create a new connection
422 def add_uuid(self, net_type, uuid, nid):
423 cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type)
426 def add_autoconn(self, net_type, send_mem, recv_mem, nid, hostaddr,
428 if net_type in ('tcp',) and not config.lctl_dump:
433 add_autoconn %s %s %d %s
437 nid, hostaddr, port, flags )
440 def connect(self, srv):
441 self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
442 if srv.net_type in ('tcp',) and not config.lctl_dump:
446 self.add_autoconn(srv.net_type, srv.send_mem, srv.recv_mem,
447 srv.nid, srv.hostaddr, srv.port, flags)
450 def recover(self, dev_name, new_conn):
453 recover %s""" %(dev_name, new_conn)
456 # add a route to a range
457 def add_route(self, net, gw, lo, hi):
465 except CommandError, e:
469 def del_route(self, net, gw, lo, hi):
474 quit """ % (net, gw, lo, hi)
477 # add a route to a host
478 def add_route_host(self, net, uuid, gw, tgt):
479 self.add_uuid(net, uuid, tgt)
487 except CommandError, e:
491 # add a route to a range
492 def del_route_host(self, net, uuid, gw, tgt):
498 quit """ % (net, gw, tgt)
502 def del_autoconn(self, net_type, nid, hostaddr):
503 if net_type in ('tcp',) and not config.lctl_dump:
512 # disconnect one connection
513 def disconnect(self, srv):
514 self.del_uuid(srv.nid_uuid)
515 if srv.net_type in ('tcp',) and not config.lctl_dump:
516 self.del_autoconn(srv.net_type, srv.nid, srv.hostaddr)
518 def del_uuid(self, uuid):
526 def disconnectAll(self, net):
534 def attach(self, type, name, uuid):
537 quit""" % (type, name, uuid)
540 def setup(self, name, setup = ""):
544 quit""" % (name, setup)
548 # create a new device with lctl
549 def newdev(self, type, name, uuid, setup = ""):
550 self.attach(type, name, uuid);
552 self.setup(name, setup)
553 except CommandError, e:
554 self.cleanup(name, uuid, 0)
559 def cleanup(self, name, uuid, force, failover = 0):
560 if failover: force = 1
566 quit""" % (name, ('', 'force')[force],
567 ('', 'failover')[failover])
571 def lov_setup(self, name, uuid, desc_uuid, mdsuuid, stripe_cnt,
572 stripe_sz, stripe_off,
576 lov_setup %s %d %d %d %s %s
577 quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off,
582 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off,
586 lov_setconfig %s %d %d %d %s %s
587 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
591 def dump(self, dump_file):
594 quit""" % (dump_file)
597 # get list of devices
598 def device_list(self):
599 devices = '/proc/fs/lustre/devices'
601 if os.access(devices, os.R_OK):
603 fp = open(devices, 'r')
611 def lustre_version(self):
612 rc, out = self.runcmd('version')
616 def mount_option(self, profile, osc, mdc):
618 mount_option %s %s %s
619 quit""" % (profile, osc, mdc)
622 # delete mount options
623 def del_mount_option(self, profile):
629 def set_timeout(self, timeout):
635 # delete mount options
636 def set_lustre_upcall(self, upcall):
641 # ============================================================
642 # Various system-level functions
643 # (ideally moved to their own module)
645 # Run a command and return the output and status.
646 # stderr is sent to /dev/null, could use popen3 to
647 # save it if necessary
650 if config.noexec: return (0, [])
651 f = os.popen(cmd + ' 2>&1')
661 cmd = string.join(map(str,args))
664 # Run a command in the background.
665 def run_daemon(*args):
666 cmd = string.join(map(str,args))
668 if config.noexec: return 0
669 f = os.popen(cmd + ' 2>&1')
677 # Determine full path to use for an external command
678 # searches dirname(argv[0]) first, then PATH
680 syspath = string.split(os.environ['PATH'], ':')
681 cmdpath = os.path.dirname(sys.argv[0])
682 syspath.insert(0, cmdpath);
684 syspath.insert(0, os.path.join(config.portals, 'utils/'))
686 prog = os.path.join(d,cmd)
687 if os.access(prog, os.X_OK):
691 # Recursively look for file starting at base dir
692 def do_find_file(base, mod):
693 fullname = os.path.join(base, mod)
694 if os.access(fullname, os.R_OK):
696 for d in os.listdir(base):
697 dir = os.path.join(base,d)
698 if os.path.isdir(dir):
699 module = do_find_file(dir, mod)
703 def find_module(src_dir, dev_dir, modname):
704 modbase = src_dir +'/'+ dev_dir +'/'+ modname
705 for modext in '.ko', '.o':
706 module = modbase + modext
708 if os.access(module, os.R_OK):
714 # is the path a block device?
721 return stat.S_ISBLK(s[stat.ST_MODE])
723 # build fs according to type
725 def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
731 panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
733 # devsize is in 1k, and fs block count is in 4k
734 block_cnt = devsize/4
736 if fstype in ('ext3', 'extN', 'ldiskfs'):
737 # ext3 journal size is in megabytes
740 if not is_block(dev):
741 ret, out = runcmd("ls -l %s" %dev)
742 devsize = int(string.split(out[0])[4]) / 1024
744 ret, out = runcmd("sfdisk -s %s" %dev)
745 devsize = int(out[0])
746 if devsize > 1024 * 1024:
747 jsize = ((devsize / 102400) * 4)
750 if jsize: jopt = "-J size=%d" %(jsize,)
751 if isize: iopt = "-I %d" %(isize,)
752 mkfs = 'mkfs.ext2 -j -b 4096 '
753 if not isblock or config.force:
755 elif fstype == 'reiserfs':
756 # reiserfs journal size is in blocks
757 if jsize: jopt = "--journal_size %d" %(jsize,)
758 mkfs = 'mkreiserfs -ff'
760 panic('unsupported fs type: ', fstype)
762 if config.mkfsoptions != None:
763 mkfs = mkfs + ' ' + config.mkfsoptions
764 if mkfsoptions != None:
765 mkfs = mkfs + ' ' + mkfsoptions
766 (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt)
768 panic("Unable to build fs:", dev, string.join(out))
769 # enable hash tree indexing on the fs
770 if fstype in ('ext3', 'extN', 'ldiskfs'):
771 htree = 'echo "feature FEATURE_C5" | debugfs -w'
772 (ret, out) = run (htree, dev)
774 panic("Unable to enable htree:", dev)
776 # some systems use /dev/loopN, some /dev/loop/N
780 if not os.access(loop + str(0), os.R_OK):
782 if not os.access(loop + str(0), os.R_OK):
783 panic ("can't access loop devices")
786 # find loop device assigned to thefile
789 for n in xrange(0, MAX_LOOP_DEVICES):
791 if os.access(dev, os.R_OK):
792 (stat, out) = run('losetup', dev)
793 if out and stat == 0:
794 m = re.search(r'\((.*)\)', out[0])
795 if m and file == m.group(1):
801 # create file if necessary and assign the first free loop device
802 def init_loop(file, size, fstype, journal_size, inode_size, mkfsoptions, reformat):
803 dev = find_loop(file)
805 print 'WARNING file:', file, 'already mapped to', dev
807 if reformat or not os.access(file, os.R_OK | os.W_OK):
809 panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (file,size))
810 (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size,
813 panic("Unable to create backing store:", file)
814 mkfs(file, size, fstype, journal_size, inode_size, mkfsoptions, isblock=0)
817 # find next free loop
818 for n in xrange(0, MAX_LOOP_DEVICES):
820 if os.access(dev, os.R_OK):
821 (stat, out) = run('losetup', dev)
823 run('losetup', dev, file)
826 print "out of loop devices"
828 print "out of loop devices"
831 # undo loop assignment
832 def clean_loop(file):
833 dev = find_loop(file)
835 ret, out = run('losetup -d', dev)
837 log('unable to clean loop device:', dev, 'for file:', file)
840 # determine if dev is formatted as a <fstype> filesystem
841 def need_format(fstype, dev):
842 # FIXME don't know how to implement this
845 # initialize a block device if needed
846 def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
847 inode_size, mkfsoptions):
848 if config.noexec: return dev
849 if not is_block(dev):
850 dev = init_loop(dev, size, fstype, journal_size, inode_size,
851 mkfsoptions, reformat)
852 elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
853 mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
856 # panic("device:", dev,
857 # "not prepared, and autoformat is not set.\n",
858 # "Rerun with --reformat option to format ALL filesystems")
863 """lookup IP address for an interface"""
864 rc, out = run("/sbin/ifconfig", iface)
867 addr = string.split(out[1])[1]
868 ip = string.split(addr, ':')[1]
871 def sys_get_elan_position_file():
872 procfiles = ["/proc/elan/device0/position",
873 "/proc/qsnet/elan4/device0/position",
874 "/proc/qsnet/elan3/device0/position"]
876 if os.access(p, os.R_OK):
880 def sys_get_local_nid(net_type, wildcard, cluster_id):
881 """Return the local nid."""
883 if sys_get_elan_position_file():
884 local = sys_get_local_address('elan', '*', cluster_id)
886 local = sys_get_local_address(net_type, wildcard, cluster_id)
889 def sys_get_local_address(net_type, wildcard, cluster_id):
890 """Return the local address for the network type."""
892 if net_type in ('tcp',):
894 iface, star = string.split(wildcard, ':')
895 local = if2addr(iface)
897 panic ("unable to determine ip for:", wildcard)
899 host = socket.gethostname()
900 local = socket.gethostbyname(host)
901 elif net_type == 'elan':
902 # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()'
903 f = sys_get_elan_position_file()
905 panic ("unable to determine local Elan ID")
908 lines = fp.readlines()
916 nid = my_int(cluster_id) + my_int(elan_id)
918 except ValueError, e:
922 elif net_type == 'gm':
923 fixme("automatic local address for GM")
927 def mod_loaded(modname):
928 """Check if a module is already loaded. Look in /proc/modules for it."""
930 fp = open('/proc/modules')
931 lines = fp.readlines()
933 # please forgive my tired fingers for this one
934 ret = filter(lambda word, mod=modname: word == mod,
935 map(lambda line: string.split(line)[0], lines))
940 # XXX: instead of device_list, ask for $name and see what we get
941 def is_prepared(name):
942 """Return true if a device exists for the name"""
945 if (config.noexec or config.record) and config.cleanup:
948 # expect this format:
949 # 1 UP ldlm ldlm ldlm_UUID 2
950 out = lctl.device_list()
952 if name == string.split(s)[3]:
954 except CommandError, e:
958 def is_network_prepared():
959 """If the any device exists, then assume that all networking
960 has been configured"""
961 out = lctl.device_list()
964 def fs_is_mounted(path):
965 """Return true if path is a mounted lustre filesystem"""
967 fp = open('/proc/mounts')
968 lines = fp.readlines()
972 if a[1] == path and a[2] == 'lustre_lite':
980 """Manage kernel modules"""
def __init__(self, lustre_dir, portals_dir):
    """Remember the two module source trees and start with an empty load list."""
    self.lustre_dir = lustre_dir
    self.portals_dir = portals_dir
    # (src_dir, dev_dir, modname) tuples, loaded in append order.
    self.kmodule_list = []
def add_portals_module(self, dev_dir, modname):
    """Queue a portals-tree kernel module for loading."""
    entry = (self.portals_dir, dev_dir, modname)
    self.kmodule_list.append(entry)
def add_lustre_module(self, dev_dir, modname):
    """Queue a lustre-tree kernel module for loading."""
    entry = (self.lustre_dir, dev_dir, modname)
    self.kmodule_list.append(entry)
994 def load_module(self):
995 """Load all the modules in the list in the order they appear."""
996 for src_dir, dev_dir, mod in self.kmodule_list:
997 if mod_loaded(mod) and not config.noexec:
999 log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
1001 module = find_module(src_dir, dev_dir, mod)
1003 panic('module not found:', mod)
1004 (rc, out) = run('/sbin/insmod', module)
1006 raise CommandError('insmod', out, rc)
1008 (rc, out) = run('/sbin/modprobe', mod)
1010 raise CommandError('modprobe', out, rc)
1012 def cleanup_module(self):
1013 """Unload the modules in the list in reverse order."""
1014 rev = self.kmodule_list
1016 for src_dir, dev_dir, mod in rev:
1017 if not mod_loaded(mod) and not config.noexec:
1020 if mod == 'portals' and config.dump:
1021 lctl.dump(config.dump)
1022 log('unloading module:', mod)
1023 (rc, out) = run('/sbin/rmmod', mod)
1025 log('! unable to unload module:', mod)
1028 # ============================================================
1029 # Classes to prepare and cleanup the various objects
1032 """ Base class for the rest of the modules. The default cleanup method is
1033 defined here, as well as some utilitiy funcs.
1035 def __init__(self, module_name, db):
1037 self.module_name = module_name
1038 self.name = self.db.getName()
1039 self.uuid = self.db.getUUID()
1042 self.kmod = kmod(config.lustre, config.portals)
1044 def info(self, *args):
1045 msg = string.join(map(str,args))
1046 print self.module_name + ":", self.name, self.uuid, msg
1049 """ default cleanup, used for most modules """
1052 lctl.cleanup(self.name, self.uuid, config.force)
1053 except CommandError, e:
1054 log(self.module_name, "cleanup failed: ", self.name)
def add_portals_module(self, dev_dir, modname):
    """Queue a portals-tree module on this module's kmod loader."""
    self.kmod.add_portals_module(dev_dir, modname)
def add_lustre_module(self, dev_dir, modname):
    """Queue a lustre-tree module on this module's kmod loader."""
    self.kmod.add_lustre_module(dev_dir, modname)
def load_module(self):
    """Delegate to the kmod helper, which loads queued modules in order."""
    self.kmod.load_module()
def cleanup_module(self):
    """Unload queued modules (reverse order) only when it is safe to do so."""
    # Guard clause instead of nesting the delegation.
    if not self.safe_to_clean():
        return
    self.kmod.cleanup_module()
1075 def safe_to_clean(self):
def safe_to_clean_modules(self):
    """Module unload safety defaults to the same answer as device cleanup."""
    verdict = self.safe_to_clean()
    return verdict
1081 class Network(Module):
1082 def __init__(self,db):
1083 Module.__init__(self, 'NETWORK', db)
1084 self.net_type = self.db.get_val('nettype')
1085 self.nid = self.db.get_val('nid', '*')
1086 self.cluster_id = self.db.get_val('clusterid', "0")
1087 self.port = self.db.get_val_int('port', 0)
1088 self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
1089 self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
1090 self.irq_affinity = self.db.get_val_int('irqaffinity', 0)
1093 self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
1095 panic("unable to set nid for", self.net_type, self.nid, cluster_id)
1096 self.generic_nid = 1
1097 debug("nid:", self.nid)
1099 self.generic_nid = 0
1101 self.nid_uuid = self.nid_to_uuid(self.nid)
1103 self.hostaddr = self.db.get_val('hostaddr', self.nid)
1104 if '*' in self.hostaddr:
1105 self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id)
1106 if not self.hostaddr:
1107 panic("unable to set hostaddr for", self.net_type, self.hostaddr, self.cluster_id)
1108 debug("hostaddr:", self.hostaddr)
1110 self.add_portals_module("libcfs", 'libcfs')
1111 self.add_portals_module("portals", 'portals')
1112 if node_needs_router():
1113 self.add_portals_module("router", 'kptlrouter')
1114 if self.net_type == 'tcp':
1115 self.add_portals_module("knals/socknal", 'ksocknal')
1116 if self.net_type == 'elan':
1117 self.add_portals_module("knals/qswnal", 'kqswnal')
1118 if self.net_type == 'gm':
1119 self.add_portals_module("knals/gmnal", 'kgmnal')
def nid_to_uuid(self, nid):
    """Map a network id to the UUID string it is registered under with lctl."""
    fmt = "NID_%s_UUID"
    return fmt % (nid,)
1125 if is_network_prepared():
1127 self.info(self.net_type, self.nid, self.port)
1128 if not (config.record and self.generic_nid):
1129 lctl.network(self.net_type, self.nid)
1130 if self.net_type == 'tcp':
1132 if self.net_type == 'elan':
1134 if self.port and node_is_router():
1135 run_one_acceptor(self.port)
1136 self.connect_peer_gateways()
1138 def connect_peer_gateways(self):
1139 for router in self.db.lookup_class('node'):
1140 if router.get_val_int('router', 0):
1141 for netuuid in router.get_networks():
1142 net = self.db.lookup(netuuid)
1144 if (gw.cluster_id == self.cluster_id and
1145 gw.net_type == self.net_type):
1146 if gw.nid != self.nid:
1149 def disconnect_peer_gateways(self):
1150 for router in self.db.lookup_class('node'):
1151 if router.get_val_int('router', 0):
1152 for netuuid in router.get_networks():
1153 net = self.db.lookup(netuuid)
1155 if (gw.cluster_id == self.cluster_id and
1156 gw.net_type == self.net_type):
1157 if gw.nid != self.nid:
1160 except CommandError, e:
1161 print "disconnect failed: ", self.name
1165 def safe_to_clean(self):
1166 return not is_network_prepared()
1169 self.info(self.net_type, self.nid, self.port)
1171 stop_acceptor(self.port)
1172 if node_is_router():
1173 self.disconnect_peer_gateways()
1175 class RouteTable(Module):
1176 def __init__(self,db):
1177 Module.__init__(self, 'ROUTES', db)
1179 def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
1181 # only setup connections for tcp NALs
1183 if not net_type in ('tcp',):
1186 # connect to target if route is to single node and this node is the gw
1187 if lo == hi and local_interface(net_type, gw_cluster_id, gw):
1188 if not local_cluster(net_type, tgt_cluster_id):
1189 panic("target", lo, " not on the local cluster")
1190 srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
1191 # connect to gateway if this node is not the gw
1192 elif (local_cluster(net_type, gw_cluster_id)
1193 and not local_interface(net_type, gw_cluster_id, gw)):
1194 srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
1199 panic("no server for nid", lo)
1202 return Network(srvdb)
1205 if is_network_prepared():
1208 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1209 lctl.add_route(net_type, gw, lo, hi)
1210 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1214 def safe_to_clean(self):
1215 return not is_network_prepared()
1218 if is_network_prepared():
1219 # the network is still being used, don't clean it up
1221 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1222 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1225 lctl.disconnect(srv)
1226 except CommandError, e:
1227 print "disconnect failed: ", self.name
1232 lctl.del_route(net_type, gw, lo, hi)
1233 except CommandError, e:
1234 print "del_route failed: ", self.name
1238 class Management(Module):
1239 def __init__(self, db):
1240 Module.__init__(self, 'MGMT', db)
1241 self.add_lustre_module('lvfs', 'lvfs')
1242 self.add_lustre_module('obdclass', 'obdclass')
1243 self.add_lustre_module('ptlrpc', 'ptlrpc')
1244 self.add_lustre_module('mgmt', 'mgmt_svc')
1247 if is_prepared(self.name):
1250 lctl.newdev("mgmt", self.name, self.uuid)
1252 def safe_to_clean(self):
1256 if is_prepared(self.name):
1257 Module.cleanup(self)
1259 # This is only needed to load the modules; the LDLM device
1260 # is now created automatically.
1262 def __init__(self,db):
1263 Module.__init__(self, 'LDLM', db)
1264 self.add_lustre_module('lvfs', 'lvfs')
1265 self.add_lustre_module('obdclass', 'obdclass')
1266 self.add_lustre_module('ptlrpc', 'ptlrpc')
1275 def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
1276 Module.__init__(self, 'LOV', db)
1277 if name_override != None:
1278 self.name = "lov_%s" % name_override
1279 self.add_lustre_module('lov', 'lov')
1280 self.mds_uuid = self.db.get_first_ref('mds')
1281 self.stripe_sz = self.db.get_val_int('stripesize', 65536)
1282 self.stripe_off = self.db.get_val_int('stripeoffset', 0)
1283 self.pattern = self.db.get_val_int('stripepattern', 0)
1284 self.devlist = self.db.get_refs('obd')
1285 self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
1287 self.desc_uuid = self.uuid
1288 self.uuid = generate_client_uuid(self.name)
1289 self.fs_name = fs_name
1291 self.config_only = 1
1293 self.config_only = None
1294 mds= self.db.lookup(self.mds_uuid)
1295 self.mds_name = mds.getName()
1296 for obd_uuid in self.devlist:
1297 obd = self.db.lookup(obd_uuid)
1298 osc = get_osc(obd, self.uuid, fs_name)
1300 self.osclist.append(osc)
1302 panic('osc not found:', obd_uuid)
1305 if is_prepared(self.name):
1307 if self.config_only:
1308 panic("Can't prepare config_only LOV ", self.name)
1310 for osc in self.osclist:
1312 # Only ignore connect failures with --force, which
1313 # isn't implemented here yet.
1314 osc.prepare(ignore_connect_failure=0)
1315 except CommandError, e:
1316 print "Error preparing OSC %s\n" % osc.uuid
1318 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
1319 self.stripe_off, self.pattern, self.devlist, self.mds_name)
1320 lctl.lov_setup(self.name, self.uuid,
1321 self.desc_uuid, self.mds_name, self.stripe_cnt,
1322 self.stripe_sz, self.stripe_off, self.pattern,
1323 string.join(self.devlist))
1326 if is_prepared(self.name):
1327 Module.cleanup(self)
1328 if self.config_only:
1329 panic("Can't clean up config_only LOV ", self.name)
1330 for osc in self.osclist:
1333 def load_module(self):
1334 if self.config_only:
1335 panic("Can't load modules for config_only LOV ", self.name)
1336 for osc in self.osclist:
1339 Module.load_module(self)
1341 def cleanup_module(self):
1342 if self.config_only:
1343 panic("Can't cleanup modules for config_only LOV ", self.name)
1344 Module.cleanup_module(self)
1345 for osc in self.osclist:
1346 osc.cleanup_module()
1349 class MDSDEV(Module):
1350 def __init__(self,db):
1351 Module.__init__(self, 'MDSDEV', db)
1352 self.devpath = self.db.get_val('devpath','')
1353 self.size = self.db.get_val_int('devsize', 0)
1354 self.journal_size = self.db.get_val_int('journalsize', 0)
1355 self.fstype = self.db.get_val('fstype', '')
1356 self.nspath = self.db.get_val('nspath', '')
1357 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1358 # overwrite the original MDSDEV name and uuid with the MDS name and uuid
1359 target_uuid = self.db.get_first_ref('target')
1360 mds = self.db.lookup(target_uuid)
1361 self.name = mds.getName()
1362 self.filesystem_uuids = mds.get_refs('filesystem')
1363 # FIXME: if fstype not set, then determine based on kernel version
1364 self.format = self.db.get_val('autoformat', "no")
1365 if mds.get_val('failover', 0):
1366 self.failover_mds = 'f'
1368 self.failover_mds = 'n'
1369 active_uuid = get_active_target(mds)
1371 panic("No target device found:", target_uuid)
1372 if active_uuid == self.uuid:
1376 if self.active and config.group and config.group != mds.get_val('group'):
1379 self.inode_size = self.db.get_val_int('inodesize', 0)
1380 if self.inode_size == 0:
1381 # find the LOV for this MDS
1382 lovconfig_uuid = mds.get_first_ref('lovconfig')
1383 if not lovconfig_uuid:
1384 panic("No LOV config found for MDS ", mds.name)
1385 lovconfig = mds.lookup(lovconfig_uuid)
1386 lov_uuid = lovconfig.get_first_ref('lov')
1388 panic("No LOV found for lovconfig ", lovconfig.name)
1389 lov = LOV(self.db.lookup(lov_uuid), lov_uuid, 'FS_name', config_only = 1)
1391 # default stripe count controls default inode_size
1392 stripe_count = lov.stripe_cnt
1393 if stripe_count > 77:
1394 self.inode_size = 4096
1395 elif stripe_count > 35:
1396 self.inode_size = 2048
1397 elif stripe_count > 13:
1398 self.inode_size = 1024
1399 elif stripe_count > 3:
1400 self.inode_size = 512
1402 self.inode_size = 256
1404 self.target_dev_uuid = self.uuid
1405 self.uuid = target_uuid
1407 self.add_lustre_module('mdc', 'mdc')
1408 self.add_lustre_module('osc', 'osc')
1409 self.add_lustre_module('lov', 'lov')
1410 self.add_lustre_module('mds', 'mds')
1411 if self.fstype == 'ldiskfs':
1412 self.add_lustre_module('ldiskfs', 'ldiskfs')
1414 self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
1416 def load_module(self):
1418 Module.load_module(self)
1421 if is_prepared(self.name):
1424 debug(self.uuid, "not active")
1427 # run write_conf automatically, if --reformat used
1429 self.info(self.devpath, self.fstype, self.size, self.format)
1431 # never reformat here
1432 blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
1433 self.format, self.journal_size, self.inode_size,
1435 if not is_prepared('MDT'):
1436 lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
1438 lctl.newdev("mds", self.name, self.uuid,
1439 setup ="%s %s %s" %(blkdev, self.fstype, self.name))
1440 except CommandError, e:
1442 panic("MDS is missing the config log. Need to run " +
1443 "lconf --write_conf.")
1447 def write_conf(self):
1448 if is_prepared(self.name):
1450 self.info(self.devpath, self.fstype, self.format)
1451 blkdev = block_dev(self.devpath, self.size, self.fstype,
1452 config.reformat, self.format, self.journal_size,
1453 self.inode_size, self.mkfsoptions)
1454 lctl.newdev("mds", self.name, self.uuid,
1455 setup ="%s %s" %(blkdev, self.fstype))
1457 # record logs for the MDS lov
1458 for uuid in self.filesystem_uuids:
1459 log("recording clients for filesystem:", uuid)
1460 fs = self.db.lookup(uuid)
1461 obd_uuid = fs.get_first_ref('obd')
1462 client_uuid = generate_client_uuid(self.name)
1463 client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
1466 lctl.clear_log(self.name, self.name)
1467 lctl.record(self.name, self.name)
1469 lctl.mount_option(self.name, client.get_name(), "")
1473 lctl.clear_log(self.name, self.name + '-clean')
1474 lctl.record(self.name, self.name + '-clean')
1476 lctl.del_mount_option(self.name)
1481 # record logs for each client
1483 config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
1485 config_options = CONFIG_FILE
1487 for node_db in self.db.lookup_class('node'):
1488 client_name = node_db.getName()
1489 for prof_uuid in node_db.get_refs('profile'):
1490 prof_db = node_db.lookup(prof_uuid)
1491 # refactor this into a function to test "clientness"
1493 for ref_class, ref_uuid in prof_db.get_all_refs():
1494 if ref_class in ('mountpoint','echoclient'):
1495 debug("recording", client_name)
1496 old_noexec = config.noexec
1498 noexec_opt = ('', '-n')
1499 ret, out = run (sys.argv[0],
1500 noexec_opt[old_noexec == 1],
1501 " -v --record --nomod",
1502 "--record_log", client_name,
1503 "--record_device", self.name,
1504 "--node", client_name,
1507 for s in out: log("record> ", string.strip(s))
1508 ret, out = run (sys.argv[0],
1509 noexec_opt[old_noexec == 1],
1510 "--cleanup -v --record --nomod",
1511 "--record_log", client_name + "-clean",
1512 "--record_device", self.name,
1513 "--node", client_name,
1516 for s in out: log("record> ", string.strip(s))
1517 config.noexec = old_noexec
1519 lctl.cleanup(self.name, self.uuid, 0, 0)
1520 except CommandError, e:
1521 log(self.module_name, "cleanup failed: ", self.name)
1524 Module.cleanup(self)
1525 clean_loop(self.devpath)
1527 def msd_remaining(self):
1528 out = lctl.device_list()
1530 if string.split(s)[2] in ('mds',):
1533 def safe_to_clean(self):
1536 def safe_to_clean_modules(self):
1537 return not self.msd_remaining()
1541 debug(self.uuid, "not active")
1544 if is_prepared(self.name):
1546 lctl.cleanup(self.name, self.uuid, config.force,
1548 except CommandError, e:
1549 log(self.module_name, "cleanup failed: ", self.name)
1552 Module.cleanup(self)
1553 if not self.msd_remaining() and is_prepared('MDT'):
1555 lctl.cleanup("MDT", "MDT_UUID", config.force,
1557 except CommandError, e:
1558 print "cleanup failed: ", self.name
1561 clean_loop(self.devpath)
1564 def __init__(self, db):
1565 Module.__init__(self, 'OSD', db)
1566 self.osdtype = self.db.get_val('osdtype')
1567 self.devpath = self.db.get_val('devpath', '')
1568 self.size = self.db.get_val_int('devsize', 0)
1569 self.journal_size = self.db.get_val_int('journalsize', 0)
1570 self.inode_size = self.db.get_val_int('inodesize', 0)
1571 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1572 self.fstype = self.db.get_val('fstype', '')
1573 self.nspath = self.db.get_val('nspath', '')
1574 target_uuid = self.db.get_first_ref('target')
1575 ost = self.db.lookup(target_uuid)
1576 self.name = ost.getName()
1577 self.format = self.db.get_val('autoformat', 'yes')
1578 if ost.get_val('failover', 0):
1579 self.failover_ost = 'f'
1581 self.failover_ost = 'n'
1583 active_uuid = get_active_target(ost)
1585 panic("No target device found:", target_uuid)
1586 if active_uuid == self.uuid:
1590 if self.active and config.group and config.group != ost.get_val('group'):
1593 self.target_dev_uuid = self.uuid
1594 self.uuid = target_uuid
1596 self.add_lustre_module('ost', 'ost')
1597 # FIXME: should we default to ext3 here?
1598 if self.fstype == 'ldiskfs':
1599 self.add_lustre_module('ldiskfs', 'ldiskfs')
1601 self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
1602 self.add_lustre_module(self.osdtype, self.osdtype)
1604 def load_module(self):
1606 Module.load_module(self)
1608 # need to check /proc/mounts and /etc/mtab before
1609 # formatting anything.
1610 # FIXME: check if device is already formatted.
1612 if is_prepared(self.name):
1615 debug(self.uuid, "not active")
1617 self.info(self.osdtype, self.devpath, self.size, self.fstype,
1618 self.format, self.journal_size, self.inode_size)
1620 if self.osdtype == 'obdecho':
1623 blkdev = block_dev(self.devpath, self.size, self.fstype,
1624 config.reformat, self.format, self.journal_size,
1625 self.inode_size, self.mkfsoptions)
1626 lctl.newdev(self.osdtype, self.name, self.uuid,
1627 setup ="%s %s %s" %(blkdev, self.fstype,
1629 if not is_prepared('OSS'):
1630 lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
1632 def osd_remaining(self):
1633 out = lctl.device_list()
1635 if string.split(s)[2] in ('obdfilter', 'obdecho'):
1638 def safe_to_clean(self):
1641 def safe_to_clean_modules(self):
1642 return not self.osd_remaining()
1646 debug(self.uuid, "not active")
1648 if is_prepared(self.name):
1651 lctl.cleanup(self.name, self.uuid, config.force,
1653 except CommandError, e:
1654 log(self.module_name, "cleanup failed: ", self.name)
1657 if not self.osd_remaining() and is_prepared('OSS'):
1659 lctl.cleanup("OSS", "OSS_UUID", config.force,
1661 except CommandError, e:
1662 print "cleanup failed: ", self.name
1665 if not self.osdtype == 'obdecho':
1666 clean_loop(self.devpath)
1668 def mgmt_uuid_for_fs(mtpt_name):
1671 mtpt_db = toplevel.lookup_name(mtpt_name)
1672 fs_uuid = mtpt_db.get_first_ref('filesystem')
1673 fs = toplevel.lookup(fs_uuid)
1676 return fs.get_first_ref('mgmt')
1678 # Generic client module, used by OSC and MDC
1679 class Client(Module):
1680 def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
1682 self.target_name = tgtdb.getName()
1683 self.target_uuid = tgtdb.getUUID()
1686 self.tgt_dev_uuid = get_active_target(tgtdb)
1687 if not self.tgt_dev_uuid:
1688 panic("No target device found for target:", self.target_name)
1690 self.kmod = kmod(config.lustre, config.portals)
1694 self.module = module
1695 self.module_name = string.upper(module)
1697 self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
1698 self.target_name, fs_name)
1700 self.name = self_name
1702 self.lookup_server(self.tgt_dev_uuid)
1703 mgmt_uuid = mgmt_uuid_for_fs(fs_name)
1705 self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
1708 self.fs_name = fs_name
1711 self.add_lustre_module(module_dir, module)
1713 def lookup_server(self, srv_uuid):
1714 """ Lookup a server's network information """
1715 self._server_nets = get_ost_net(self.db, srv_uuid)
1716 if len(self._server_nets) == 0:
1717 panic ("Unable to find a server for:", srv_uuid)
1719 def get_servers(self):
1720 return self._server_nets
1722 def prepare(self, ignore_connect_failure = 0):
1723 self.info(self.target_uuid)
1724 if is_prepared(self.name):
1727 srv = choose_local_server(self.get_servers())
1731 routes = find_route(self.get_servers())
1732 if len(routes) == 0:
1733 panic ("no route to", self.target_uuid)
1734 for (srv, r) in routes:
1735 lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
1736 except CommandError, e:
1737 if not ignore_connect_failure:
1740 if self.target_uuid in config.inactive and self.permits_inactive():
1741 debug("%s inactive" % self.target_uuid)
1742 inactive_p = "inactive"
1744 debug("%s active" % self.target_uuid)
1746 lctl.newdev(self.module, self.name, self.uuid,
1747 setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
1748 inactive_p, self.mgmt_name))
1751 if is_prepared(self.name):
1752 Module.cleanup(self)
1754 srv = choose_local_server(self.get_servers())
1756 lctl.disconnect(srv)
1758 for (srv, r) in find_route(self.get_servers()):
1759 lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
1760 except CommandError, e:
1761 log(self.module_name, "cleanup failed: ", self.name)
1767 def __init__(self, db, uuid, fs_name):
1768 Client.__init__(self, db, uuid, 'mdc', fs_name)
1770 def permits_inactive(self):
1774 def __init__(self, db, uuid, fs_name):
1775 Client.__init__(self, db, uuid, 'osc', fs_name)
1777 def permits_inactive(self):
def mgmtcli_name_for_uuid(uuid):
    """Return the canonical device name for the management client
    that talks to the management target identified by *uuid*."""
    return 'MGMTCLI_' + str(uuid)
class ManagementClient(Client):
    """Client device connected to the management service (mgmt_cli module)."""
    def __init__(self, db, uuid):
        # Derive the device name from the management target's uuid so the
        # same name can be recomputed later (e.g. at cleanup time).
        self_name = mgmtcli_name_for_uuid(db.getUUID())
        Client.__init__(self, db, uuid, 'mgmt_cli', '',
                        self_name = self_name, module_dir = 'mgmt')
1790 def __init__(self, db):
1791 Module.__init__(self, 'COBD', db)
1792 self.real_uuid = self.db.get_first_ref('realobd')
1793 self.cache_uuid = self.db.get_first_ref('cacheobd')
1794 self.add_lustre_module('cobd' , 'cobd')
1796 # need to check /proc/mounts and /etc/mtab before
1797 # formatting anything.
1798 # FIXME: check if device is already formatted.
1800 if is_prepared(self.name):
1802 self.info(self.real_uuid, self.cache_uuid)
1803 lctl.newdev("cobd", self.name, self.uuid,
1804 setup ="%s %s" %(self.real_uuid, self.cache_uuid))
1807 # virtual interface for OSC and LOV
1809 def __init__(self, db, uuid, fs_name, name_override = None):
1810 Module.__init__(self, 'VOSC', db)
1811 if db.get_class() == 'lov':
1812 self.osc = LOV(db, uuid, fs_name, name_override)
1814 self.osc = get_osc(db, uuid, fs_name)
1816 return self.osc.uuid
1818 return self.osc.name
1823 def load_module(self):
1824 self.osc.load_module()
1825 def cleanup_module(self):
1826 self.osc.cleanup_module()
1829 class ECHO_CLIENT(Module):
1830 def __init__(self,db):
1831 Module.__init__(self, 'ECHO_CLIENT', db)
1832 self.add_lustre_module('obdecho', 'obdecho')
1833 self.obd_uuid = self.db.get_first_ref('obd')
1834 obd = self.db.lookup(self.obd_uuid)
1835 self.uuid = generate_client_uuid(self.name)
1836 self.osc = VOSC(obd, self.uuid, self.name)
1839 if is_prepared(self.name):
1842 self.osc.prepare() # XXX This is so cheating. -p
1843 self.info(self.obd_uuid)
1845 lctl.newdev("echo_client", self.name, self.uuid,
1846 setup = self.osc.get_name())
1849 if is_prepared(self.name):
1850 Module.cleanup(self)
1853 def load_module(self):
1854 self.osc.load_module()
1855 Module.load_module(self)
1857 def cleanup_module(self):
1858 Module.cleanup_module(self)
1859 self.osc.cleanup_module()
1862 def generate_client_uuid(name):
1863 client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
1865 int(random.random() * 1048576),
1866 int(random.random() * 1048576))
1867 return client_uuid[:36]
1870 class Mountpoint(Module):
1871 def __init__(self,db):
1872 Module.__init__(self, 'MTPT', db)
1873 self.path = self.db.get_val('path')
1874 self.fs_uuid = self.db.get_first_ref('filesystem')
1875 fs = self.db.lookup(self.fs_uuid)
1876 self.mds_uuid = fs.get_first_ref('mds')
1877 self.obd_uuid = fs.get_first_ref('obd')
1878 self.mgmt_uuid = fs.get_first_ref('mgmt')
1879 obd = self.db.lookup(self.obd_uuid)
1880 client_uuid = generate_client_uuid(self.name)
1881 self.vosc = VOSC(obd, client_uuid, self.name)
1882 self.mdc = get_mdc(db, client_uuid, self.name, self.mds_uuid)
1884 self.add_lustre_module('mdc', 'mdc')
1885 self.add_lustre_module('llite', 'llite')
1887 self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
1893 if fs_is_mounted(self.path):
1894 log(self.path, "already mounted.")
1898 self.mgmtcli.prepare()
1901 mdc_name = self.mdc.name
1903 self.info(self.path, self.mds_uuid, self.obd_uuid)
1904 if config.record or config.lctl_dump:
1905 lctl.mount_option(local_node_name, self.vosc.get_name(), mdc_name)
1907 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s %s %s" % \
1908 (self.vosc.get_name(), mdc_name, config.config, self.path)
1909 run("mkdir", self.path)
1914 panic("mount failed:", self.path, ":", string.join(val))
1917 self.info(self.path, self.mds_uuid,self.obd_uuid)
1919 if config.record or config.lctl_dump:
1920 lctl.del_mount_option(local_node_name)
1922 if fs_is_mounted(self.path):
1924 (rc, out) = run("umount", "-f", self.path)
1926 (rc, out) = run("umount", self.path)
1928 raise CommandError('umount', out, rc)
1930 if fs_is_mounted(self.path):
1931 panic("fs is still mounted:", self.path)
1936 self.mgmtcli.cleanup()
1938 def load_module(self):
1940 self.mgmtcli.load_module()
1941 self.vosc.load_module()
1942 Module.load_module(self)
1944 def cleanup_module(self):
1945 Module.cleanup_module(self)
1946 self.vosc.cleanup_module()
1948 self.mgmtcli.cleanup_module()
1951 # ============================================================
1952 # misc query functions
1954 def get_ost_net(self, osd_uuid):
1958 osd = self.lookup(osd_uuid)
1959 node_uuid = osd.get_first_ref('node')
1960 node = self.lookup(node_uuid)
1962 panic("unable to find node for osd_uuid:", osd_uuid,
1963 " node_ref:", node_uuid)
1964 for net_uuid in node.get_networks():
1965 db = node.lookup(net_uuid)
1966 srv_list.append(Network(db))
1970 # the order of initialization is based on level.
1971 def getServiceLevel(self):
1972 type = self.get_class()
1974 if type in ('network',):
1976 elif type in ('routetbl',):
1978 elif type in ('ldlm',):
1980 elif type in ('mgmt',):
1982 elif type in ('osd', 'cobd'):
1984 elif type in ('mdsdev',):
1986 elif type in ('mountpoint', 'echoclient'):
1989 panic("Unknown type: ", type)
1991 if ret < config.minlevel or ret > config.maxlevel:
1996 # return list of services in a profile. list is a list of tuples
1997 # [(level, db_object),]
1998 def getServices(self):
2000 for ref_class, ref_uuid in self.get_all_refs():
2001 servdb = self.lookup(ref_uuid)
2003 level = getServiceLevel(servdb)
2005 list.append((level, servdb))
2007 panic('service not found: ' + ref_uuid)
2013 ############################################################
2015 # FIXME: clean this mess up!
2017 # OSC is no longer in the xml, so we have to fake it.
2018 # this is getting ugly and begging for another refactoring
2019 def get_osc(ost_db, uuid, fs_name):
2020 osc = OSC(ost_db, uuid, fs_name)
2023 def get_mdc(db, uuid, fs_name, mds_uuid):
2024 mds_db = db.lookup(mds_uuid);
2026 panic("no mds:", mds_uuid)
2027 mdc = MDC(mds_db, uuid, fs_name)
2030 ############################################################
2031 # routing ("rooting")
2033 # list of (nettype, cluster_id, nid)
2036 def find_local_clusters(node_db):
2037 global local_clusters
2038 for netuuid in node_db.get_networks():
2039 net = node_db.lookup(netuuid)
2041 debug("add_local", netuuid)
2042 local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
2044 if acceptors.has_key(srv.port):
2045 panic("duplicate port:", srv.port)
2046 acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
2047 srv.send_mem, srv.recv_mem,
2050 # This node is a gateway.
2052 def node_is_router():
2055 # If there are any routers found in the config, then this will be true
2056 # and all nodes will load kptlrouter.
def node_needs_router():
    # kptlrouter is needed either when any router exists in the config
    # (needs_router) or when this node is itself a router (is_router).
    if needs_router:
        return needs_router
    return is_router
2061 # list of (nettype, gw, tgt_cluster_id, lo, hi)
2062 # Currently, these local routes are only added to kptlrouter route
2063 # table if they are needed to connect to a specific server. This
2064 # should be changed so all available routes are loaded, and the
2065 # ptlrouter can make all the decisions.
2068 def find_local_routes(lustre):
2069 """ Scan the lustre config looking for routers . Build list of
2071 global local_routes, needs_router
2073 list = lustre.lookup_class('node')
2075 if router.get_val_int('router', 0):
2077 for (local_type, local_cluster_id, local_nid) in local_clusters:
2079 for netuuid in router.get_networks():
2080 db = router.lookup(netuuid)
2081 if (local_type == db.get_val('nettype') and
2082 local_cluster_id == db.get_val('clusterid')):
2083 gw = db.get_val('nid')
2086 debug("find_local_routes: gw is", gw)
2087 for route in router.get_local_routes(local_type, gw):
2088 local_routes.append(route)
2089 debug("find_local_routes:", local_routes)
2092 def choose_local_server(srv_list):
2093 for srv in srv_list:
2094 if local_cluster(srv.net_type, srv.cluster_id):
2097 def local_cluster(net_type, cluster_id):
2098 for cluster in local_clusters:
2099 if net_type == cluster[0] and cluster_id == cluster[1]:
2103 def local_interface(net_type, cluster_id, nid):
2104 for cluster in local_clusters:
2105 if (net_type == cluster[0] and cluster_id == cluster[1]
2106 and nid == cluster[2]):
2110 def find_route(srv_list):
2112 frm_type = local_clusters[0][0]
2113 for srv in srv_list:
2114 debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
2115 to_type = srv.net_type
2117 cluster_id = srv.cluster_id
2118 debug ('looking for route to', to_type, to)
2119 for r in local_routes:
2120 debug("find_route: ", r)
2121 if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
2122 result.append((srv, r))
2125 def get_active_target(db):
2126 target_uuid = db.getUUID()
2127 target_name = db.getName()
2128 node_name = get_select(target_name)
2130 tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
2132 tgt_dev_uuid = db.get_first_ref('active')
2135 def get_server_by_nid_uuid(db, nid_uuid):
2136 for n in db.lookup_class("network"):
2138 if net.nid_uuid == nid_uuid:
2142 ############################################################
2146 type = db.get_class()
2147 debug('Service:', type, db.getName(), db.getUUID())
2152 n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
2153 elif type == 'network':
2155 elif type == 'routetbl':
2159 elif type == 'cobd':
2161 elif type == 'mdsdev':
2163 elif type == 'mountpoint':
2165 elif type == 'echoclient':
2167 elif type == 'mgmt':
2170 panic ("unknown service type:", type)
2174 # Prepare the system to run lustre using a particular profile
2175 # in the configuration.
2176 # * load & the modules
2177 # * setup networking for the current node
2178 # * make sure partitions are in place and prepared
2179 # * initialize devices with lctl
2180 # Levels is important, and needs to be enforced.
2181 def for_each_profile(db, prof_list, operation):
2182 for prof_uuid in prof_list:
2183 prof_db = db.lookup(prof_uuid)
2185 panic("profile:", profile, "not found.")
2186 services = getServices(prof_db)
2189 def doWriteconf(services):
2193 if s[1].get_class() == 'mdsdev':
2194 n = newService(s[1])
2197 def doSetup(services):
2201 n = newService(s[1])
2204 def doModules(services):
2208 n = newService(s[1])
2211 def doCleanup(services):
2216 n = newService(s[1])
2217 if n.safe_to_clean():
2220 def doUnloadModules(services):
2225 n = newService(s[1])
2226 if n.safe_to_clean_modules():
2231 def doHost(lustreDB, hosts):
2232 global is_router, local_node_name
2235 node_db = lustreDB.lookup_name(h, 'node')
2239 panic('No host entry found.')
2241 local_node_name = node_db.get_val('name', 0)
2242 is_router = node_db.get_val_int('router', 0)
2243 lustre_upcall = node_db.get_val('lustreUpcall', '')
2244 portals_upcall = node_db.get_val('portalsUpcall', '')
2245 timeout = node_db.get_val_int('timeout', 0)
2246 ptldebug = node_db.get_val('ptldebug', '')
2247 subsystem = node_db.get_val('subsystem', '')
2249 find_local_clusters(node_db)
2251 find_local_routes(lustreDB)
2253 # Two step process: (1) load modules, (2) setup lustre
2254 # if not cleaning, load modules first.
2255 prof_list = node_db.get_refs('profile')
2257 if config.write_conf:
2258 for_each_profile(node_db, prof_list, doModules)
2260 for_each_profile(node_db, prof_list, doWriteconf)
2261 for_each_profile(node_db, prof_list, doUnloadModules)
2263 elif config.recover:
2264 if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
2265 raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
2266 "--client_uuid <UUID> --conn_uuid <UUID>")
2267 doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
2269 elif config.cleanup:
2271 # the command line can override this value
2273 # ugly hack, only need to run lctl commands for --dump
2274 if config.lctl_dump or config.record:
2275 for_each_profile(node_db, prof_list, doCleanup)
2278 sys_set_timeout(timeout)
2279 sys_set_ptldebug(ptldebug)
2280 sys_set_subsystem(subsystem)
2281 sys_set_lustre_upcall(lustre_upcall)
2282 sys_set_portals_upcall(portals_upcall)
2284 for_each_profile(node_db, prof_list, doCleanup)
2285 for_each_profile(node_db, prof_list, doUnloadModules)
2288 # ugly hack, only need to run lctl commands for --dump
2289 if config.lctl_dump or config.record:
2290 sys_set_timeout(timeout)
2291 sys_set_lustre_upcall(lustre_upcall)
2292 for_each_profile(node_db, prof_list, doSetup)
2296 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
2297 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
2299 for_each_profile(node_db, prof_list, doModules)
2301 sys_set_debug_path()
2302 sys_set_ptldebug(ptldebug)
2303 sys_set_subsystem(subsystem)
2304 script = config.gdb_script
2305 run(lctl.lctl, ' modules >', script)
2307 log ("The GDB module script is in", script)
2308 # pause, so user has time to break and
2311 sys_set_timeout(timeout)
2312 sys_set_lustre_upcall(lustre_upcall)
2313 sys_set_portals_upcall(portals_upcall)
2315 for_each_profile(node_db, prof_list, doSetup)
2317 def doRecovery(db, lctl, tgt_uuid, client_uuid, nid_uuid):
2318 tgt = db.lookup(tgt_uuid)
2320 raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
2321 new_uuid = get_active_target(tgt)
2323 raise Lustre.LconfError("doRecovery: no active target found for: " +
2325 net = choose_local_server(get_ost_net(db, new_uuid))
2327 raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
2329 log("Reconnecting", tgt_uuid, " to ", net.nid_uuid);
2331 oldnet = get_server_by_nid_uuid(db, nid_uuid)
2333 lctl.disconnect(oldnet)
2334 except CommandError, e:
2335 log("recover: disconnect", nid_uuid, "failed: ")
2340 except CommandError, e:
2341 log("recover: connect failed")
2344 lctl.recover(client_uuid, net.nid_uuid)
2347 def setupModulePath(cmd, portals_dir = PORTALS_DIR):
2348 base = os.path.dirname(cmd)
2349 if development_mode():
2350 if not config.lustre:
2351 debug('using objdir module paths')
2352 config.lustre = (os.path.join(base, ".."))
2353 # normalize the portals dir, using command line arg if set
2355 portals_dir = config.portals
2356 dir = os.path.join(config.lustre, portals_dir)
2357 config.portals = dir
2358 debug('config.portals', config.portals)
2359 elif config.lustre and config.portals:
2361 # if --lustre and --portals, normalize portals
2362 # can ignore PORTALS_DIR here, since it is probably useless here
2363 config.portals = os.path.join(config.lustre, config.portals)
2364 debug('config.portals B', config.portals)
2366 def sysctl(path, val):
2367 debug("+ sysctl", path, val)
2371 fp = open(os.path.join('/proc/sys', path), 'w')
2378 def sys_set_debug_path():
2379 sysctl('portals/debug_path', config.debug_path)
2381 def sys_set_lustre_upcall(upcall):
2382 # the command overrides the value in the node config
2383 if config.lustre_upcall:
2384 upcall = config.lustre_upcall
2386 upcall = config.upcall
2388 lctl.set_lustre_upcall(upcall)
2390 def sys_set_portals_upcall(upcall):
2391 # the command overrides the value in the node config
2392 if config.portals_upcall:
2393 upcall = config.portals_upcall
2395 upcall = config.upcall
2397 sysctl('portals/upcall', upcall)
def sys_set_timeout(timeout):
    """Set the lustre recovery timeout via lctl.

    A positive --timeout on the command line overrides the value taken
    from the node configuration.  Nothing is done when the resulting
    timeout is unset or non-positive.
    """
    cmdline = config.timeout
    if cmdline and cmdline > 0:
        timeout = cmdline
    if timeout is not None and timeout > 0:
        lctl.set_timeout(timeout)
def sys_tweak_socknal():
    """Apply socknal tuning requested on the command line.

    --single_socket asks for one socket per peer instead of a bundle,
    which is done by disabling socknal's "typed" connections.
    """
    if not config.single_socket:
        return
    sysctl("socknal/typed", 0)
2410 def sys_optimize_elan ():
2411 procfiles = ["/proc/elan/config/eventint_punt_loops",
2412 "/proc/qsnet/elan3/config/eventint_punt_loops",
2413 "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]
2415 if os.access(p, os.R_OK):
2416 run ("echo 0 > " + p)
2418 def sys_set_ptldebug(ptldebug):
2420 ptldebug = config.ptldebug
2423 val = eval(ptldebug, ptldebug_names)
2424 val = "0x%x" % (val)
2425 sysctl('portals/debug', val)
2426 except NameError, e:
2429 def sys_set_subsystem(subsystem):
2430 if config.subsystem:
2431 subsystem = config.subsystem
2434 val = eval(subsystem, subsystem_names)
2435 val = "0x%x" % (val)
2436 sysctl('portals/subsystem_debug', val)
2437 except NameError, e:
2440 def sys_set_netmem_max(path, max):
2441 debug("setting", path, "to at least", max)
2449 fp = open(path, 'w')
2450 fp.write('%d\n' %(max))
def sys_make_devices():
    """Create the lustre character device nodes (major 10) if missing."""
    device_cmds = (('/dev/portals', 'mknod /dev/portals c 10 240'),
                   ('/dev/obd',     'mknod /dev/obd c 10 241'))
    for node, mknod_cmd in device_cmds:
        # os.R_OK probe doubles as an existence check for the node.
        if not os.access(node, os.R_OK):
            run(mknod_cmd)
2461 # Add dir to the global PATH, if not already there.
2462 def add_to_path(new_dir):
2463 syspath = string.split(os.environ['PATH'], ':')
2464 if new_dir in syspath:
2466 os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
2468 def default_debug_path():
2469 path = '/tmp/lustre-log'
2470 if os.path.isdir('/r'):
2475 def default_gdb_script():
2476 script = '/tmp/ogdb'
2477 if os.path.isdir('/r'):
2478 return '/r' + script
2483 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
2484 # ensure basic elements are in the system path
2485 def sanitise_path():
2486 for dir in DEFAULT_PATH:
2489 # global hack for the --select handling
2491 def init_select(args):
2492 # args = [service=nodeA,service2=nodeB service3=nodeC]
2495 list = string.split(arg, ',')
2497 srv, node = string.split(entry, '=')
2498 tgt_select[srv] = node
2500 def get_select(srv):
2501 if tgt_select.has_key(srv):
2502 return tgt_select[srv]
2506 FLAG = Lustre.Options.FLAG
2507 PARAM = Lustre.Options.PARAM
2508 INTPARAM = Lustre.Options.INTPARAM
2509 PARAMLIST = Lustre.Options.PARAMLIST
2511 ('verbose,v', "Print system commands as they are run"),
2512 ('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM),
2513 ('config', "Cluster config name used for LDAP query", PARAM),
2514 ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
2515 ('node', "Load config for <nodename>", PARAM),
2516 ('cleanup,d', "Cleans up config. (Shutdown)"),
2517 ('force,f', "Forced unmounting and/or obd detach during cleanup",
2519 ('single_socket', "socknal option: only use one socket instead of bundle",
2521 ('failover',"""Used to shut down without saving state.
2522 This will allow this node to "give up" a service to a
2523 another node for failover purposes. This will not
2524 be a clean shutdown.""",
2526 ('gdb', """Prints message after creating gdb module script
2527 and sleeps for 5 seconds."""),
2528 ('noexec,n', """Prints the commands and steps that will be run for a
2529 config without executing them. This can used to check if a
2530 config file is doing what it should be doing"""),
2531 ('nomod', "Skip load/unload module step."),
2532 ('nosetup', "Skip device setup/cleanup step."),
2533 ('reformat', "Reformat all devices (without question)"),
2534 ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
2535 ('dump', "Dump the kernel debug log to file before portals is unloaded",
2537 ('write_conf', "Save all the client config information on mds."),
2538 ('record', "Write config information on mds."),
2539 ('record_log', "Name of config record log.", PARAM),
2540 ('record_device', "MDS device name that will record the config commands",
2542 ('minlevel', "Minimum level of services to configure/cleanup",
2544 ('maxlevel', """Maximum level of services to configure/cleanup
2545 Levels are aproximatly like:
2550 70 - mountpoint, echo_client, osc, mdc, lov""",
2552 ('lustre', """Base directory of lustre sources. This parameter will
2553 cause lconf to load modules from a source tree.""", PARAM),
2554 ('portals', """Portals source directory. If this is a relative path,
2555 then it is assumed to be relative to lustre. """, PARAM),
2556 ('timeout', "Set recovery timeout", INTPARAM),
2557 ('upcall', "Set both portals and lustre upcall script", PARAM),
2558 ('lustre_upcall', "Set lustre upcall script", PARAM),
2559 ('portals_upcall', "Set portals upcall script", PARAM),
2560 ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
2561 ('ptldebug', "Set the portals debug level", PARAM),
2562 ('subsystem', "Set the portals debug subsystem", PARAM),
2563 ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
2564 ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
2565 # Client recovery options
2566 ('recover', "Recover a device"),
2567 ('group', "The group of devices to configure or cleanup", PARAM),
2568 ('tgt_uuid', "The failed target (required for recovery)", PARAM),
2569 ('client_uuid', "The failed client (required for recovery)", PARAM),
2570 ('conn_uuid', "The failed connection (required for recovery)", PARAM),
2572 ('inactive', """The name of an inactive service, to be ignored during
2573 mounting (currently OST-only). Can be repeated.""",
2578 global lctl, config, toplevel, CONFIG_FILE
2580 # in the upcall this is set to SIG_IGN
2581 signal.signal(signal.SIGCHLD, signal.SIG_DFL)
2583 cl = Lustre.Options("lconf", "config.xml", lconf_options)
2585 config, args = cl.parse(sys.argv[1:])
2586 except Lustre.OptionError, e:
2590 setupModulePath(sys.argv[0])
2592 host = socket.gethostname()
2594 # the PRNG is normally seeded with time(), which is not so good for starting
2595 # time-synchronized clusters
2596 input = open('/dev/urandom', 'r')
2598 print 'Unable to open /dev/urandom!'
2600 seed = input.read(32)
2606 init_select(config.select)
2609 # allow config to be fetched via HTTP, but only with python2
2610 if sys.version[0] != '1' and args[0].startswith('http://'):
2613 config_file = urllib2.urlopen(args[0])
2614 except (urllib2.URLError, socket.error), err:
2615 if hasattr(err, 'args'):
2617 print "Could not access '%s': %s" %(args[0], err)
2619 elif not os.access(args[0], os.R_OK):
2620 print 'File not found or readable:', args[0]
2624 config_file = open(args[0], 'r')
2626 dom = xml.dom.minidom.parse(config_file)
2628 panic("%s does not appear to be a config file." % (args[0]))
2629 sys.exit(1) # make sure to die here, even in debug mode.
2630 CONFIG_FILE = args[0]
2631 db = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
2632 if not config.config:
2633 config.config = os.path.basename(args[0])# use full path?
2634 if config.config[-4:] == '.xml':
2635 config.config = config.config[:-4]
2636 elif config.ldapurl:
2637 if not config.config:
2638 panic("--ldapurl requires --config name")
2639 dn = "config=%s,fs=lustre" % (config.config)
2640 db = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
2641 elif config.ptldebug or config.subsystem:
2642 sys_set_ptldebug(None)
2643 sys_set_subsystem(None)
2646 print 'Missing config file or ldap URL.'
2647 print 'see lconf --help for command summary'
2652 ver = db.get_version()
2654 panic("No version found in config data, please recreate.")
2655 if ver != Lustre.CONFIG_VERSION:
2656 panic("Config version", ver, "does not match lconf version",
2657 Lustre.CONFIG_VERSION)
2661 node_list.append(config.node)
2664 node_list.append(host)
2665 node_list.append('localhost')
2667 debug("configuring for host: ", node_list)
2670 config.debug_path = config.debug_path + '-' + host
2671 config.gdb_script = config.gdb_script + '-' + host
2673 lctl = LCTLInterface('lctl')
2675 if config.lctl_dump:
2676 lctl.use_save_file(config.lctl_dump)
2679 if not (config.record_device and config.record_log):
2680 panic("When recording, both --record_log and --record_device must be specified.")
2681 lctl.clear_log(config.record_device, config.record_log)
2682 lctl.record(config.record_device, config.record_log)
2684 doHost(db, node_list)
2689 if __name__ == "__main__":
2692 except Lustre.LconfError, e:
2694 # traceback.print_exc(file=sys.stdout)
2696 except CommandError, e:
2700 if first_cleanup_error:
2701 sys.exit(first_cleanup_error)