3 # Copyright (C) 2002-2003 Cluster File Systems, Inc.
4 # Authors: Robert Read <rread@clusterfs.com>
5 # Mike Shaver <shaver@clusterfs.com>
6 # This file is part of Lustre, http://www.lustre.org.
8 # Lustre is free software; you can redistribute it and/or
9 # modify it under the terms of version 2 of the GNU General Public
10 # License as published by the Free Software Foundation.
12 # Lustre is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU General Public License for more details.
17 # You should have received a copy of the GNU General Public License
18 # along with Lustre; if not, write to the Free Software
19 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 # lconf - lustre configuration tool
23 # lconf is the main driver script for starting and stopping
24 # lustre filesystem services.
26 # Based in part on the XML obdctl modifications done by Brian Behlendorf
28 import sys, getopt, types
29 import string, os, stat, popen2, socket, time, random, fcntl, select
30 import re, exceptions, signal, traceback
31 import xml.dom.minidom
33 if sys.version[0] == '1':
34 from FCNTL import F_GETFL, F_SETFL
36 from fcntl import F_GETFL, F_SETFL
PYMOD_DIR = "/usr/lib/lustre/python"

def development_mode():
    """Return 1 when lconf is being run out of a source checkout (a
    Makefile sits next to the script), 0 for an installed copy.

    Used by the sys.path setup below to decide whether python modules
    come from the source tree or from PYMOD_DIR.
    """
    base = os.path.dirname(sys.argv[0])
    # NOTE(review): the return statements were lost in the truncated
    # listing; historical lconf returns 1 on a readable Makefile, else 0.
    if os.access(base + "/Makefile", os.R_OK):
        return 1
    return 0
# Prefer the in-tree utilities when running from a source checkout,
# otherwise fall back to the installed python module directory.
if development_mode():
    sys.path.append('../utils')
else:
    # NOTE(review): this 'else' was lost in the truncated listing; without
    # it PYMOD_DIR would be appended unconditionally.
    sys.path.append(PYMOD_DIR)
# Default TCP socket buffer size for acceptors/NALs (8 MiB).
55 DEFAULT_TCPBUF = 8388608
58 # Maximum number of devices to search for.
59 # (the /dev/loop* nodes need to be created beforehand)
60 MAX_LOOP_DEVICES = 256
61 PORTALS_DIR = 'portals'
63 # Needed to call lconf --record
66 # Please keep these in sync with the values in portals/kp30.h
# NOTE(review): the entries below are remnants of two bitmask tables
# (apparently a ptldebug-level dict and a subsystem dict) whose opening
# lines were lost in this truncated listing -- the dict headers and most
# entries are missing.  Do not treat these as complete tables.
78 "warning" : (1 << 10),
82 "portals" : (1 << 14),
84 "dlmtrace" : (1 << 16),
88 "rpctrace" : (1 << 20),
89 "vfstrace" : (1 << 21),
# Second table (subsystem masks) appears to start here -- TODO confirm
# against portals/kp30.h.
94 "undefined" : (1 << 0),
104 "portals" : (1 << 10),
105 "socknal" : (1 << 11),
106 "qswnal" : (1 << 12),
107 "pinger" : (1 << 13),
108 "filter" : (1 << 14),
114 "ptlrouter" : (1 << 20),
# Remembers the first non-zero status seen while cleaning up, so the
# script's final exit code reflects the earliest failure, not the last.
first_cleanup_error = 0

def cleanup_error(rc):
    """Record *rc* as the cleanup exit status, keeping only the first
    error reported; later calls are ignored once an error is latched."""
    global first_cleanup_error
    if first_cleanup_error == 0:
        first_cleanup_error = rc
126 # ============================================================
127 # debugging and error funcs
129 def fixme(msg = "this feature"):
130 raise Lustre.LconfError, msg + ' not implmemented yet.'
# NOTE(review): the following lines are fragments of several small error
# and logging helpers whose 'def' lines were lost in the truncated
# listing (apparently panic(), log(), logall()/debug(), and my_int()).
# Reconstruct from upstream lconf before editing.
133 msg = string.join(map(str,args))
134 if not config.noexec:
135 raise Lustre.LconfError(msg)
# Fragment of a second join-and-print helper (likely log()).
140 msg = string.join(map(str,args))
145 print string.strip(s)
149 msg = string.join(map(str,args))
152 # ack, python's builtin int() does not support '0x123' syntax.
153 # eval can do it, although what a hack!
# my_int() fragment: eval with empty globals/locals parses '0x..' literals;
# parse failures are normalized to ValueError for callers.
157 return eval(s, {}, {})
160 except SyntaxError, e:
161 raise ValueError("not a number")
163 raise ValueError("not a number")
165 # ============================================================
166 # locally defined exceptions
167 class CommandError (exceptions.Exception):
168 def __init__(self, cmd_name, cmd_err, rc=None):
169 self.cmd_name = cmd_name
170 self.cmd_err = cmd_err
175 if type(self.cmd_err) == types.StringType:
177 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
179 print "! %s: %s" % (self.cmd_name, self.cmd_err)
180 elif type(self.cmd_err) == types.ListType:
182 print "! %s (error %d):" % (self.cmd_name, self.rc)
184 print "! %s:" % (self.cmd_name)
185 for s in self.cmd_err:
186 print "> %s" %(string.strip(s))
191 # ============================================================
192 # handle daemons, like the acceptor
# NOTE(review): the 'class DaemonHandler' line and most method headers
# (start/stop/running, the kill loop, pidfile helpers) were lost in the
# truncated listing; only fragments remain below.
194 """ Manage starting and stopping a daemon. Assumes daemon manages
195 it's own pid file. """
197 def __init__(self, cmd):
# start() fragment: refuse to start twice, locate the binary, run it.
203 log(self.command, "already running.")
205 self.path = find_prog(self.command)
207 panic(self.command, "not found.")
208 ret, out = runcmd(self.path +' '+ self.command_line())
210 raise CommandError(self.path, out, ret)
# stop() fragment: kill the pid from the pidfile, best-effort logging.
214 pid = self.read_pidfile()
216 log ("killing process", pid)
218 #time.sleep(1) # let daemon die
220 log("unable to kill", self.command, e)
222 log("unable to kill", self.command)
# running() fragment -- presumably probes the pid; verify upstream.
225 pid = self.read_pidfile()
235 def read_pidfile(self):
237 fp = open(self.pidfile(), 'r')
244 def clean_pidfile(self):
245 """ Remove a stale pidfile """
246 log("removing stale pidfile:", self.pidfile())
248 os.unlink(self.pidfile())
250 log(self.pidfile(), e)
# Acceptor daemon: one instance per TCP port; -i flag enables irq affinity.
252 class AcceptorHandler(DaemonHandler):
253 def __init__(self, port, net_type, send_mem, recv_mem, irq_aff):
254 DaemonHandler.__init__(self, "acceptor")
257 self.send_mem = send_mem
258 self.recv_mem = recv_mem
261 self.flags = self.flags + ' -i'
264 return "/var/run/%s-%d.pid" % (self.command, self.port)
266 def command_line(self):
267 return string.join(map(str,('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port)))
# Module-level acceptor management; 'acceptors' is a port->daemon map
# defined elsewhere.  Acceptors are skipped in dump/record modes.
271 # start the acceptors
273 if config.lctl_dump or config.record:
275 for port in acceptors.keys():
276 daemon = acceptors[port]
277 if not daemon.running():
280 def run_one_acceptor(port):
281 if config.lctl_dump or config.record:
283 if acceptors.has_key(port):
284 daemon = acceptors[port]
285 if not daemon.running():
288 panic("run_one_acceptor: No acceptor defined for port:", port)
290 def stop_acceptor(port):
291 if acceptors.has_key(port):
292 daemon = acceptors[port]
297 # ============================================================
298 # handle lctl interface
# NOTE(review): the 'class LCTLInterface' header is missing from this
# truncated listing; the methods below belong to that class.
301 Manage communication with lctl
304 def __init__(self, cmd):
306 Initialize close by finding the lctl binary.
308 self.lctl = find_prog(cmd)
310 self.record_device = ''
# When lctl cannot be found: only a debug note in noexec mode,
# otherwise a hard CommandError.
313 debug('! lctl not found')
316 raise CommandError('lctl', "unable to find lctl binary.")
# Redirect subsequent commands into a dump file instead of executing.
318 def use_save_file(self, file):
319 self.save_file = file
# record()/end_record() bracket config-log recording on a device.
321 def record(self, dev_name, logname):
322 log("Recording log", logname, "on", dev_name)
323 self.record_device = dev_name
324 self.record_log = logname
326 def end_record(self):
327 log("End recording log", self.record_log, "on", self.record_device)
328 self.record_device = None
329 self.record_log = None
331 def set_nonblock(self, fd):
332 fl = fcntl.fcntl(fd, F_GETFL)
333 fcntl.fcntl(fd, F_SETFL, fl | os.O_NDELAY)
# NOTE(review): this is the (truncated) body of LCTLInterface.run(cmds):
# feed a command script to lctl's stdin and collect stdout/stderr.
# The method header, the record-mode heredoc, the select loop header and
# the try/except framing were lost in this listing.
338 the cmds are written to stdin of lctl
339 lctl doesn't return errors when run in script mode, so
341 should modify command line to accept multiple commands, or
342 create complex command line options
# save_file mode: prepend a 'dump <file>' command instead of executing.
346 cmds = '\n dump ' + self.save_file + '\n' + cmds
347 elif self.record_device:
351 %s""" % (self.record_device, self.record_log, cmds)
353 debug("+", cmd_line, cmds)
354 if config.noexec: return (0, [])
356 child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
357 child.tochild.write(cmds + "\n")
358 child.tochild.close()
# Non-blocking select loop drains stdout and stderr concurrently so a
# full pipe on one stream cannot deadlock the child.
360 # From "Python Cookbook" from O'Reilly
361 outfile = child.fromchild
362 outfd = outfile.fileno()
363 self.set_nonblock(outfd)
364 errfile = child.childerr
365 errfd = errfile.fileno()
366 self.set_nonblock(errfd)
368 outdata = errdata = ''
371 ready = select.select([outfd,errfd],[],[]) # Wait for input
372 if outfd in ready[0]:
373 outchunk = outfile.read()
374 if outchunk == '': outeof = 1
375 outdata = outdata + outchunk
376 if errfd in ready[0]:
377 errchunk = errfile.read()
378 if errchunk == '': erreof = 1
379 errdata = errdata + errchunk
380 if outeof and erreof: break
381 # end of "borrowed" code
# Translate the wait() status; any non-zero rc or stderr output is an error.
384 if os.WIFEXITED(ret):
385 rc = os.WEXITSTATUS(ret)
388 if rc or len(errdata):
389 raise CommandError(self.lctl, errdata, rc)
# NOTE(review): LCTLInterface command wrappers.  Most of the lctl
# command scripts (triple-quoted heredocs piped to self.run) lost their
# opening lines in this truncated listing -- only the closing
# 'quit""" % (...)' interpolations remain in many methods.
392 def runcmd(self, *args):
394 run lctl using the command line
396 cmd = string.join(map(str,args))
397 debug("+", self.lctl, cmd)
398 rc, out = run(self.lctl, cmd)
400 raise CommandError(self.lctl, out, rc)
404 def clear_log(self, dev, log):
405 """ clear an existing log """
410 quit """ % (dev, log)
# Bring up the named network with the local nid.
413 def network(self, net, nid):
418 quit """ % (net, nid)
421 # create a new connection
422 def add_uuid(self, net_type, uuid, nid):
423 cmds = "\n add_uuid %s %s %s" %(uuid, nid, net_type)
# add_autoconn: TCP-only; skipped entirely in lctl_dump mode.
426 def add_autoconn(self, net_type, send_mem, recv_mem, nid, hostaddr,
428 if net_type in ('tcp',) and not config.lctl_dump:
433 add_autoconn %s %s %d %s
437 nid, hostaddr, port, flags )
440 def connect(self, srv):
441 self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
442 if srv.net_type in ('tcp',) and not config.lctl_dump:
446 self.add_autoconn(srv.net_type, srv.send_mem, srv.recv_mem,
447 srv.nid, srv.hostaddr, srv.port, flags)
450 def recover(self, dev_name, new_conn):
453 recover %s""" %(dev_name, new_conn)
456 # add a route to a range
457 def add_route(self, net, gw, lo, hi):
465 except CommandError, e:
469 def del_route(self, net, gw, lo, hi):
474 quit """ % (net, gw, lo, hi)
477 # add a route to a host
478 def add_route_host(self, net, uuid, gw, tgt):
479 self.add_uuid(net, uuid, tgt)
487 except CommandError, e:
# NOTE(review): comment below says 'range' but this is the host variant.
491 # add a route to a range
492 def del_route_host(self, net, uuid, gw, tgt):
498 quit """ % (net, gw, tgt)
502 def del_autoconn(self, net_type, nid, hostaddr):
503 if net_type in ('tcp',) and not config.lctl_dump:
512 # disconnect one connection
513 def disconnect(self, srv):
514 self.del_uuid(srv.nid_uuid)
515 if srv.net_type in ('tcp',) and not config.lctl_dump:
516 self.del_autoconn(srv.net_type, srv.nid, srv.hostaddr)
518 def del_uuid(self, uuid):
526 def disconnectAll(self, net):
# Device lifecycle: attach type/name/uuid, then setup; newdev composes
# both and rolls back (cleanup) if setup fails.
534 def attach(self, type, name, uuid):
537 quit""" % (type, name, uuid)
540 def setup(self, name, setup = ""):
544 quit""" % (name, setup)
548 # create a new device with lctl
549 def newdev(self, type, name, uuid, setup = ""):
550 self.attach(type, name, uuid);
552 self.setup(name, setup)
553 except CommandError, e:
554 self.cleanup(name, uuid, 0)
# cleanup: failover implies force; the tuple-indexing idiom
# ('', 'force')[force] requires force/failover to be 0 or 1.
559 def cleanup(self, name, uuid, force, failover = 0):
560 if failover: force = 1
566 quit""" % (name, ('', 'force')[force],
567 ('', 'failover')[failover])
571 def lov_setup(self, name, uuid, desc_uuid, mdsuuid, stripe_cnt,
572 stripe_sz, stripe_off,
576 lov_setup %s %d %d %d %s %s
577 quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off,
582 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off,
586 lov_setconfig %s %d %d %d %s %s
587 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
591 def dump(self, dump_file):
594 quit""" % (dump_file)
597 # get list of devices
# Reads /proc/fs/lustre/devices directly when readable, presumably
# falling back to 'lctl device_list' otherwise -- confirm upstream.
598 def device_list(self):
599 devices = '/proc/fs/lustre/devices'
601 if os.access(devices, os.R_OK):
603 fp = open(devices, 'r')
611 def lustre_version(self):
612 rc, out = self.runcmd('version')
616 def mount_option(self, profile, osc, mdc):
618 mount_option %s %s %s
619 quit""" % (profile, osc, mdc)
622 # delete mount options
623 def del_mount_option(self, profile):
629 def set_timeout(self, timeout):
# NOTE(review): stale copy-paste comment -- this sets the upcall,
# not mount options.
635 # delete mount options
636 def set_lustre_upcall(self, upcall):
641 # ============================================================
642 # Various system-level functions
643 # (ideally moved to their own module)
# NOTE(review): the 'def run(...)' header is missing below; the fragment
# shells out and merges stderr into stdout (the comment about /dev/null
# is stale -- the code uses '2>&1').
645 # Run a command and return the output and status.
646 # stderr is sent to /dev/null, could use popen3 to
647 # save it if necessary
650 if config.noexec: return (0, [])
651 f = os.popen(cmd + ' 2>&1')
661 cmd = string.join(map(str,args))
664 # Run a command in the background.
665 def run_daemon(*args):
666 cmd = string.join(map(str,args))
668 if config.noexec: return 0
669 f = os.popen(cmd + ' 2>&1')
# find_prog: search dirname(argv[0]), then the portals utils dir,
# then PATH, for an executable; 'def find_prog(cmd)' line is missing.
677 # Determine full path to use for an external command
678 # searches dirname(argv[0]) first, then PATH
680 syspath = string.split(os.environ['PATH'], ':')
681 cmdpath = os.path.dirname(sys.argv[0])
682 syspath.insert(0, cmdpath);
684 syspath.insert(0, os.path.join(config.portals, 'utils/'))
686 prog = os.path.join(d,cmd)
687 if os.access(prog, os.X_OK):
691 # Recursively look for file starting at base dir
692 def do_find_file(base, mod):
693 fullname = os.path.join(base, mod)
694 if os.access(fullname, os.R_OK):
696 for d in os.listdir(base):
697 dir = os.path.join(base,d)
698 if os.path.isdir(dir):
699 module = do_find_file(dir, mod)
# find_module: try <src>/<dev_dir>/<name>.ko then .o.
703 def find_module(src_dir, dev_dir, modname):
704 modbase = src_dir +'/'+ dev_dir +'/'+ modname
705 for modext in '.ko', '.o':
706 module = modbase + modext
708 if os.access(module, os.R_OK):
# is_block(): stats the path; 'def is_block' and the os.stat call were
# lost in this listing.
714 # is the path a block device?
721 return stat.S_ISBLK(s[stat.ST_MODE])
723 # build fs according to type
# NOTE(review): heavily truncated.  Sizes: devsize is in 1 KiB units,
# ext2/3 block count in 4 KiB blocks; journal sizing kicks in above 1 GiB.
725 def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
731 panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
733 # devsize is in 1k, and fs block count is in 4k
734 block_cnt = devsize/4
736 if fstype in ('ext3', 'extN', 'ldiskfs'):
737 # ext3 journal size is in megabytes
# For loop files, size comes from 'ls -l' (bytes/1024); for block
# devices from 'sfdisk -s'.
740 if not is_block(dev):
741 ret, out = runcmd("ls -l %s" %dev)
742 devsize = int(string.split(out[0])[4]) / 1024
744 ret, out = runcmd("sfdisk -s %s" %dev)
745 devsize = int(out[0])
746 if devsize > 1024 * 1024:
747 jsize = ((devsize / 102400) * 4)
750 if jsize: jopt = "-J size=%d" %(jsize,)
751 if isize: iopt = "-I %d" %(isize,)
752 mkfs = 'mkfs.ext2 -j -b 4096 '
753 if not isblock or config.force:
755 elif fstype == 'reiserfs':
756 # reiserfs journal size is in blocks
757 if jsize: jopt = "--journal_size %d" %(jsize,)
758 mkfs = 'mkreiserfs -ff'
760 panic('unsupported fs type: ', fstype)
# Extra mkfs options: global config first, then per-device options.
762 if config.mkfsoptions != None:
763 mkfs = mkfs + ' ' + config.mkfsoptions
764 if mkfsoptions != None:
765 mkfs = mkfs + ' ' + mkfsoptions
766 (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt)
768 panic("Unable to build fs:", dev, string.join(out))
769 # enable hash tree indexing on fsswe
770 if fstype in ('ext3', 'extN', 'ldiskfs'):
771 htree = 'echo "feature FEATURE_C5" | debugfs -w'
772 (ret, out) = run (htree, dev)
774 panic("Unable to enable htree:", dev)
# loop_base(): probes /dev/loop0 vs /dev/loop/0 naming; header missing.
776 # some systems use /dev/loopN, some /dev/loop/N
780 if not os.access(loop + str(0), os.R_OK):
782 if not os.access(loop + str(0), os.R_OK):
783 panic ("can't access loop devices")
# find_loop(file): scan losetup output of each loop node for a mapping
# whose backing path (in parentheses) equals *file*; header missing.
786 # find loop device assigned to thefile
789 for n in xrange(0, MAX_LOOP_DEVICES):
791 if os.access(dev, os.R_OK):
792 (stat, out) = run('losetup', dev)
793 if out and stat == 0:
794 m = re.search(r'\((.*)\)', out[0])
795 if m and file == m.group(1):
801 # create file if necessary and assign the first free loop device
802 def init_loop(file, size, fstype, journal_size, inode_size, mkfsoptions, reformat):
803 dev = find_loop(file)
805 print 'WARNING file:', file, 'already mapped to', dev
# Sparse backing file is created with dd seek (count=0), then formatted.
807 if reformat or not os.access(file, os.R_OK | os.W_OK):
809 panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (file,size))
810 (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size,
813 panic("Unable to create backing store:", file)
814 mkfs(file, size, fstype, journal_size, inode_size, mkfsoptions, isblock=0)
817 # find next free loop
818 for n in xrange(0, MAX_LOOP_DEVICES):
820 if os.access(dev, os.R_OK):
821 (stat, out) = run('losetup', dev)
823 run('losetup', dev, file)
826 print "out of loop devices"
828 print "out of loop devices"
831 # undo loop assignment
832 def clean_loop(file):
833 dev = find_loop(file)
835 ret, out = run('losetup -d', dev)
837 log('unable to clean loop device:', dev, 'for file:', file)
# determine if dev is formatted as a <fstype> filesystem
def need_format(fstype, dev):
    """Return true when *dev* must be formatted as *fstype* before use.

    Currently a stub: no reliable probe is implemented, so it always
    answers 0 ("no format needed") and callers rely on --reformat /
    autoformat settings instead.
    """
    # FIXME don't know how to implement this
    return 0
845 # initialize a block device if needed
# Non-block paths become loop devices; block devices are reformatted
# only on --reformat or (need_format && autoformat=='yes').
846 def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
847 inode_size, mkfsoptions):
848 if config.noexec: return dev
849 if not is_block(dev):
850 dev = init_loop(dev, size, fstype, journal_size, inode_size,
851 mkfsoptions, reformat)
852 elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
853 mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
856 # panic("device:", dev,
857 # "not prepared, and autoformat is not set.\n",
858 # "Rerun with --reformat option to format ALL filesystems")
# if2addr(iface) fragment (header missing): parse the 'inet addr:x.y.z.w'
# field from ifconfig output -- depends on the legacy ifconfig format.
863 """lookup IP address for an interface"""
864 rc, out = run("/sbin/ifconfig", iface)
867 addr = string.split(out[1])[1]
868 ip = string.split(addr, ':')[1]
def sys_get_elan_position_file():
    """Return the first readable Quadrics Elan position file under /proc,
    or '' when this node has no Elan hardware/modules loaded.

    Callers use the truthiness of the result both as an "is Elan present"
    test and as the path to read the NodeId from.
    """
    procfiles = ["/proc/elan/device0/position",
                 "/proc/qsnet/elan4/device0/position",
                 "/proc/qsnet/elan3/device0/position"]
    for p in procfiles:
        if os.access(p, os.R_OK):
            return p
    # NOTE(review): the loop body and this fallback were dropped from the
    # truncated listing; '' keeps callers' truth-tests working.
    return ''
# Elan nodes get their nid from the Elan position file regardless of the
# configured net_type; otherwise fall through to per-type lookup.
880 def sys_get_local_nid(net_type, wildcard, cluster_id):
881 """Return the local nid."""
883 if sys_get_elan_position_file():
884 local = sys_get_local_address('elan', '*', cluster_id)
886 local = sys_get_local_address(net_type, wildcard, cluster_id)
889 def sys_get_local_address(net_type, wildcard, cluster_id):
890 """Return the local address for the network type."""
# tcp: 'iface:*' wildcards resolve via ifconfig, plain '*' via hostname.
892 if net_type in ('tcp',):
894 iface, star = string.split(wildcard, ':')
895 local = if2addr(iface)
897 panic ("unable to determine ip for:", wildcard)
899 host = socket.gethostname()
900 local = socket.gethostbyname(host)
901 elif net_type == 'elan':
902 # awk '/NodeId/ { print $2 }' 'sys_get_elan_position_file()'
903 f = sys_get_elan_position_file()
905 panic ("unable to determine local Elan ID")
908 lines = fp.readlines()
# nid = cluster_id + NodeId; both parsed with the 0x-aware my_int().
916 nid = my_int(cluster_id) + my_int(elan_id)
918 except ValueError, e:
922 elif net_type == 'gm':
923 fixme("automatic local address for GM")
927 def mod_loaded(modname):
928 """Check if a module is already loaded. Look in /proc/modules for it."""
930 fp = open('/proc/modules')
931 lines = fp.readlines()
933 # please forgive my tired fingers for this one
934 ret = filter(lambda word, mod=modname: word == mod,
935 map(lambda line: string.split(line)[0], lines))
940 # XXX: instead of device_list, ask for $name and see what we get
941 def is_prepared(name):
942 """Return true if a device exists for the name"""
# In noexec/record + cleanup mode the answer is presumably forced --
# the forced value was lost in this truncated listing.
945 if (config.noexec or config.record) and config.cleanup:
948 # expect this format:
949 # 1 UP ldlm ldlm ldlm_UUID 2
950 out = lctl.device_list()
952 if name == string.split(s)[3]:
954 except CommandError, e:
958 def is_network_prepared():
959 """If the any device exists, then assume that all networking
960 has been configured"""
961 out = lctl.device_list()
964 def fs_is_mounted(path):
965 """Return true if path is a mounted lustre filesystem"""
967 fp = open('/proc/mounts')
968 lines = fp.readlines()
# /proc/mounts fields: device, mountpoint, fstype, ...
972 if a[1] == path and a[2] == 'lustre_lite':
# NOTE(review): the 'class kmod' header is missing from this truncated
# listing; the block below is its body.
980 """Manage kernel modules"""
981 def __init__(self, lustre_dir, portals_dir):
982 self.lustre_dir = lustre_dir
983 self.portals_dir = portals_dir
# Ordered list of (tree_root, subdir, module_name) tuples.
984 self.kmodule_list = []
986 def add_portals_module(self, dev_dir, modname):
987 """Append a module to list of modules to load."""
988 self.kmodule_list.append((self.portals_dir, dev_dir, modname))
990 def add_lustre_module(self, dev_dir, modname):
991 """Append a module to list of modules to load."""
992 self.kmodule_list.append((self.lustre_dir, dev_dir, modname))
994 def load_module(self):
995 """Load all the modules in the list in the order they appear."""
996 for src_dir, dev_dir, mod in self.kmodule_list:
997 if mod_loaded(mod) and not config.noexec:
999 log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
# Tree build found -> insmod the file; otherwise modprobe by name.
1001 module = find_module(src_dir, dev_dir, mod)
1003 panic('module not found:', mod)
1004 (rc, out) = run('/sbin/insmod', module)
1006 raise CommandError('insmod', out, rc)
1008 (rc, out) = run('/sbin/modprobe', mod)
1010 raise CommandError('modprobe', out, rc)
1012 def cleanup_module(self):
1013 """Unload the modules in the list in reverse order."""
1014 rev = self.kmodule_list
1016 for src_dir, dev_dir, mod in rev:
1017 if not mod_loaded(mod) and not config.noexec:
# Dump the debug log via lctl just before portals itself goes away.
1020 if mod == 'portals' and config.dump:
1021 lctl.dump(config.dump)
1022 log('unloading module:', mod)
# rmmod failure is non-fatal: log and keep unloading the rest.
1023 (rc, out) = run('/sbin/rmmod', mod)
1025 log('! unable to unload module:', mod)
1028 # ============================================================
1029 # Classes to prepare and cleanup the various objects
# NOTE(review): the 'class Module' header is missing below.
1032 """ Base class for the rest of the modules. The default cleanup method is
1033 defined here, as well as some utilitiy funcs.
1035 def __init__(self, module_name, db):
1037 self.module_name = module_name
1038 self.name = self.db.getName()
1039 self.uuid = self.db.getUUID()
1042 self.kmod = kmod(config.lustre, config.portals)
1044 def info(self, *args):
1045 msg = string.join(map(str,args))
1046 print self.module_name + ":", self.name, self.uuid, msg
# cleanup(): header missing; tears down this module's lctl device,
# logging (not raising) on failure.
1049 """ default cleanup, used for most modules """
1052 lctl.cleanup(self.name, self.uuid, config.force)
1053 except CommandError, e:
1054 log(self.module_name, "cleanup failed: ", self.name)
# Thin delegation to the per-module kmod instance.
1058 def add_portals_module(self, dev_dir, modname):
1059 """Append a module to list of modules to load."""
1060 self.kmod.add_portals_module(dev_dir, modname)
1062 def add_lustre_module(self, dev_dir, modname):
1063 """Append a module to list of modules to load."""
1064 self.kmod.add_lustre_module(dev_dir, modname)
1066 def load_module(self):
1067 """Load all the modules in the list in the order they appear."""
1068 self.kmod.load_module()
1070 def cleanup_module(self):
1071 """Unload the modules in the list in reverse order."""
1072 if self.safe_to_clean():
1073 self.kmod.cleanup_module()
# safe_to_clean(): default return value was lost in this listing
# (presumably 1); subclasses override.
1075 def safe_to_clean(self):
1078 def safe_to_clean_modules(self):
1079 return self.safe_to_clean()
1081 class Network(Module):
# Pulls network parameters from the config db; '*' nids/hostaddrs are
# resolved to concrete local addresses at runtime.
1082 def __init__(self,db):
1083 Module.__init__(self, 'NETWORK', db)
1084 self.net_type = self.db.get_val('nettype')
1085 self.nid = self.db.get_val('nid', '*')
1086 self.cluster_id = self.db.get_val('clusterid', "0")
1087 self.port = self.db.get_val_int('port', 0)
1088 self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
1089 self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
1090 self.irq_affinity = self.db.get_val_int('irqaffinity', 0)
# Wildcard nid: resolve locally and remember it was generic (affects
# --record behavior in prepare()).
1093 self.nid = sys_get_local_nid(self.net_type, self.nid, self.cluster_id)
1095 panic("unable to set nid for", self.net_type, self.nid, cluster_id)
1096 self.generic_nid = 1
1097 debug("nid:", self.nid)
1099 self.generic_nid = 0
1101 self.nid_uuid = self.nid_to_uuid(self.nid)
1103 self.hostaddr = self.db.get_val('hostaddr', self.nid)
1104 if '*' in self.hostaddr:
1105 self.hostaddr = sys_get_local_address(self.net_type, self.hostaddr, self.cluster_id)
1106 if not self.hostaddr:
1107 panic("unable to set hostaddr for", self.net_type, self.hostaddr, self.cluster_id)
1108 debug("hostaddr:", self.hostaddr)
# Queue the NAL kernel modules matching the configured net type.
1110 self.add_portals_module("libcfs", 'portals')
1111 if node_needs_router():
1112 self.add_portals_module("router", 'kptlrouter')
1113 if self.net_type == 'tcp':
1114 self.add_portals_module("knals/socknal", 'ksocknal')
1115 if self.net_type == 'elan':
1116 self.add_portals_module("knals/qswnal", 'kqswnal')
1117 if self.net_type == 'gm':
1118 self.add_portals_module("knals/gmnal", 'kgmnal')
1120 def nid_to_uuid(self, nid):
1121 return "NID_%s_UUID" %(nid,)
# prepare(): header missing.  Skips work when networking already up.
1124 if is_network_prepared():
1126 self.info(self.net_type, self.nid, self.port)
1127 if not (config.record and self.generic_nid):
1128 lctl.network(self.net_type, self.nid)
1129 if self.net_type == 'tcp':
1131 if self.net_type == 'elan':
1133 if self.port and node_is_router():
1134 run_one_acceptor(self.port)
1135 self.connect_peer_gateways()
# Connect to every gateway on our cluster/net except ourselves.
1137 def connect_peer_gateways(self):
1138 for router in self.db.lookup_class('node'):
1139 if router.get_val_int('router', 0):
1140 for netuuid in router.get_networks():
1141 net = self.db.lookup(netuuid)
1143 if (gw.cluster_id == self.cluster_id and
1144 gw.net_type == self.net_type):
1145 if gw.nid != self.nid:
# Mirror of connect_peer_gateways for teardown; disconnect errors are
# reported but not fatal.
1148 def disconnect_peer_gateways(self):
1149 for router in self.db.lookup_class('node'):
1150 if router.get_val_int('router', 0):
1151 for netuuid in router.get_networks():
1152 net = self.db.lookup(netuuid)
1154 if (gw.cluster_id == self.cluster_id and
1155 gw.net_type == self.net_type):
1156 if gw.nid != self.nid:
1159 except CommandError, e:
1160 print "disconnect failed: ", self.name
1164 def safe_to_clean(self):
1165 return not is_network_prepared()
# cleanup(): header missing.
1168 self.info(self.net_type, self.nid, self.port)
1170 stop_acceptor(self.port)
1171 if node_is_router():
1172 self.disconnect_peer_gateways()
1174 class RouteTable(Module):
1175 def __init__(self,db):
1176 Module.__init__(self, 'ROUTES', db)
# Returns the Network to connect for a route, or (presumably) None for
# non-tcp NALs / routes not involving this node.
1178 def server_for_route(self, net_type, gw, gw_cluster_id, tgt_cluster_id,
1180 # only setup connections for tcp NALs
1182 if not net_type in ('tcp',):
1185 # connect to target if route is to single node and this node is the gw
1186 if lo == hi and local_interface(net_type, gw_cluster_id, gw):
1187 if not local_cluster(net_type, tgt_cluster_id):
1188 panic("target", lo, " not on the local cluster")
1189 srvdb = self.db.nid2server(lo, net_type, gw_cluster_id)
1190 # connect to gateway if this node is not the gw
1191 elif (local_cluster(net_type, gw_cluster_id)
1192 and not local_interface(net_type, gw_cluster_id, gw)):
1193 srvdb = self.db.nid2server(gw, net_type, gw_cluster_id)
1198 panic("no server for nid", lo)
1201 return Network(srvdb)
# prepare(): install each route from the config db, then connect to the
# route's server when one applies.
1204 if is_network_prepared():
1207 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1208 lctl.add_route(net_type, gw, lo, hi)
1209 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1213 def safe_to_clean(self):
1214 return not is_network_prepared()
1217 if is_network_prepared():
1218 # the network is still being used, don't clean it up
1220 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1221 srv = self.server_for_route(net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi)
1224 lctl.disconnect(srv)
1225 except CommandError, e:
1226 print "disconnect failed: ", self.name
1231 lctl.del_route(net_type, gw, lo, hi)
1232 except CommandError, e:
1233 print "del_route failed: ", self.name
1237 class Management(Module):
1238 def __init__(self, db):
1239 Module.__init__(self, 'MGMT', db)
1240 self.add_lustre_module('lvfs', 'lvfs')
1241 self.add_lustre_module('obdclass', 'obdclass')
1242 self.add_lustre_module('ptlrpc', 'ptlrpc')
1243 self.add_lustre_module('mgmt', 'mgmt_svc')
# prepare()/cleanup() headers missing; device lifecycle is idempotent
# via is_prepared().
1246 if is_prepared(self.name):
1249 lctl.newdev("mgmt", self.name, self.uuid)
1251 def safe_to_clean(self):
1255 if is_prepared(self.name):
1256 Module.cleanup(self)
# NOTE(review): 'class LDLM(Module)' header missing below.
1258 # This is only needed to load the modules; the LDLM device
1259 # is now created automatically.
1261 def __init__(self,db):
1262 Module.__init__(self, 'LDLM', db)
1263 self.add_lustre_module('lvfs', 'lvfs')
1264 self.add_lustre_module('obdclass', 'obdclass')
1265 self.add_lustre_module('ptlrpc', 'ptlrpc')
# NOTE(review): 'class LOV(Module)' header missing below.  config_only
# instances exist just to read striping parameters, never prepared.
1274 def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
1275 Module.__init__(self, 'LOV', db)
1276 if name_override != None:
1277 self.name = "lov_%s" % name_override
1278 self.add_lustre_module('lov', 'lov')
1279 self.mds_uuid = self.db.get_first_ref('mds')
1280 self.stripe_sz = self.db.get_val_int('stripesize', 1048576)
1281 self.stripe_off = self.db.get_val_int('stripeoffset', 0)
1282 self.pattern = self.db.get_val_int('stripepattern', 0)
1283 self.devlist = self.db.get_refs('obd')
1284 self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
# The db uuid becomes the descriptor uuid; the instance gets a fresh
# client uuid of its own.
1286 self.desc_uuid = self.uuid
1287 self.uuid = generate_client_uuid(self.name)
1288 self.fs_name = fs_name
1290 self.config_only = 1
1292 self.config_only = None
1293 mds= self.db.lookup(self.mds_uuid)
1294 self.mds_name = mds.getName()
# Build an OSC per referenced OBD; missing OSCs are fatal.
1295 for obd_uuid in self.devlist:
1296 obd = self.db.lookup(obd_uuid)
1297 osc = get_osc(obd, self.uuid, fs_name)
1299 self.osclist.append(osc)
1301 panic('osc not found:', obd_uuid)
# prepare(): header missing.
1304 if is_prepared(self.name):
1306 if self.config_only:
1307 panic("Can't prepare config_only LOV ", self.name)
1309 for osc in self.osclist:
1311 # Only ignore connect failures with --force, which
1312 # isn't implemented here yet.
1313 osc.prepare(ignore_connect_failure=0)
1314 except CommandError, e:
1315 print "Error preparing OSC %s\n" % osc.uuid
1317 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
1318 self.stripe_off, self.pattern, self.devlist, self.mds_name)
1319 lctl.lov_setup(self.name, self.uuid,
1320 self.desc_uuid, self.mds_name, self.stripe_cnt,
1321 self.stripe_sz, self.stripe_off, self.pattern,
1322 string.join(self.devlist))
# cleanup(): header missing; OSCs are cleaned after the LOV device.
1325 if is_prepared(self.name):
1326 Module.cleanup(self)
1327 if self.config_only:
1328 panic("Can't clean up config_only LOV ", self.name)
1329 for osc in self.osclist:
1332 def load_module(self):
1333 if self.config_only:
1334 panic("Can't load modules for config_only LOV ", self.name)
1335 for osc in self.osclist:
1338 Module.load_module(self)
1340 def cleanup_module(self):
1341 if self.config_only:
1342 panic("Can't cleanup modules for config_only LOV ", self.name)
1343 Module.cleanup_module(self)
1344 for osc in self.osclist:
1345 osc.cleanup_module()
1348 class MDSDEV(Module):
1349 def __init__(self,db):
1350 Module.__init__(self, 'MDSDEV', db)
1351 self.devpath = self.db.get_val('devpath','')
1352 self.size = self.db.get_val_int('devsize', 0)
1353 self.journal_size = self.db.get_val_int('journalsize', 0)
1354 self.fstype = self.db.get_val('fstype', '')
1355 self.nspath = self.db.get_val('nspath', '')
1356 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1357 # overwrite the orignal MDSDEV name and uuid with the MDS name and uuid
1358 target_uuid = self.db.get_first_ref('target')
1359 mds = self.db.lookup(target_uuid)
1360 self.name = mds.getName()
1361 self.filesystem_uuids = mds.get_refs('filesystem')
1362 # FIXME: if fstype not set, then determine based on kernel version
1363 self.format = self.db.get_val('autoformat', "no")
1364 if mds.get_val('failover', 0):
1365 self.failover_mds = 'f'
1367 self.failover_mds = 'n'
1368 active_uuid = get_active_target(mds)
1370 panic("No target device found:", target_uuid)
1371 if active_uuid == self.uuid:
1375 if self.active and config.group and config.group != mds.get_val('group'):
1378 self.inode_size = self.db.get_val_int('inodesize', 0)
1379 if self.inode_size == 0:
1380 # find the LOV for this MDS
1381 lovconfig_uuid = mds.get_first_ref('lovconfig')
1382 if not lovconfig_uuid:
1383 panic("No LOV config found for MDS ", mds.name)
1384 lovconfig = mds.lookup(lovconfig_uuid)
1385 lov_uuid = lovconfig.get_first_ref('lov')
1387 panic("No LOV found for lovconfig ", lovconfig.name)
1388 lov = LOV(self.db.lookup(lov_uuid), lov_uuid, 'FS_name', config_only = 1)
1390 # default stripe count controls default inode_size
1391 if (lov.stripe_cnt > 0):
1392 stripe_count = lov.stripe_cnt
1394 stripe_count = len(lov.devlist)
1395 if stripe_count > 77:
1396 self.inode_size = 4096
1397 elif stripe_count > 35:
1398 self.inode_size = 2048
1399 elif stripe_count > 13:
1400 self.inode_size = 1024
1401 elif stripe_count > 3:
1402 self.inode_size = 512
1404 self.inode_size = 256
1406 self.target_dev_uuid = self.uuid
1407 self.uuid = target_uuid
1410 self.add_lustre_module('mdc', 'mdc')
1411 self.add_lustre_module('osc', 'osc')
1412 self.add_lustre_module('lov', 'lov')
1413 self.add_lustre_module('mds', 'mds')
1414 if self.fstype == 'ldiskfs':
1415 self.add_lustre_module('ldiskfs', 'ldiskfs')
1417 self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
1419 def load_module(self):
1421 Module.load_module(self)
1424 if is_prepared(self.name):
1427 debug(self.uuid, "not active")
1430 # run write_conf automatically, if --reformat used
1432 self.info(self.devpath, self.fstype, self.size, self.format)
1434 # never reformat here
1435 blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
1436 self.format, self.journal_size, self.inode_size,
1438 if not is_prepared('MDT'):
1439 lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
1441 lctl.newdev("mds", self.name, self.uuid,
1442 setup ="%s %s %s" %(blkdev, self.fstype, self.name))
1443 except CommandError, e:
1445 panic("MDS is missing the config log. Need to run " +
1446 "lconf --write_conf.")
# MDSDEV.write_conf: format/attach the MDS device, then record the
# client configuration logs (mount options, per-client setup/cleanup
# logs) on the MDS so clients can fetch them at mount time.
# NOTE(review): several original lines are elided here (discontinuous
# numbering) — confirm control flow against the complete file.
1450 def write_conf(self):
1451 if is_prepared(self.name):
1453 self.info(self.devpath, self.fstype, self.format)
1454 blkdev = block_dev(self.devpath, self.size, self.fstype,
1455 config.reformat, self.format, self.journal_size,
1456 self.inode_size, self.mkfsoptions)
1457 lctl.newdev("mds", self.name, self.uuid,
1458 setup ="%s %s" %(blkdev, self.fstype))
1460 # record logs for the MDS lov
1461 for uuid in self.filesystem_uuids:
1462 log("recording clients for filesystem:", uuid)
1463 fs = self.db.lookup(uuid)
1464 obd_uuid = fs.get_first_ref('obd')
1465 client_uuid = generate_client_uuid(self.name)
1466 client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
# setup log: replayed by clients when they mount
1469 lctl.clear_log(self.name, self.name)
1470 lctl.record(self.name, self.name)
1472 lctl.mount_option(self.name, client.get_name(), "")
# matching "-clean" log used at unmount/cleanup time
1476 lctl.clear_log(self.name, self.name + '-clean')
1477 lctl.record(self.name, self.name + '-clean')
1479 lctl.del_mount_option(self.name)
1484 # record logs for each client
1486 config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
1488 config_options = CONFIG_FILE
1490 for node_db in self.db.lookup_class('node'):
1491 client_name = node_db.getName()
1492 for prof_uuid in node_db.get_refs('profile'):
1493 prof_db = node_db.lookup(prof_uuid)
1494 # refactor this into a function to test "clientness"
1496 for ref_class, ref_uuid in prof_db.get_all_refs():
1497 if ref_class in ('mountpoint','echoclient'):
1498 debug("recording", client_name)
1499 old_noexec = config.noexec
# re-invoke lconf itself (--record --nomod) to replay each
# client's config into the MDS record log; '-n' preserves
# the caller's --noexec mode
1501 noexec_opt = ('', '-n')
1502 ret, out = run (sys.argv[0],
1503 noexec_opt[old_noexec == 1],
1504 " -v --record --nomod",
1505 "--record_log", client_name,
1506 "--record_device", self.name,
1507 "--node", client_name,
1510 for s in out: log("record> ", string.strip(s))
1511 ret, out = run (sys.argv[0],
1512 noexec_opt[old_noexec == 1],
1513 "--cleanup -v --record --nomod",
1514 "--record_log", client_name + "-clean",
1515 "--record_device", self.name,
1516 "--node", client_name,
1519 for s in out: log("record> ", string.strip(s))
1520 config.noexec = old_noexec
1522 lctl.cleanup(self.name, self.uuid, 0, 0)
1523 except CommandError, e:
1524 log(self.module_name, "cleanup failed: ", self.name)
1527 Module.cleanup(self)
1528 clean_loop(self.devpath)
# MDSDEV.msd_remaining: scan `lctl device_list` output for any device
# whose type column is 'mds'; used to decide when module unload and
# MDT teardown are safe.
# NOTE(review): loop/return lines are elided in this excerpt.
1530 def msd_remaining(self):
1531 out = lctl.device_list()
1533 if string.split(s)[2] in ('mds',):
1536 def safe_to_clean(self):
# Module unload is only safe once no MDS devices remain configured.
1539 def safe_to_clean_modules(self):
1540 return not self.msd_remaining()
1544 debug(self.uuid, "not active")
1547 if is_prepared(self.name):
1549 lctl.cleanup(self.name, self.uuid, config.force,
1551 except CommandError, e:
1552 log(self.module_name, "cleanup failed: ", self.name)
1555 Module.cleanup(self)
1556 if not self.msd_remaining() and is_prepared('MDT'):
1558 lctl.cleanup("MDT", "MDT_UUID", config.force,
1560 except CommandError, e:
1561 print "cleanup failed: ", self.name
1564 clean_loop(self.devpath)
# OSD.__init__: build an OST device description from the config db —
# device geometry (path/size/journal/inode/mkfs options), failover
# flag, active-target selection, and the kernel module list
# (ost + fs-specific fsfilt + the osdtype driver).
# NOTE(review): else/guard lines are elided in this excerpt
# (discontinuous numbering) — verify against the complete file.
1567 def __init__(self, db):
1568 Module.__init__(self, 'OSD', db)
1569 self.osdtype = self.db.get_val('osdtype')
1570 self.devpath = self.db.get_val('devpath', '')
1571 self.size = self.db.get_val_int('devsize', 0)
1572 self.journal_size = self.db.get_val_int('journalsize', 0)
1573 self.inode_size = self.db.get_val_int('inodesize', 0)
1574 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1575 self.fstype = self.db.get_val('fstype', '')
1576 self.nspath = self.db.get_val('nspath', '')
1577 target_uuid = self.db.get_first_ref('target')
1578 ost = self.db.lookup(target_uuid)
1579 self.name = ost.getName()
1580 self.format = self.db.get_val('autoformat', 'yes')
# 'f'/'n' flag is passed to the obdfilter setup string
1581 if ost.get_val('failover', 0):
1582 self.failover_ost = 'f'
1584 self.failover_ost = 'n'
1586 active_uuid = get_active_target(ost)
1588 panic("No target device found:", target_uuid)
1589 if active_uuid == self.uuid:
1593 if self.active and config.group and config.group != ost.get_val('group'):
# present the target uuid as this service's uuid from here on
1596 self.target_dev_uuid = self.uuid
1597 self.uuid = target_uuid
1599 self.add_lustre_module('ost', 'ost')
1600 # FIXME: should we default to ext3 here?
1601 if self.fstype == 'ldiskfs':
1602 self.add_lustre_module('ldiskfs', 'ldiskfs')
1604 self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
1605 self.add_lustre_module(self.osdtype, self.osdtype)
# OSD.load_module: load this OSD's kernel modules via the base class.
# NOTE(review): interior lines are elided in this excerpt.
1607 def load_module(self):
1609 Module.load_module(self)
1611 # need to check /proc/mounts and /etc/mtab before
1612 # formatting anything.
1613 # FIXME: check if device is already formatted.
1615 if is_prepared(self.name):
1618 debug(self.uuid, "not active")
1620 self.info(self.osdtype, self.devpath, self.size, self.fstype,
1621 self.format, self.journal_size, self.inode_size)
1623 if self.osdtype == 'obdecho':
1626 blkdev = block_dev(self.devpath, self.size, self.fstype,
1627 config.reformat, self.format, self.journal_size,
1628 self.inode_size, self.mkfsoptions)
1629 lctl.newdev(self.osdtype, self.name, self.uuid,
1630 setup ="%s %s %s" %(blkdev, self.fstype,
1632 if not is_prepared('OSS'):
1633 lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
# OSD.osd_remaining: scan `lctl device_list` for remaining obdfilter/
# obdecho devices; module unload is only safe when none remain.
# NOTE(review): loop/return lines are elided in this excerpt.
1635 def osd_remaining(self):
1636 out = lctl.device_list()
1638 if string.split(s)[2] in ('obdfilter', 'obdecho'):
1641 def safe_to_clean(self):
1644 def safe_to_clean_modules(self):
1645 return not self.osd_remaining()
1649 debug(self.uuid, "not active")
1651 if is_prepared(self.name):
1654 lctl.cleanup(self.name, self.uuid, config.force,
1656 except CommandError, e:
1657 log(self.module_name, "cleanup failed: ", self.name)
1660 if not self.osd_remaining() and is_prepared('OSS'):
1662 lctl.cleanup("OSS", "OSS_UUID", config.force,
1664 except CommandError, e:
1665 print "cleanup failed: ", self.name
1668 if not self.osdtype == 'obdecho':
1669 clean_loop(self.devpath)
# Resolve a mountpoint name to the mgmt-service uuid of its filesystem
# (mountpoint -> filesystem -> mgmt ref).
# NOTE(review): guard lines are elided in this excerpt — presumably
# they return None when lookups fail; confirm in the full file.
1671 def mgmt_uuid_for_fs(mtpt_name):
1674 mtpt_db = toplustreDB.lookup_name(mtpt_name)
1675 fs_uuid = mtpt_db.get_first_ref('filesystem')
1676 fs = toplustreDB.lookup(fs_uuid)
1679 return fs.get_first_ref('mgmt')
1681 # Generic client module, used by OSC and MDC
# Wraps a connection from this node to a target service: resolves the
# active target device, the server networks to reach it, optional
# routes, and finally attaches the client device via lctl.
# NOTE(review): many original lines are elided in this class body
# (discontinuous numbering) — verify control flow in the full file.
1682 class Client(Module):
1683 def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
1685 self.target_name = tgtdb.getName()
1686 self.target_uuid = tgtdb.getUUID()
1689 self.tgt_dev_uuid = get_active_target(tgtdb)
1690 if not self.tgt_dev_uuid:
1691 panic("No target device found for target:", self.target_name)
1693 self.kmod = kmod(config.lustre, config.portals)
1697 self.module = module
1698 self.module_name = string.upper(module)
# default device name encodes module, host, target and fs;
# self_name (when given) overrides it
1700 self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
1701 self.target_name, fs_name)
1703 self.name = self_name
1705 self.lookup_server(self.tgt_dev_uuid)
1706 mgmt_uuid = mgmt_uuid_for_fs(fs_name)
1708 self.mgmt_name = mgmtcli_name_for_uuid(mgmt_uuid)
1711 self.fs_name = fs_name
1714 self.add_lustre_module(module_dir, module)
1716 def lookup_server(self, srv_uuid):
1717 """ Lookup a server's network information """
1718 self._server_nets = get_ost_net(self.db, srv_uuid)
1719 if len(self._server_nets) == 0:
1720 panic ("Unable to find a server for:", srv_uuid)
1722 def get_servers(self):
1723 return self._server_nets
# prepare: connect to the target (directly or via routes) and
# create the client device; connect failures can be tolerated.
1725 def prepare(self, ignore_connect_failure = 0):
1726 self.info(self.target_uuid)
1727 if is_prepared(self.name):
1730 srv = choose_local_server(self.get_servers())
1734 routes = find_route(self.get_servers())
1735 if len(routes) == 0:
1736 panic ("no route to", self.target_uuid)
1737 for (srv, r) in routes:
1738 lctl.add_route_host(r[0], srv.nid_uuid, r[1], r[3])
1739 except CommandError, e:
1740 if not ignore_connect_failure:
# targets listed via --inactive are set up but left inactive
1743 if self.target_uuid in config.inactive and self.permits_inactive():
1744 debug("%s inactive" % self.target_uuid)
1745 inactive_p = "inactive"
1747 debug("%s active" % self.target_uuid)
1749 lctl.newdev(self.module, self.name, self.uuid,
1750 setup ="%s %s %s %s" % (self.target_uuid, srv.nid_uuid,
1751 inactive_p, self.mgmt_name))
# cleanup: tear down the device, disconnect, and remove routes
1754 if is_prepared(self.name):
1755 Module.cleanup(self)
1757 srv = choose_local_server(self.get_servers())
1759 lctl.disconnect(srv)
1761 for (srv, r) in find_route(self.get_servers()):
1762 lctl.del_route_host(r[0], srv.nid_uuid, r[1], r[3])
1763 except CommandError, e:
1764 log(self.module_name, "cleanup failed: ", self.name)
# MDC / OSC: thin Client subclasses fixing the module name.
# NOTE(review): the class header lines and the permits_inactive bodies
# are elided in this excerpt (per visible code, OSC presumably permits
# inactive targets while MDC does not — confirm in the full file).
1770 def __init__(self, db, uuid, fs_name):
1771 Client.__init__(self, db, uuid, 'mdc', fs_name)
1773 def permits_inactive(self):
1777 def __init__(self, db, uuid, fs_name):
1778 Client.__init__(self, db, uuid, 'osc', fs_name)
1780 def permits_inactive(self):
def mgmtcli_name_for_uuid(uuid):
    """Map a management-service UUID to its client device name."""
    name = 'MGMTCLI_%s' % uuid
    return name
class ManagementClient(Client):
    """Client connection to the management service.

    A thin Client subclass: the device name is derived from the mgmt
    service UUID and the code lives in the 'mgmt' module directory.
    """
    def __init__(self, db, uuid):
        Client.__init__(self, db, uuid, 'mgmt_cli', '',
                        module_dir = 'mgmt',
                        self_name = mgmtcli_name_for_uuid(db.getUUID()))
# COBD.__init__: cache OBD — pairs a real OBD with a cache OBD; just
# records the two uuids and registers the cobd kernel module.
# NOTE(review): the class header line is elided in this excerpt.
1793 def __init__(self, db):
1794 Module.__init__(self, 'COBD', db)
1795 self.real_uuid = self.db.get_first_ref('realobd')
1796 self.cache_uuid = self.db.get_first_ref('cacheobd')
1797 self.add_lustre_module('cobd' , 'cobd')
1799 # need to check /proc/mounts and /etc/mtab before
1800 # formatting anything.
1801 # FIXME: check if device is already formatted.
1803 if is_prepared(self.name):
1805 self.info(self.real_uuid, self.cache_uuid)
1806 lctl.newdev("cobd", self.name, self.uuid,
1807 setup ="%s %s" %(self.real_uuid, self.cache_uuid))
1810 # virtual interface for OSC and LOV
# VOSC: virtual interface over OSC and LOV — picks the right client
# implementation based on the db class and forwards everything to it.
# NOTE(review): the class header, the else branch and the accessor
# def lines are elided in this excerpt (discontinuous numbering).
1812 def __init__(self, db, uuid, fs_name, name_override = None):
1813 Module.__init__(self, 'VOSC', db)
1814 if db.get_class() == 'lov':
1815 self.osc = LOV(db, uuid, fs_name, name_override)
1817 self.osc = get_osc(db, uuid, fs_name)
1819 return self.osc.uuid
1821 return self.osc.name
1826 def load_module(self):
1827 self.osc.load_module()
1828 def cleanup_module(self):
1829 self.osc.cleanup_module()
# ECHO_CLIENT: test client that attaches an echo_client device on top
# of a VOSC (OSC or LOV) for I/O benchmarking/testing.
# NOTE(review): method headers for prepare/cleanup are elided in this
# excerpt (discontinuous numbering) — verify in the full file.
1832 class ECHO_CLIENT(Module):
1833 def __init__(self,db):
1834 Module.__init__(self, 'ECHO_CLIENT', db)
1835 self.add_lustre_module('obdecho', 'obdecho')
1836 self.obd_uuid = self.db.get_first_ref('obd')
1837 obd = self.db.lookup(self.obd_uuid)
1838 self.uuid = generate_client_uuid(self.name)
1839 self.osc = VOSC(obd, self.uuid, self.name)
1842 if is_prepared(self.name):
1845 self.osc.prepare() # XXX This is so cheating. -p
1846 self.info(self.obd_uuid)
1848 lctl.newdev("echo_client", self.name, self.uuid,
1849 setup = self.osc.get_name())
1852 if is_prepared(self.name):
1853 Module.cleanup(self)
1856 def load_module(self):
1857 self.osc.load_module()
1858 Module.load_module(self)
# unload in reverse order of load
1860 def cleanup_module(self):
1861 Module.cleanup_module(self)
1862 self.osc.cleanup_module()
# Build a pseudo-random client uuid embedding (a truncated copy of)
# the service name, capped at the 36-character uuid length.
# NOTE(review): one argument line of the % tuple is elided in this
# excerpt (presumably the name itself) — confirm in the full file.
1865 def generate_client_uuid(name):
1866 client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
1868 int(random.random() * 1048576),
1869 int(random.random() * 1048576))
1870 return client_uuid[:36]
1873 def my_rstrip(s, chars):
1874 """my_rstrip(s, chars) -> strips any instances of the characters
1875 found in chars from the right side of string s"""
1876 # XXX required because python versions pre 2.2.3 don't allow
1877 # string.rstrip() to take alternate char lists
# Try the two-argument rstrip first; on old interpreters that raises
# TypeError and a manual right-to-left scan is used instead.
# NOTE(review): the fallback-loop body and return lines are elided in
# this excerpt (discontinuous numbering).
1881 ns = string.rstrip(s, '/')
1882 except TypeError, e:
1883 for i in range(len(s) - 1, 0, -1):
# Mountpoint: a client-side lustre_lite mount. Ties together the data
# path (VOSC: OSC or LOV), the metadata client (MDC) and optionally a
# management client, then performs the actual mount/umount.
# NOTE(review): several lines (method headers for prepare/cleanup,
# guards, else branches) are elided in this excerpt — verify control
# flow against the complete file.
1892 class Mountpoint(Module):
1893 def __init__(self,db):
1894 Module.__init__(self, 'MTPT', db)
1895 self.path = my_rstrip(self.db.get_val('path'), '/')
1896 self.fs_uuid = self.db.get_first_ref('filesystem')
1897 fs = self.db.lookup(self.fs_uuid)
1898 self.mds_uuid = fs.get_first_ref('mds')
1899 self.obd_uuid = fs.get_first_ref('obd')
1900 self.mgmt_uuid = fs.get_first_ref('mgmt')
1901 obd = self.db.lookup(self.obd_uuid)
1902 client_uuid = generate_client_uuid(self.name)
1903 self.vosc = VOSC(obd, client_uuid, self.name)
1904 self.mdc = get_mdc(db, client_uuid, self.name, self.mds_uuid)
1906 self.add_lustre_module('mdc', 'mdc')
1907 self.add_lustre_module('llite', 'llite')
1909 self.mgmtcli = ManagementClient(db.lookup(self.mgmt_uuid),
# prepare: set up clients then mount the filesystem
1915 if fs_is_mounted(self.path):
1916 log(self.path, "already mounted.")
1920 self.mgmtcli.prepare()
1923 mdc_name = self.mdc.name
1925 self.info(self.path, self.mds_uuid, self.obd_uuid)
# when recording/dumping, emit the mount option instead of mounting
1926 if config.record or config.lctl_dump:
1927 lctl.mount_option(local_node_name, self.vosc.get_name(), mdc_name)
1929 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s %s %s" % \
1930 (self.vosc.get_name(), mdc_name, config.config, self.path)
1931 run("mkdir", self.path)
1936 panic("mount failed:", self.path, ":", string.join(val))
# cleanup: umount (force first under --force), then tear down clients
1939 self.info(self.path, self.mds_uuid,self.obd_uuid)
1941 if config.record or config.lctl_dump:
1942 lctl.del_mount_option(local_node_name)
1944 if fs_is_mounted(self.path):
1946 (rc, out) = run("umount", "-f", self.path)
1948 (rc, out) = run("umount", self.path)
1950 raise CommandError('umount', out, rc)
1952 if fs_is_mounted(self.path):
1953 panic("fs is still mounted:", self.path)
1958 self.mgmtcli.cleanup()
1960 def load_module(self):
1962 self.mgmtcli.load_module()
1963 self.vosc.load_module()
1964 Module.load_module(self)
1966 def cleanup_module(self):
1967 Module.cleanup_module(self)
1968 self.vosc.cleanup_module()
1970 self.mgmtcli.cleanup_module()
1973 # ============================================================
1974 # misc query functions
# get_ost_net: given an osd uuid, collect the Network objects of the
# node hosting it (one per configured network interface).
# NOTE(review): guard and return lines are elided in this excerpt.
1976 def get_ost_net(self, osd_uuid):
1980 osd = self.lookup(osd_uuid)
1981 node_uuid = osd.get_first_ref('node')
1982 node = self.lookup(node_uuid)
1984 panic("unable to find node for osd_uuid:", osd_uuid,
1985 " node_ref:", node_uuid)
1986 for net_uuid in node.get_networks():
1987 db = node.lookup(net_uuid)
1988 srv_list.append(Network(db))
1992 # the order of initialization is based on level.
# getServiceLevel: map a config-db class to its startup level, and
# zero the level when it falls outside [minlevel, maxlevel] so the
# service is skipped.
# NOTE(review): the `ret = N` assignment lines are elided in this
# excerpt — the per-class level values are not visible here.
1993 def getServiceLevel(self):
1994 type = self.get_class()
1996 if type in ('network',):
1998 elif type in ('routetbl',):
2000 elif type in ('ldlm',):
2002 elif type in ('mgmt',):
2004 elif type in ('osd', 'cobd'):
2006 elif type in ('mdsdev',):
2008 elif type in ('mountpoint', 'echoclient'):
2011 panic("Unknown type: ", type)
2013 if ret < config.minlevel or ret > config.maxlevel:
2018 # return list of services in a profile. list is a list of tuples
2019 # [(level, db_object),]
# NOTE(review): list init, sort and return lines are elided in this
# excerpt (discontinuous numbering).
2020 def getServices(self):
2022 for ref_class, ref_uuid in self.get_all_refs():
2023 servdb = self.lookup(ref_uuid)
2025 level = getServiceLevel(servdb)
# level 0 means "outside min/max level" — such services are skipped
2027 list.append((level, servdb))
2029 panic('service not found: ' + ref_uuid)
2035 ############################################################
2037 # FIXME: clean this mess up!
2039 # OSC is no longer in the xml, so we have to fake it.
2040 # this is getting ugly and begging for another refactoring
# Factory helpers building OSC/MDC client objects from db records.
# NOTE(review): the `return` lines are elided in this excerpt.
2041 def get_osc(ost_db, uuid, fs_name):
2042 osc = OSC(ost_db, uuid, fs_name)
2045 def get_mdc(db, uuid, fs_name, mds_uuid):
2046 mds_db = db.lookup(mds_uuid);
2048 panic("no mds:", mds_uuid)
2049 mdc = MDC(mds_db, uuid, fs_name)
2052 ############################################################
2053 # routing ("rooting")
2055 # list of (nettype, cluster_id, nid)
# find_local_clusters: record every network this node belongs to and
# set up an acceptor handler per listening port.
# NOTE(review): the Network construction and port guard lines are
# elided in this excerpt (discontinuous numbering).
2058 def find_local_clusters(node_db):
2059 global local_clusters
2060 for netuuid in node_db.get_networks():
2061 net = node_db.lookup(netuuid)
2063 debug("add_local", netuuid)
2064 local_clusters.append((srv.net_type, srv.cluster_id, srv.nid))
2066 if acceptors.has_key(srv.port):
2067 panic("duplicate port:", srv.port)
2068 acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
2069 srv.send_mem, srv.recv_mem,
2072 # This node is a gateway.
# NOTE(review): node_is_router's return line is elided in this
# excerpt (presumably `return is_router` — confirm in the full file).
2074 def node_is_router():
2077 # If there are any routers found in the config, then this will be true
2078 # and all nodes will load kptlrouter.
2080 def node_needs_router():
2081 return needs_router or is_router
2083 # list of (nettype, gw, tgt_cluster_id, lo, hi)
2084 # Currently, these local routes are only added to kptlrouter route
2085 # table if they are needed to connect to a specific server. This
2086 # should be changed so all available routes are loaded, and the
2087 # ptlrouter can make all the decisions.
# find_local_routes: for every router node in the config, find a
# gateway nid on one of this node's local networks and collect the
# routes reachable through it.
# NOTE(review): the outer loop header, needs_router flag setting and
# list initialization lines are elided in this excerpt.
2090 def find_local_routes(lustre):
2091 """ Scan the lustre config looking for routers . Build list of
2093 global local_routes, needs_router
2095 list = lustre.lookup_class('node')
2097 if router.get_val_int('router', 0):
2099 for (local_type, local_cluster_id, local_nid) in local_clusters:
2101 for netuuid in router.get_networks():
2102 db = router.lookup(netuuid)
# gateway must share both nettype and cluster id with us
2103 if (local_type == db.get_val('nettype') and
2104 local_cluster_id == db.get_val('clusterid')):
2105 gw = db.get_val('nid')
2108 debug("find_local_routes: gw is", gw)
2109 for route in router.get_local_routes(local_type, gw):
2110 local_routes.append(route)
2111 debug("find_local_routes:", local_routes)
# Predicates over local_clusters: pick a directly-reachable server,
# or test whether a (net_type, cluster_id[, nid]) is local.
# NOTE(review): the return lines of all three functions are elided in
# this excerpt (discontinuous numbering).
2114 def choose_local_server(srv_list):
2115 for srv in srv_list:
2116 if local_cluster(srv.net_type, srv.cluster_id):
2119 def local_cluster(net_type, cluster_id):
2120 for cluster in local_clusters:
2121 if net_type == cluster[0] and cluster_id == cluster[1]:
2125 def local_interface(net_type, cluster_id, nid):
2126 for cluster in local_clusters:
2127 if (net_type == cluster[0] and cluster_id == cluster[1]
2128 and nid == cluster[2]):
# find_route: for each server, find local routes whose target cluster
# matches and whose [lo, hi] nid range covers the server's nid;
# returns a list of (server, route) pairs.
# NOTE(review): result init, `to = srv.nid` and return lines are
# elided in this excerpt (discontinuous numbering).
2132 def find_route(srv_list):
2134 frm_type = local_clusters[0][0]
2135 for srv in srv_list:
2136 debug("find_route: srv:", srv.nid, "type: ", srv.net_type)
2137 to_type = srv.net_type
2139 cluster_id = srv.cluster_id
2140 debug ('looking for route to', to_type, to)
2141 for r in local_routes:
2142 debug("find_route: ", r)
# r = (nettype, gw, tgt_cluster_id, lo, hi)
2143 if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
2144 result.append((srv, r))
# get_active_target: resolve a target to its active device uuid,
# honoring a --select override for the target when present.
# get_server_by_nid_uuid: find the Network record matching a nid uuid.
# NOTE(review): branch and return lines are elided in this excerpt.
2147 def get_active_target(db):
2148 target_uuid = db.getUUID()
2149 target_name = db.getName()
2150 node_name = get_select(target_name)
2152 tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
2154 tgt_dev_uuid = db.get_first_ref('active')
2157 def get_server_by_nid_uuid(db, nid_uuid):
2158 for n in db.lookup_class("network"):
2160 if net.nid_uuid == nid_uuid:
2164 ############################################################
2168 type = db.get_class()
2169 debug('Service:', type, db.getName(), db.getUUID())
2174 n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
2175 elif type == 'network':
2177 elif type == 'routetbl':
2181 elif type == 'cobd':
2183 elif type == 'mdsdev':
2185 elif type == 'mountpoint':
2187 elif type == 'echoclient':
2189 elif type == 'mgmt':
2192 panic ("unknown service type:", type)
2196 # Prepare the system to run lustre using a particular profile
2197 # in a the configuration.
2198 # * load & the modules
2199 # * setup networking for the current node
2200 # * make sure partitions are in place and prepared
2201 # * initialize devices with lctl
2202 # Levels is important, and needs to be enforced.
# Apply `operation` (doSetup/doCleanup/...) to the level-sorted
# services of every profile uuid in prof_list.
# NOTE(review): the not-found guard and the operation call are elided
# in this excerpt (discontinuous numbering).
2203 def for_each_profile(db, prof_list, operation):
2204 for prof_uuid in prof_list:
2205 prof_db = db.lookup(prof_uuid)
2207 panic("profile:", profile, "not found.")
2208 services = getServices(prof_db)
# Per-profile operations: each instantiates the service wrapper via
# newService and invokes one phase (write_conf / prepare / modules /
# cleanup / module unload). Cleanup phases iterate services in
# reverse and consult safe_to_clean* before acting.
# NOTE(review): loop headers, nosetup/nomod guards and the actual
# phase-method calls are elided in this excerpt — confirm each body
# against the complete file.
2211 def doWriteconf(services):
2215 if s[1].get_class() == 'mdsdev':
2216 n = newService(s[1])
2219 def doSetup(services):
2223 n = newService(s[1])
2226 def doModules(services):
2230 n = newService(s[1])
2233 def doCleanup(services):
2238 n = newService(s[1])
2239 if n.safe_to_clean():
2242 def doUnloadModules(services):
2247 n = newService(s[1])
2248 if n.safe_to_clean_modules():
# doHost: top-level driver for one node. Finds this host's node entry,
# reads node-level tunables, discovers local networks/routes, then
# dispatches on the requested mode: --write_conf, --recover,
# --cleanup, or normal setup.
# NOTE(review): many lines (loop over hosts, else branches, gdb sleep,
# acceptor startup) are elided in this excerpt — verify control flow
# against the complete file.
2253 def doHost(lustreDB, hosts):
2254 global is_router, local_node_name
2257 node_db = lustreDB.lookup_name(h, 'node')
2261 panic('No host entry found.')
2263 local_node_name = node_db.get_val('name', 0)
2264 is_router = node_db.get_val_int('router', 0)
2265 lustre_upcall = node_db.get_val('lustreUpcall', '')
2266 portals_upcall = node_db.get_val('portalsUpcall', '')
2267 timeout = node_db.get_val_int('timeout', 0)
2268 ptldebug = node_db.get_val('ptldebug', '')
2269 subsystem = node_db.get_val('subsystem', '')
2271 find_local_clusters(node_db)
2273 find_local_routes(lustreDB)
2275 # Two step process: (1) load modules, (2) setup lustre
2276 # if not cleaning, load modules first.
2277 prof_list = node_db.get_refs('profile')
2279 if config.write_conf:
2281 for_each_profile(node_db, prof_list, doModules)
2283 for_each_profile(node_db, prof_list, doWriteconf)
2284 for_each_profile(node_db, prof_list, doUnloadModules)
2286 elif config.recover:
2287 if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
2288 raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
2289 "--client_uuid <UUID> --conn_uuid <UUID>")
2290 doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
2292 elif config.cleanup:
2294 # the command line can override this value
2296 # ugly hack, only need to run lctl commands for --dump
2297 if config.lctl_dump or config.record:
2298 for_each_profile(node_db, prof_list, doCleanup)
# full cleanup: apply tunables, tear down services, unload modules
2301 sys_set_timeout(timeout)
2302 sys_set_ptldebug(ptldebug)
2303 sys_set_subsystem(subsystem)
2304 sys_set_lustre_upcall(lustre_upcall)
2305 sys_set_portals_upcall(portals_upcall)
2307 for_each_profile(node_db, prof_list, doCleanup)
2308 for_each_profile(node_db, prof_list, doUnloadModules)
2312 # ugly hack, only need to run lctl commands for --dump
2313 if config.lctl_dump or config.record:
2314 sys_set_timeout(timeout)
2315 sys_set_lustre_upcall(lustre_upcall)
2316 for_each_profile(node_db, prof_list, doSetup)
# normal setup: bump socket buffer limits, load modules, apply
# debug settings, optionally emit a gdb module script, then set up
2320 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
2321 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
2323 for_each_profile(node_db, prof_list, doModules)
2325 sys_set_debug_path()
2326 sys_set_ptldebug(ptldebug)
2327 sys_set_subsystem(subsystem)
2328 script = config.gdb_script
2329 run(lctl.lctl, ' modules >', script)
2331 log ("The GDB module script is in", script)
2332 # pause, so user has time to break and
2335 sys_set_timeout(timeout)
2336 sys_set_lustre_upcall(lustre_upcall)
2337 sys_set_portals_upcall(portals_upcall)
2339 for_each_profile(node_db, prof_list, doSetup)
# doRecovery: reconnect a client to the (possibly failed-over) active
# target — disconnect the old nid, then trigger lctl recovery against
# the newly chosen server connection.
# NOTE(review): guard lines and the reconnect call are elided in this
# excerpt (discontinuous numbering).
2342 def doRecovery(lustreDB, lctl, tgt_uuid, client_uuid, nid_uuid):
2343 tgt = lustreDB.lookup(tgt_uuid)
2345 raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
2346 new_uuid = get_active_target(tgt)
2348 raise Lustre.LconfError("doRecovery: no active target found for: " +
2350 net = choose_local_server(get_ost_net(lustreDB, new_uuid))
2352 raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
2354 log("Reconnecting", tgt_uuid, " to ", net.nid_uuid);
2356 oldnet = get_server_by_nid_uuid(lustreDB, nid_uuid)
# best effort: a failed disconnect of the dead nid is only logged
2359 lctl.disconnect(oldnet)
2360 except CommandError, e:
2361 log("recover: disconnect", nid_uuid, "failed: ")
2366 except CommandError, e:
2367 log("recover: connect failed")
2370 lctl.recover(client_uuid, net.nid_uuid)
# setupModulePath: in development mode derive config.lustre/portals
# from the script location so modules load from the source tree;
# otherwise just normalize a user-supplied portals path.
# NOTE(review): a few guard lines are elided in this excerpt.
2373 def setupModulePath(cmd, portals_dir = PORTALS_DIR):
2374 base = os.path.dirname(cmd)
2375 if development_mode():
2376 if not config.lustre:
2377 debug('using objdir module paths')
2378 config.lustre = (os.path.join(base, ".."))
2379 # normalize the portals dir, using command line arg if set
2381 portals_dir = config.portals
2382 dir = os.path.join(config.lustre, portals_dir)
2383 config.portals = dir
2384 debug('config.portals', config.portals)
2385 elif config.lustre and config.portals:
2387 # if --lustre and --portals, normalize portals
2388 # can ignore PORTALS_DIR here, since it is probably useless here
2389 config.portals = os.path.join(config.lustre, config.portals)
2390 debug('config.portals B', config.portals)
# sysctl: write a value to /proc/sys/<path>, honoring --noexec.
# NOTE(review): the noexec guard, try/except and the write/close lines
# are elided in this excerpt (discontinuous numbering).
2392 def sysctl(path, val):
2393 debug("+ sysctl", path, val)
2397 fp = open(os.path.join('/proc/sys', path), 'w')
def sys_set_debug_path():
    """Point the kernel debug dump path at the configured location."""
    dump_path = config.debug_path
    sysctl('portals/debug_path', dump_path)
# Upcall setters: command-line --lustre_upcall/--portals_upcall (or
# the generic --upcall) override the node-config value; the lustre
# upcall goes through lctl, the portals one straight to /proc.
# NOTE(review): the elif/if-set guard lines are elided in this
# excerpt (discontinuous numbering).
2407 def sys_set_lustre_upcall(upcall):
2408 # the command overrides the value in the node config
2409 if config.lustre_upcall:
2410 upcall = config.lustre_upcall
2412 upcall = config.upcall
2414 lctl.set_lustre_upcall(upcall)
2416 def sys_set_portals_upcall(upcall):
2417 # the command overrides the value in the node config
2418 if config.portals_upcall:
2419 upcall = config.portals_upcall
2421 upcall = config.upcall
2423 sysctl('portals/upcall', upcall)
def sys_set_timeout(timeout):
    """Set the lustre recovery timeout through lctl.

    The --timeout command line value overrides the node-config value;
    a missing or non-positive timeout leaves the system default alone.
    """
    # the command overrides the value in the node config
    if config.timeout and config.timeout > 0:
        timeout = config.timeout
    # idiom fix: identity comparison with None (`is not None`)
    # instead of `!= None`
    if timeout is not None and timeout > 0:
        lctl.set_timeout(timeout)
def sys_tweak_socknal():
    """Apply socknal tuning: --single_socket disables typed sockets."""
    if not config.single_socket:
        return
    sysctl("socknal/typed", 0)
# sys_optimize_elan: enable interrupt punting on whichever Quadrics
# Elan /proc tunables exist on this kernel.
# NOTE(review): the `for` loop header over procfiles is elided in
# this excerpt — `p` is the loop variable; confirm in the full file.
2436 def sys_optimize_elan ():
2437 procfiles = ["/proc/elan/config/eventint_punt_loops",
2438 "/proc/qsnet/elan3/config/eventint_punt_loops",
2439 "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]
2441 if os.access(p, os.W_OK):
2442 run ("echo 1 > " + p)
# Debug-mask setters: evaluate a symbolic expression (e.g.
# "trace|dlmtrace") against the ptldebug_names/subsystem_names tables
# from the file header and write the hex mask to /proc.
# NOTE(review): the command-line override guard, try lines and the
# NameError panic bodies are elided in this excerpt.
2444 def sys_set_ptldebug(ptldebug):
2446 ptldebug = config.ptldebug
# eval against a fixed name table only — input is the admin's own
# config/command line, not untrusted data
2449 val = eval(ptldebug, ptldebug_names)
2450 val = "0x%x" % (val)
2451 sysctl('portals/debug', val)
2452 except NameError, e:
2455 def sys_set_subsystem(subsystem):
2456 if config.subsystem:
2457 subsystem = config.subsystem
2460 val = eval(subsystem, subsystem_names)
2461 val = "0x%x" % (val)
2462 sysctl('portals/subsystem_debug', val)
2463 except NameError, e:
# sys_set_netmem_max: raise a /proc/sys/net/core/*mem_max limit to at
# least `max`.
# NOTE(review): the read-current-value/compare logic is elided in
# this excerpt (discontinuous numbering) — confirm in the full file.
2466 def sys_set_netmem_max(path, max):
2467 debug("setting", path, "to at least", max)
2475 fp = open(path, 'w')
2476 fp.write('%d\n' %(max))
def sys_make_devices():
    """Create the portals and obd character device nodes if absent."""
    device_nodes = (('/dev/portals', 'mknod /dev/portals c 10 240'),
                    ('/dev/obd', 'mknod /dev/obd c 10 241'))
    for node, mknod_cmd in device_nodes:
        if not os.access(node, os.R_OK):
            run(mknod_cmd)
2487 # Add dir to the global PATH, if not already there.
# NOTE(review): the early-return line after the membership test is
# elided in this excerpt (discontinuous numbering).
2488 def add_to_path(new_dir):
2489 syspath = string.split(os.environ['PATH'], ':')
2490 if new_dir in syspath:
2492 os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
# Default locations for debug dumps and the gdb script; both prefer
# the ramdisk mount under /r when it exists.
# NOTE(review): the return lines for both branches are elided in this
# excerpt (discontinuous numbering).
2494 def default_debug_path():
2495 path = '/tmp/lustre-log'
2496 if os.path.isdir('/r'):
2501 def default_gdb_script():
2502 script = '/tmp/ogdb'
2503 if os.path.isdir('/r'):
2504 return '/r' + script
2509 DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
2510 # ensure basic elements are in the system path
# NOTE(review): the loop body (presumably add_to_path(dir)) is elided
# in this excerpt — confirm in the full file.
2511 def sanitise_path():
2512 for dir in DEFAULT_PATH:
2515 # global hack for the --select handling
# init_select parses "service=node" pairs into the tgt_select map;
# get_select returns the chosen node for a service, if any.
# NOTE(review): the outer loop over args, the tgt_select init and the
# no-match return are elided in this excerpt.
2517 def init_select(args):
2518 # args = [service=nodeA,service2=nodeB service3=nodeC]
2521 list = string.split(arg, ',')
2523 srv, node = string.split(entry, '=')
2524 tgt_select[srv] = node
2526 def get_select(srv):
2527 if tgt_select.has_key(srv):
2528 return tgt_select[srv]
# Short aliases for the option-type constants used in lconf_options.
2532 FLAG = Lustre.Options.FLAG
2533 PARAM = Lustre.Options.PARAM
2534 INTPARAM = Lustre.Options.INTPARAM
2535 PARAMLIST = Lustre.Options.PARAMLIST
2537 ('verbose,v', "Print system commands as they are run"),
2538 ('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM),
2539 ('config', "Cluster config name used for LDAP query", PARAM),
2540 ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
2541 ('node', "Load config for <nodename>", PARAM),
2542 ('cleanup,d', "Cleans up config. (Shutdown)"),
2543 ('force,f', "Forced unmounting and/or obd detach during cleanup",
2545 ('single_socket', "socknal option: only use one socket instead of bundle",
2547 ('failover',"""Used to shut down without saving state.
2548 This will allow this node to "give up" a service to a
2549 another node for failover purposes. This will not
2550 be a clean shutdown.""",
2552 ('gdb', """Prints message after creating gdb module script
2553 and sleeps for 5 seconds."""),
2554 ('noexec,n', """Prints the commands and steps that will be run for a
2555 config without executing them. This can used to check if a
2556 config file is doing what it should be doing"""),
2557 ('nomod', "Skip load/unload module step."),
2558 ('nosetup', "Skip device setup/cleanup step."),
2559 ('reformat', "Reformat all devices (without question)"),
2560 ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
2561 ('dump', "Dump the kernel debug log to file before portals is unloaded",
2563 ('write_conf', "Save all the client config information on mds."),
2564 ('record', "Write config information on mds."),
2565 ('record_log', "Name of config record log.", PARAM),
2566 ('record_device', "MDS device name that will record the config commands",
2568 ('minlevel', "Minimum level of services to configure/cleanup",
2570 ('maxlevel', """Maximum level of services to configure/cleanup
2571 Levels are aproximatly like:
2576 70 - mountpoint, echo_client, osc, mdc, lov""",
2578 ('lustre', """Base directory of lustre sources. This parameter will
2579 cause lconf to load modules from a source tree.""", PARAM),
2580 ('portals', """Portals source directory. If this is a relative path,
2581 then it is assumed to be relative to lustre. """, PARAM),
2582 ('timeout', "Set recovery timeout", INTPARAM),
2583 ('upcall', "Set both portals and lustre upcall script", PARAM),
2584 ('lustre_upcall', "Set lustre upcall script", PARAM),
2585 ('portals_upcall', "Set portals upcall script", PARAM),
2586 ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
2587 ('ptldebug', "Set the portals debug level", PARAM),
2588 ('subsystem', "Set the portals debug subsystem", PARAM),
2589 ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
2590 ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
2591 # Client recovery options
2592 ('recover', "Recover a device"),
2593 ('group', "The group of devices to configure or cleanup", PARAM),
2594 ('tgt_uuid', "The failed target (required for recovery)", PARAM),
2595 ('client_uuid', "The failed client (required for recovery)", PARAM),
2596 ('conn_uuid', "The failed connection (required for recovery)", PARAM),
2598 ('inactive', """The name of an inactive service, to be ignored during
2599 mounting (currently OST-only). Can be repeated.""",
2604 global lctl, config, toplustreDB, CONFIG_FILE
2606 # in the upcall this is set to SIG_IGN
2607 signal.signal(signal.SIGCHLD, signal.SIG_DFL)
2609 cl = Lustre.Options("lconf", "config.xml", lconf_options)
2611 config, args = cl.parse(sys.argv[1:])
2612 except Lustre.OptionError, e:
2616 setupModulePath(sys.argv[0])
2618 host = socket.gethostname()
2620 # the PRNG is normally seeded with time(), which is not so good for starting
2621 # time-synchronized clusters
2622 input = open('/dev/urandom', 'r')
2624 print 'Unable to open /dev/urandom!'
2626 seed = input.read(32)
2632 init_select(config.select)
2635 # allow config to be fetched via HTTP, but only with python2
2636 if sys.version[0] != '1' and args[0].startswith('http://'):
2639 config_file = urllib2.urlopen(args[0])
2640 except (urllib2.URLError, socket.error), err:
2641 if hasattr(err, 'args'):
2643 print "Could not access '%s': %s" %(args[0], err)
2645 elif not os.access(args[0], os.R_OK):
2646 print 'File not found or readable:', args[0]
2650 config_file = open(args[0], 'r')
2652 dom = xml.dom.minidom.parse(config_file)
2654 panic("%s does not appear to be a config file." % (args[0]))
2655 sys.exit(1) # make sure to die here, even in debug mode.
2657 CONFIG_FILE = args[0]
2658 lustreDB = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
2659 if not config.config:
2660 config.config = os.path.basename(args[0])# use full path?
2661 if config.config[-4:] == '.xml':
2662 config.config = config.config[:-4]
2663 elif config.ldapurl:
2664 if not config.config:
2665 panic("--ldapurl requires --config name")
2666 dn = "config=%s,fs=lustre" % (config.config)
2667 lustreDB = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
2668 elif config.ptldebug or config.subsystem:
2669 sys_set_ptldebug(None)
2670 sys_set_subsystem(None)
2673 print 'Missing config file or ldap URL.'
2674 print 'see lconf --help for command summary'
2677 toplustreDB = lustreDB
2679 ver = lustreDB.get_version()
2681 panic("No version found in config data, please recreate.")
2682 if ver != Lustre.CONFIG_VERSION:
2683 panic("Config version", ver, "does not match lconf version",
2684 Lustre.CONFIG_VERSION)
2688 node_list.append(config.node)
2691 node_list.append(host)
2692 node_list.append('localhost')
2694 debug("configuring for host: ", node_list)
2697 config.debug_path = config.debug_path + '-' + host
2698 config.gdb_script = config.gdb_script + '-' + host
2700 lctl = LCTLInterface('lctl')
2702 if config.lctl_dump:
2703 lctl.use_save_file(config.lctl_dump)
2706 if not (config.record_device and config.record_log):
2707 panic("When recording, both --record_log and --record_device must be specified.")
2708 lctl.clear_log(config.record_device, config.record_log)
2709 lctl.record(config.record_device, config.record_log)
2711 doHost(lustreDB, node_list)
# Script entry point: run main(), turning known exception types into
# clean exits; cleanup errors collected along the way set the exit
# status so callers can detect partial teardown.
# NOTE(review): the try/main() call and exception-body lines are
# elided in this excerpt (discontinuous numbering).
2716 if __name__ == "__main__":
2719 except Lustre.LconfError, e:
2721 # traceback.print_exc(file=sys.stdout)
2723 except CommandError, e:
2727 if first_cleanup_error:
2728 sys.exit(first_cleanup_error)