5 # DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
7 # This program is free software; you can redistribute it and/or modify
8 # it under the terms of the GNU General Public License version 2 only,
9 # as published by the Free Software Foundation.
11 # This program is distributed in the hope that it will be useful, but
12 # WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 # General Public License version 2 for more details (a copy is included
15 # in the LICENSE file that accompanied this code).
17 # You should have received a copy of the GNU General Public License
18 # version 2 along with this program; If not, see
19 # http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
22 # Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
23 # CA 95054 USA or visit www.sun.com if you need additional information or
30 # Copyright 2008 Sun Microsystems, Inc. All rights reserved
31 # Use is subject to license terms.
35 # This file is part of Lustre, http://www.lustre.org/
36 # Lustre is a trademark of Sun Microsystems, Inc.
38 # Author: Robert Read <rread@clusterfs.com>
39 # Author: Mike Shaver <shaver@clusterfs.com>
41 # lconf - lustre configuration tool
43 # lconf is the main driver script for starting and stopping
44 # lustre filesystem services.
46 # Based in part on the XML obdctl modifications done by Brian Behlendorf
48 import sys, getopt, types, errno
49 import string, os, stat, popen2, socket, time, random, fcntl, select
50 import re, exceptions, signal, traceback
51 import xml.dom.minidom
53 if sys.version[0] == '1':
54 from FCNTL import F_GETFL, F_SETFL
56 from fcntl import F_GETFL, F_SETFL
58 PYMOD_DIR = ["/usr/lib64/lustre/python", "/usr/lib/lustre/python"]
61 if string.find(sys.platform, 'linux') != -1:
63 elif string.find(sys.platform, 'darwin') != -1:
65 KEXTPATH='/System/Library/Extensions/'
67 PLATFORM='Unsupported'
69 def development_mode():
70 base = os.path.dirname(sys.argv[0])
71 if os.access(base+"/Makefile", os.R_OK):
75 if development_mode():
76 sys.path.append('../utils')
78 sys.path.extend(PYMOD_DIR)
85 # Maximum number of devices to search for.
86 # (the /dev/loop* nodes need to be created beforehand)
87 MAX_LOOP_DEVICES = 256
88 PORTALS_DIR = '../lnet'
90 # Needed to call lconf --record
93 # Please keep these in sync with the values in lnet/include/libcfs/libcfs.h
105 "warning" : (1 << 10),
108 "dentry" : (1 << 13),
109 "portals" : (1 << 14), # deprecated
112 "dlmtrace" : (1 << 16),
116 "rpctrace" : (1 << 20),
117 "vfstrace" : (1 << 21),
120 "config" : (1 << 24),
121 "console" : (1 << 25),
127 "undefined" : (1 << 0),
137 "portals" : (1 << 10), # deprecated
139 "nal" : (1 << 11), # deprecated
140 "pinger" : (1 << 12),
141 "filter" : (1 << 13),
142 "ptlbd" : (1 << 14), # deprecated
146 "ptlrouter" : (1 << 18), # deprecated
150 "confobd" : (1 << 22), # deprecated
# Holds the exit status of the first cleanup step that failed (0 = none yet).
first_cleanup_error = 0

def cleanup_error(rc):
    """Record rc as the overall cleanup status, keeping only the first failure."""
    global first_cleanup_error
    if first_cleanup_error:
        # A failure was already recorded; later errors are ignored.
        return
    first_cleanup_error = rc
168 # ============================================================
169 # debugging and error funcs
def fixme(msg = "this feature"):
    """Abort by raising Lustre.LconfError reporting that msg is unimplemented.

    Uses the exception-call form (raise E(arg)) rather than the old
    'raise E, arg' comma syntax, matching the style used for the other
    LconfError raises in this file.
    """
    raise Lustre.LconfError(msg + ' not implemented yet.')
175 msg = string.join(map(str,args))
176 if not config.noexec:
177 raise Lustre.LconfError(msg)
182 msg = string.join(map(str,args))
187 print string.strip(s)
190 # apparently, (non)execution of the following line affects mds device
191 # startup order (e.g. two mds's using loopback devices), so always do it.
192 msg = string.join(map(str,args))
196 # ack, python's builtin int() does not support '0x123' syntax.
197 # eval can do it, although what a hack!
200 if type(s) is types.IntType:
203 if (s[0:2] == '0x') or (s[0:1] == '0'):
204 return eval(s, {}, {})
207 except SyntaxError, e:
208 raise ValueError("not a number")
210 raise ValueError("not a number")
212 raise ValueError("not a number")
214 # ============================================================
215 # locally defined exceptions
216 class CommandError (exceptions.Exception):
217 def __init__(self, cmd_name, cmd_err, rc=None):
218 self.cmd_name = cmd_name
219 self.cmd_err = cmd_err
224 if type(self.cmd_err) == types.StringType:
226 print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
228 print "! %s: %s" % (self.cmd_name, self.cmd_err)
229 elif type(self.cmd_err) == types.ListType:
231 print "! %s (error %d):" % (self.cmd_name, self.rc)
233 print "! %s:" % (self.cmd_name)
234 for s in self.cmd_err:
235 print "> %s" %(string.strip(s))
239 # ============================================================
240 # handle lctl interface
243 Manage communication with lctl
246 def __init__(self, cmd):
248 Initialize close by finding the lctl binary.
250 self.lctl = find_prog(cmd)
252 self.record_device = ''
255 debug('! lctl not found')
258 raise CommandError('lctl', "unable to find lctl binary.")
def use_save_file(self, file):
    """Divert later lctl command batches into the named dump file.

    When save_file is set, run() prepends a 'dump <file>' command so
    the generated commands are recorded instead of only executed.
    """
    self.save_file = file
def record(self, dev_name, logname):
    """Start recording lctl commands into config log logname on dev_name."""
    log("Recording log", logname, "on", dev_name)
    # Remembered here; run() wraps subsequent commands in a record block.
    self.record_log = logname
    self.record_device = dev_name
def end_record(self):
    """Stop recording: announce and clear the active log/device pair."""
    log("End recording log", self.record_log, "on", self.record_device)
    self.record_log = None
    self.record_device = None
def set_nonblock(self, fd):
    """Switch file descriptor fd into non-blocking mode via fcntl."""
    flags = fcntl.fcntl(fd, F_GETFL)
    fcntl.fcntl(fd, F_SETFL, flags | os.O_NDELAY)
280 the cmds are written to stdin of lctl
281 lctl doesn't return errors when run in script mode, so
283 should modify command line to accept multiple commands, or
284 create complex command line options
288 cmds = '\n dump ' + self.save_file + '\n' + cmds
289 elif self.record_device:
293 %s""" % (self.record_device, self.record_log, cmds)
295 debug("+", cmd_line, cmds)
296 if config.noexec: return (0, [])
298 child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
299 child.tochild.write(cmds + "\nq\n")
300 child.tochild.close()
302 # From "Python Cookbook" from O'Reilly
303 outfile = child.fromchild
304 outfd = outfile.fileno()
305 self.set_nonblock(outfd)
306 errfile = child.childerr
307 errfd = errfile.fileno()
308 self.set_nonblock(errfd)
310 outdata = errdata = ''
313 ready = select.select([outfd,errfd],[],[]) # Wait for input
314 if outfd in ready[0]:
315 outchunk = outfile.read()
316 if outchunk == '': outeof = 1
317 outdata = outdata + outchunk
318 if errfd in ready[0]:
319 errchunk = errfile.read()
320 if errchunk == '': erreof = 1
321 errdata = errdata + errchunk
322 if outeof and erreof: break
323 # end of "borrowed" code
326 if os.WIFEXITED(ret):
327 rc = os.WEXITSTATUS(ret)
330 if rc or len(errdata):
331 raise CommandError(self.lctl, errdata, rc)
334 def runcmd(self, *args):
336 run lctl using the command line
338 cmd = string.join(map(str,args))
339 debug("+", self.lctl, cmd)
340 rc, out = run(self.lctl, cmd)
342 raise CommandError(self.lctl, out, rc)
345 def unconfigure_network(self):
346 """get lnet to unreference itself"""
348 network unconfigure"""
351 def clear_log(self, dev, log):
352 """ clear an existing log """
357 quit """ % (dev, log)
360 # create a new connection
361 def add_uuid(self, net_type, uuid, nid):
362 if net_type != 'lnet' and string.find(nid,'@') < 0:
363 nidstr = nid + "@" + net_type
366 cmds = "\n add_uuid %s %s" %(uuid, nidstr)
369 def connect(self, srv):
371 panic('nid_uuid not set for ', srv.net_type, srv.nid)
372 hostaddr = srv.db.get_hostaddr()
373 if len(hostaddr) > 1:
374 panic('multiple --hostaddr for ', srv.nid_uuid, ' not supported')
375 elif len(hostaddr) == 1 and hostaddr[0] != srv.nid:
376 panic('different --hostaddr and --nid for ', srv.nid_uuid, ' not supported')
378 self.add_uuid(srv.net_type, srv.nid_uuid, srv.nid)
381 def recover(self, dev_name, new_conn):
384 recover %s""" %(dev_name, new_conn)
387 # disconnect one connection
388 def disconnect(self, srv):
390 panic('nid_uuid not set for ', srv.net_type, srv.nid)
391 self.del_uuid(srv.nid_uuid)
393 def del_uuid(self, uuid):
400 def attach(self, type, name, uuid):
403 quit""" % (type, name, uuid)
406 def setup(self, name, setup = ""):
410 quit""" % (name, setup)
413 def abort_recovery(self, name):
421 def add_conn(self, name, conn_uuid):
425 quit""" % (name, conn_uuid)
428 # create a new device with lctl
429 def newdev(self, type, name, uuid, setup = ""):
430 self.attach(type, name, uuid);
432 self.setup(name, setup)
433 except CommandError, e:
434 self.cleanup(name, uuid, 0)
436 if (config.abort_recovery):
437 if (type == 'obdfilter' or type == 'mds'):
438 self.abort_recovery(name)
441 def cleanup(self, name, uuid, force, failover = 0):
442 if failover: force = 1
448 quit""" % (name, ('', 'force')[force],
449 ('', 'failover')[failover])
453 def lov_setup(self, name, uuid, desc_uuid, mdsuuid, stripe_cnt,
454 stripe_sz, stripe_off, pattern):
457 lov_setup %s %d %d %d %s
458 quit""" % (name, uuid, desc_uuid, stripe_cnt, stripe_sz, stripe_off, pattern)
461 # add an OBD to a LOV
462 def lov_add_obd(self, name, uuid, obd_uuid, index, gen):
465 lov_modify_tgts add %s %s %s %s
466 quit""" % (name, name, obd_uuid, index, gen)
469 # delete an OBD from a LOV
470 def lov_del_obd(self, name, uuid, obd_uuid, index, gen):
473 lov_modify_tgts del %s %s %s %s
474 quit""" % (name, name, obd_uuid, index, gen)
478 def deactivate(self, name):
486 def dump(self, dump_file):
489 quit""" % (dump_file)
492 # get list of devices
493 def device_list(self):
495 if PLATFORM == 'LINUX':
496 devices = '/proc/fs/lustre/devices'
497 if os.access(devices, os.R_OK):
499 fp = open(devices, 'r')
504 elif PLATFORM == 'DARWIN':
505 rc, out = self.run("device_list")
506 ret = out.split("\n")
511 # remove the last empty line
516 def lustre_version(self):
517 rc, out = self.runcmd('version')
521 def mount_option(self, profile, osc, mdc):
523 mount_option %s %s %s
524 quit""" % (profile, osc, mdc)
527 # delete mount options
528 def del_mount_option(self, profile):
534 def set_timeout(self, timeout):
541 def set_lustre_upcall(self, upcall):
546 # ============================================================
547 # Various system-level functions
548 # (ideally moved to their own module)
550 # Run a command and return the output and status.
551 # stderr is sent to /dev/null, could use popen3 to
552 # save it if necessary
555 if config.noexec: return (0, [])
556 f = os.popen(cmd + ' 2>&1')
566 cmd = string.join(map(str,args))
569 # Run a command in the background.
570 def run_daemon(*args):
571 cmd = string.join(map(str,args))
573 if config.noexec: return 0
574 f = os.popen(cmd + ' 2>&1')
582 # Determine full path to use for an external command
583 # searches dirname(argv[0]) first, then PATH
585 syspath = string.split(os.environ['PATH'], ':')
586 cmdpath = os.path.dirname(sys.argv[0])
587 syspath.insert(0, cmdpath);
589 syspath.insert(0, os.path.join(config.portals, 'utils/'))
591 prog = os.path.join(d,cmd)
592 if os.access(prog, os.X_OK):
596 # Recursively look for file starting at base dir
597 def do_find_file(base, mod):
598 fullname = os.path.join(base, mod)
599 if os.access(fullname, os.R_OK):
601 for d in os.listdir(base):
602 dir = os.path.join(base,d)
603 if os.path.isdir(dir):
604 module = do_find_file(dir, mod)
608 def find_module(src_dir, dev_dir, modname):
609 modbase = src_dir +'/'+ dev_dir +'/'+ modname
610 for modext in '.ko', '.o':
611 module = modbase + modext
613 if os.access(module, os.R_OK):
619 # is the path a block device?
626 return stat.S_ISBLK(s[stat.ST_MODE])
628 def my_realpath(path):
630 if os.path.islink(path):
631 # get the realpath of the mount point path
632 if 'realpath' in dir(os.path):
633 real_path = os.path.realpath(path)
637 while os.path.islink(real_path) and (link_count < 20):
638 link_count = link_count + 1
639 path_link = os.readlink(real_path)
640 if os.path.isabs(path_link):
641 real_path = path_link
643 real_path = os.path.join(os.path.dirname(real_path), path_link)
645 panic("Encountered too many symbolic links resolving path:", path)
651 panic("Fatal error realpath()ing path:", path)
654 # build fs according to type
656 def mkfs(dev, devsize, fstype, jsize, isize, mkfsoptions, isblock=1):
662 panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
664 # devsize is in 1k, and fs block count is in 4k
665 block_cnt = devsize/4
667 if fstype in ('ext3', 'ldiskfs'):
668 # ext3 journal size is in megabytes
671 if not is_block(dev):
672 ret, out = runcmd("ls -l %s" %dev)
673 devsize = int(string.split(out[0])[4]) / 1024
675 # sfdisk works for symlink, hardlink, and realdev
676 ret, out = runcmd("sfdisk -s %s" %dev)
678 devsize = int(out[0])
680 # sfdisk -s will fail for too large block device,
681 # then, read the size of partition from /proc/partitions
683 # get the realpath of the device
684 # it may be the real device, such as /dev/hda7
685 # or the hardlink created via mknod for a device
686 real_dev = my_realpath(dev)
688 # get the major and minor number of the realpath via ls
689 # it seems python(os.stat) does not return
690 # the st_rdev member of the stat structure
691 ret, out = runcmd("ls -l %s" %real_dev)
692 major = string.split(string.split(out[0])[4], ",")[0]
693 minor = string.split(out[0])[5]
695 # get the devsize from /proc/partitions with the major and minor number
696 ret, out = runcmd("cat /proc/partitions")
699 if string.split(line)[0] == major and string.split(line)[1] == minor:
700 devsize = int(string.split(line)[2])
703 if devsize > 1024 * 1024:
704 jsize = ((devsize / 102400) * 4)
707 if jsize: jopt = "-J size=%d" %(jsize,)
708 if isize: iopt = "-I %d" %(isize,)
709 mkfs = 'mkfs.ext2 -j -b 4096 '
710 if not isblock or config.force:
712 elif fstype == 'reiserfs':
713 # reiserfs journal size is in blocks
714 if jsize: jopt = "--journal_size %d" %(jsize,)
715 mkfs = 'mkreiserfs -ff'
717 panic('unsupported fs type: ', fstype)
719 if config.mkfsoptions != None:
720 mkfs = mkfs + ' ' + config.mkfsoptions
721 if mkfsoptions != None:
722 mkfs = mkfs + ' ' + mkfsoptions
723 (ret, out) = run (mkfs, jopt, iopt, dev, block_cnt)
725 panic("Unable to build fs:", dev, string.join(out))
726 # enable hash tree indexing on fsswe
727 if fstype in ('ext3', 'ldiskfs'):
728 htree = 'tune2fs -O dir_index'
729 (ret, out) = run (htree, dev)
731 panic("Unable to enable htree:", dev)
733 # some systems use /dev/loopN, some /dev/loop/N
737 if not os.access(loop + str(0), os.R_OK):
739 if not os.access(loop + str(0), os.R_OK):
743 # find loop device assigned to the file
746 for n in xrange(0, MAX_LOOP_DEVICES):
748 if os.access(dev, os.R_OK):
749 (stat, out) = run('losetup', dev)
750 if out and stat == 0:
751 m = re.search(r'\((.*)\)', out[0])
752 if m and file == m.group(1):
758 # create file if necessary and assign the first free loop device
759 def init_loop(file, size, fstype, journal_size, inode_size, mkfsoptions, reformat):
760 dev = find_loop(file)
762 print 'WARNING file:', file, 'already mapped to', dev
764 if reformat or not os.access(file, os.R_OK | os.W_OK):
766 panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (file,size))
767 (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size,
770 panic("Unable to create backing store:", file)
771 mkfs(file, size, fstype, journal_size, inode_size, mkfsoptions, isblock=0)
774 # find next free loop
775 for n in xrange(0, MAX_LOOP_DEVICES):
777 if os.access(dev, os.R_OK):
778 (stat, out) = run('losetup', dev)
780 (stat, out) = run('losetup', dev, file)
782 panic("losetup failed: (%s) %s" % (stat, out[0].strip()))
785 print "out of loop devices"
787 print "out of loop devices"
790 # undo loop assignment
791 def clean_loop(file):
792 dev = find_loop(file)
794 ret, out = run('losetup -d', dev)
796 log('unable to clean loop device:', dev, 'for file:', file)
799 # determine if dev is formatted as a <fstype> filesystem
def need_format(fstype, dev):
    """Return non-zero if dev must be formatted as fstype before use.

    FIXME: detection is not implemented.  Return 0 explicitly (instead
    of falling through and returning None) so callers see "no format
    needed" and never reformat existing data implicitly.
    """
    # FIXME don't know how to implement this
    return 0
804 # initialize a block device if needed
805 def block_dev(dev, size, fstype, reformat, autoformat, journal_size,
806 inode_size, mkfsoptions):
807 if config.noexec: return dev
808 if not is_block(dev):
809 dev = init_loop(dev, size, fstype, journal_size, inode_size,
810 mkfsoptions, reformat)
811 elif reformat or (need_format(fstype, dev) and autoformat == 'yes'):
812 mkfs(dev, size, fstype, journal_size, inode_size, mkfsoptions,
815 # panic("device:", dev,
816 # "not prepared, and autoformat is not set.\n",
817 # "Rerun with --reformat option to format ALL filesystems")
821 """lookup IP address for an interface"""
822 rc, out = run("/sbin/ifconfig", iface)
825 addr = string.split(out[1])[1]
826 ip = string.split(addr, ':')[1]
829 def def_mount_options(fstype, target, blkdev):
830 """returns deafult mount options for passed fstype and target (mds, ost)"""
831 if fstype == 'ext3' or fstype == 'ldiskfs':
832 mountfsoptions = "errors=remount-ro"
834 if sys_get_branch() == '2.4':
835 mountfsoptions = "%s,asyncdel" % (mountfsoptions)
837 # mountfsoptions = "%s,extents,mballoc" % (mountfsoptions)
838 elif target == 'mds':
839 if config.user_xattr:
840 mountfsoptions = "%s,user_xattr" % (mountfsoptions)
842 mountfsoptions = "%s,acl" % (mountfsoptions)
845 # grab superblock info
846 dumpe2fs="dumpe2fs -f -h"
847 (ret, sb) = run(dumpe2fs, blkdev)
849 panic("unable to get superblock for ", blkdev)
851 # extract journal UUID
855 lst = string.split(line, ":")
856 if lst[0] == 'Journal UUID':
858 panic("cannot retrieve journal UUID for ", blkdev)
859 if string.split(lst[1])[0] != '<none>':
860 journal_UUID = string.split(lst[1])[0]
861 debug(blkdev, 'has journal UUID', journal_UUID)
862 if lst[0] == 'Journal device':
864 panic("cannot retrieve journal device for ", blkdev)
865 if string.split(lst[1])[0] != '0x0000':
866 journal_DEV = string.split(lst[1])[0]
867 debug(blkdev, 'has journal device', journal_DEV)
870 if len(journal_UUID) == 0 or len(journal_DEV) == 0:
871 debug('no external journal found for', blkdev)
872 # use internal journal
873 return mountfsoptions
875 # run blkid, lookup highest-priority device with matching UUID
876 blkid = "blkid -o device -l -t UUID='%s'" % (journal_UUID)
877 (ret, devname) = run(blkid)
878 if ret or len(devname) == 0:
879 panic("cannot find external journal for ", blkdev)
880 debug('found', blkdev, 'journal UUID', journal_UUID, 'on',
881 string.replace(devname[0], '\n', ''))
883 try: # sigh, python 1.5 does not support os.stat().st_rdev
884 jdevpath = my_realpath(string.replace(devname[0], '\n', ''))
885 ret, out = runcmd("ls -l %s" %jdevpath)
887 major = int(string.split(string.split(out[0])[4], ',')[0])
888 minor = int(string.split(out[0])[5])
889 debug('major', major, 'minor', minor)
890 rdev = major << 8 | minor
892 panic("cannot stat ", devname[0])
894 debug('found', blkdev, 'journal UUID', journal_UUID, 'on',
895 jdevpath, 'rdev', rdev)
898 if string.atoi(journal_DEV, 0) != rdev:
899 mountfsoptions = "%s,journal_dev=%#x" % (mountfsoptions,rdev)
901 return mountfsoptions
def sys_get_branch():
    """Return the running kernel's branch prefix, e.g. '2.4' or '2.6'."""
    release = os.uname()[2]
    # Only the leading "X.Y" portion of the release string is wanted.
    return release[:3]
908 def mod_loaded(modname):
909 """Check if a module is already loaded. Look in /proc/modules for it."""
910 if PLATFORM == 'LINUX':
912 fp = open('/proc/modules')
913 lines = fp.readlines()
915 # please forgive my tired fingers for this one
916 ret = filter(lambda word, mod=modname: word == mod,
917 map(lambda line: string.split(line)[0], lines))
921 elif PLATFORM == 'DARWIN':
922 ret, out = run('/usr/sbin/kextstat | /usr/bin/grep', modname)
930 # XXX: instead of device_list, ask for $name and see what we get
931 def is_prepared(name):
932 """Return true if a device exists for the name"""
935 if (config.noexec or config.record) and config.cleanup:
938 # expect this format:
939 # 1 UP ldlm ldlm ldlm_UUID 2
940 out = lctl.device_list()
942 if name == string.split(s)[3]:
944 except CommandError, e:
948 def is_network_prepared():
949 """If the any device exists, then assume that all networking
950 has been configured"""
951 out = lctl.device_list()
954 def fs_is_mounted(path):
955 """Return true if path is a mounted lustre filesystem"""
957 real_path = my_realpath(path)
959 fp = open('/proc/mounts')
960 lines = fp.readlines()
964 if a[1] == real_path and a[2] == 'lustre_lite':
971 """Manage kernel modules"""
def __init__(self, lustre_dir, portals_dir):
    """Remember the lustre and portals source trees; start with no modules.

    kmodule_list accumulates (tree, subdir, name) tuples added via the
    add_*_module() methods and is consumed in order by load_module().
    """
    self.portals_dir = portals_dir
    self.lustre_dir = lustre_dir
    self.kmodule_list = []
def add_portals_module(self, dev_dir, modname):
    """Queue a portals-tree kernel module for loading later."""
    entry = (self.portals_dir, dev_dir, modname)
    self.kmodule_list.append(entry)
def add_lustre_module(self, dev_dir, modname):
    """Queue a lustre-tree kernel module for loading later."""
    entry = (self.lustre_dir, dev_dir, modname)
    self.kmodule_list.append(entry)
985 def load_module(self):
986 """Load all the modules in the list in the order they appear."""
987 for src_dir, dev_dir, mod in self.kmodule_list:
988 if mod_loaded(mod) and not config.noexec:
990 log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
991 if PLATFORM == 'LINUX':
994 #For LNET we really need modprobe to load defined LNDs
995 run('/sbin/modprobe lnet')
996 #But if that fails, try insmod anyhow with dev option
997 #accept=all for dev liblustre testing
998 options = 'accept=all'
1000 module = find_module(src_dir, dev_dir, mod)
1002 panic('module not found:', mod)
1003 (rc, out) = run('/sbin/insmod', module, options)
1004 if rc and not mod_loaded(mod):
1006 print("Bad module options? Check dmesg.")
1007 raise CommandError('insmod', out, rc)
1009 (rc, out) = run('/sbin/modprobe', mod)
1010 if rc and not mod_loaded(mod):
1012 print("Bad module options? Check dmesg.")
1013 raise CommandError('modprobe', out, rc)
1014 elif PLATFORM == 'DARWIN':
1015 run('/sbin/kextload', KEXTPATH + mod + '.kext');
1017 def cleanup_module(self):
1018 """Unload the modules in the list in reverse order."""
1020 rev = self.kmodule_list[:] # make *copy* of list
1022 for src_dir, dev_dir, mod in rev:
1023 if not mod_loaded(mod) and not config.noexec:
1025 if mod == 'ksocklnd' and not config.noexec:
1026 # Ignore ksocklnd in module list (lnet will remove)
1028 log('unloading module:', mod)
1029 if mod == 'lnet' and not config.noexec:
1030 # remove any self-ref portals created
1031 lctl.unconfigure_network()
1033 debug('dumping debug log to', config.dump)
1035 lctl.dump(config.dump)
1036 log('unloading the network')
1037 lctl.unconfigure_network()
1038 if mod_loaded("ksocklnd"):
1039 if PLATFORM == 'LINUX':
1040 run('/sbin/rmmod ksocklnd')
1041 elif PLATFORM == 'DARWIN':
1042 run('/sbin/kextunload', KEXTPATH+'ksocklnd.kext')
1043 if mod_loaded("kqswlnd"):
1044 run('/sbin/rmmod kqswlnd')
1045 if mod_loaded("kgmlnd"):
1046 run('/sbin/rmmod kgmlnd')
1047 if mod_loaded("kopeniblnd"):
1048 run('/sbin/rmmod kopeniblnd')
1049 if mod_loaded("kiiblnd"):
1050 run('/sbin/rmmod kiiblnd')
1051 if mod_loaded("kviblnd"):
1052 run('/sbin/rmmod kviblnd')
1053 if mod_loaded("kciblnd"):
1054 run('/sbin/rmmod kciblnd')
1055 if mod_loaded("ko2iblnd"):
1056 run('/sbin/rmmod ko2iblnd')
1057 if mod_loaded("kralnd"):
1058 run('/sbin/rmmod kralnd')
1059 if mod_loaded("kptllnd"):
1060 run('/sbin/rmmod kptllnd')
1061 if PLATFORM == 'LINUX':
1062 (rc, out) = run('/sbin/rmmod', mod)
1063 elif PLATFORM == 'DARWIN':
1064 (rc, out) = run('/sbin/kextunload', KEXTPATH+mod+'.kext');
1066 log('! unable to unload module:', mod)
1070 # ============================================================
1071 # Classes to prepare and cleanup the various objects
1074 """ Base class for the rest of the modules. The default cleanup method is
1075 defined here, as well as some utilitiy funcs.
1077 def __init__(self, module_name, db):
1079 self.module_name = module_name
1080 self.name = self.db.getName()
1081 self.uuid = self.db.getUUID()
1084 self.kmod = kmod(config.lustre, config.portals)
def info(self, *args):
    """Log an informational message tagged with this module's name and uuid."""
    text = string.join(map(str, args))
    log(self.module_name + ":", self.name, self.uuid, text)
1091 """ default cleanup, used for most modules """
1094 lctl.cleanup(self.name, self.uuid, config.force)
1095 except CommandError, e:
1096 log(self.module_name, "cleanup failed: ", self.name)
def add_portals_module(self, dev_dir, modname):
    """Forward a portals-tree module registration to this module's kmod."""
    register = self.kmod.add_portals_module
    register(dev_dir, modname)
def add_lustre_module(self, dev_dir, modname):
    """Forward a lustre-tree module registration to this module's kmod."""
    register = self.kmod.add_lustre_module
    register(dev_dir, modname)
def load_module(self):
    """Delegate to kmod: load every queued module, in queue order."""
    self.kmod.load_module()
def cleanup_module(self):
    """Unload this module's kernel modules (newest first), but only if safe."""
    if not self.safe_to_clean():
        return
    self.kmod.cleanup_module()
1117 def safe_to_clean(self):
def safe_to_clean_modules(self):
    """Module-unload safety defaults to the device-cleanup answer."""
    verdict = self.safe_to_clean()
    return verdict
1123 class Network(Module):
def __init__(self,db,nid_uuid=0):
    """Network module: records NID settings and queues the LNET modules.

    db is this network's config-database node; nid_uuid, when given,
    names the uuid under which this NID appears in the config log.
    """
    Module.__init__(self, 'NETWORK', db)
    # nettype is fetched without a default, unlike the values below —
    # presumably a network entry is required to declare one (e.g. tcp,
    # elan; both are tested in prepare()) — TODO confirm.
    self.net_type = self.db.get_val('nettype')
    self.nid = self.db.get_val('nid', '*')
    self.cluster_id = self.db.get_val('clusterid', "0")
    self.port = self.db.get_val_int('port', 0)
    self.nid_uuid = nid_uuid
    # Core LNET stack; load_module() loads these in list order.
    self.add_portals_module('libcfs', 'libcfs')
    self.add_portals_module('lnet', 'lnet')
    # Add the socklnd for developers without modprobe.conf (umls)
    self.add_portals_module('klnds/socklnd', 'ksocklnd')
1137 if is_network_prepared():
1139 self.info(self.net_type, self.nid)
1140 if self.net_type == 'tcp':
1142 if self.net_type == 'elan':
1145 def safe_to_clean(self):
1146 if PLATFORM == 'LINUX':
1147 return not is_network_prepared()
1148 elif PLATFORM == 'DARWIN':
1149 # XXX always assume it's safe to clean
1154 self.info(self.net_type, self.nid)
1156 # This is only needed to load the modules; the LDLM device
1157 # is now created automatically.
def __init__(self,db):
    """Queue the core lustre stack modules (lvfs, obdclass, ptlrpc, gss).

    This module exists only so the kernel modules get loaded; the LDLM
    device itself is created automatically, so no device setup happens
    here.
    """
    Module.__init__(self, 'LDLM', db)
    # Listed in the order load_module() will load them.
    self.add_lustre_module('lvfs', 'lvfs')
    self.add_lustre_module('obdclass', 'obdclass')
    self.add_lustre_module('ptlrpc', 'ptlrpc')
    self.add_lustre_module('ptlrpc/gss', 'ptlrpc_gss')
1173 def __init__(self, db, uuid, fs_name, name_override = None, config_only = None):
1174 Module.__init__(self, 'LOV', db)
1175 if name_override != None:
1176 self.name = "lov_%s" % name_override
1177 self.add_lustre_module('lov', 'lov')
1178 self.mds_uuid = self.db.get_first_ref('mds')
1179 self.stripe_sz = self.db.get_val_int('stripesize', 1048576)
1180 self.stripe_off = self.db.get_val_int('stripeoffset', 0)
1181 self.pattern = self.db.get_val_int('stripepattern', 0)
1183 self.stripe_cnt = self.db.get_val_int('stripecount', 1)
1185 self.desc_uuid = self.uuid
1186 self.uuid = generate_client_uuid(self.name)
1187 self.fs_name = fs_name
1188 # settings below here won't be seen by the MDSDEV code!
1190 self.config_only = 1
1192 self.config_only = None
1193 mds = self.db.lookup(self.mds_uuid)
1194 self.mds_name = mds.getName()
1195 self.devlist = self.db.get_lov_tgts('lov_tgt')
1196 for (obd_uuid, index, gen, active) in self.devlist:
1199 obd = self.db.lookup(obd_uuid)
1200 osc = get_osc(obd, self.uuid, fs_name)
1202 self.osclist.append((osc, index, gen, active))
1204 panic('osc not found:', obd_uuid)
1205 if self.osclist == []:
1206 debug("get_lov_tgts failed, using get_refs");
1208 self.devlist = self.db.get_refs('obd')
1209 for obd_uuid in self.devlist:
1210 obd = self.db.lookup(obd_uuid)
1211 osc = get_osc(obd, self.uuid, fs_name)
1213 self.osclist.append((osc, index, 1, 1))
1215 panic('osc not found:', obd_uuid)
1217 if self.osclist == []:
1218 panic('No OSCs configured for LOV')
1219 debug('dbg LOV __init__:', self.osclist, self.devlist, self.stripe_cnt)
1222 debug('dbg LOV prepare')
1223 if is_prepared(self.name):
1225 debug('dbg LOV prepare:', self.osclist, self.devlist)
1226 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
1227 self.stripe_off, self.pattern, self.devlist,
1229 lctl.lov_setup(self.name, self.uuid,
1230 self.desc_uuid, self.mds_name, self.stripe_cnt,
1231 self.stripe_sz, self.stripe_off, self.pattern)
1232 if self.osclist == []:
1233 panic('No OSCs configured for LOV?')
1234 for (osc, index, gen, active) in self.osclist:
1235 target_uuid = osc.target_uuid
1237 # Only ignore connect failures with --force, which
1238 # isn't implemented here yet.
1240 osc.prepare(ignore_connect_failure=0)
1241 except CommandError, e:
1242 print "Error preparing OSC %s\n" % osc.uuid
1244 lctl.lov_add_obd(self.name, self.uuid, target_uuid, index, gen)
1247 if is_prepared(self.name):
1248 Module.cleanup(self)
1249 for (osc, index, gen, active) in self.osclist:
1251 if self.config_only:
1252 panic("Can't clean up config_only LOV ", self.name)
1254 def load_module(self):
1255 if self.config_only:
1256 panic("Can't load modules for config_only LOV ", self.name)
1257 for (osc, index, gen, active) in self.osclist:
1260 Module.load_module(self)
1262 def cleanup_module(self):
1263 if self.config_only:
1264 panic("Can't cleanup modules for config_only LOV ", self.name)
1265 Module.cleanup_module(self)
1266 for (osc, index, gen, active) in self.osclist:
1268 osc.cleanup_module()
1271 class MDSDEV(Module):
1272 def __init__(self,db):
1273 Module.__init__(self, 'MDSDEV', db)
1274 self.devpath = self.db.get_val('devpath','')
1275 self.size = self.db.get_val_int('devsize', 0)
1276 self.journal_size = self.db.get_val_int('journalsize', 0)
1278 self.fstype = self.db.get_val('fstype', '')
1279 if sys_get_branch() == '2.4' and self.fstype == 'ldiskfs':
1280 self.fstype = 'ext3'
1281 elif sys_get_branch() == '2.6' and self.fstype == 'ext3':
1282 self.fstype = 'ldiskfs'
1284 self.nspath = self.db.get_val('nspath', '')
1285 self.mkfsoptions = '-i 4096 ' + self.db.get_val('mkfsoptions', '')
1286 self.mountfsoptions = self.db.get_val('mountfsoptions', '')
1288 self.quota = config.quota
1290 self.quota = self.db.get_val('quota', '')
1291 # overwrite the orignal MDSDEV name and uuid with the MDS name and uuid
1292 target_uuid = self.db.get_first_ref('target')
1293 mds = self.db.lookup(target_uuid)
1294 self.name = mds.getName()
1295 self.filesystem_uuids = mds.get_refs('filesystem')
1296 # FIXME: if fstype not set, then determine based on kernel version
1297 self.format = self.db.get_val('autoformat', "no")
1298 if mds.get_val('failover', '1') != '0':
1299 self.failover_mds = 'f'
1301 self.failover_mds = 'n'
1302 active_uuid = get_active_target(mds)
1304 panic("No target device found:", target_uuid)
1305 if active_uuid == self.uuid:
1309 if self.active and config.group and config.group != mds.get_val('group', mds.get_val('name')):
1312 self.inode_size = self.db.get_val_int('inodesize', 0)
1313 debug('original inode_size ', self.inode_size)
1314 if self.inode_size == 0:
1315 # find the LOV for this MDS
1316 lovconfig_uuid = mds.get_first_ref('lovconfig')
1317 if not lovconfig_uuid:
1318 panic("No LOV config found for MDS ", mds.name)
1319 lovconfig = mds.lookup(lovconfig_uuid)
1320 lov_uuid = lovconfig.get_first_ref('lov')
1322 panic("No LOV found for lovconfig ", lovconfig.name)
1323 lov = LOV(self.db.lookup(lov_uuid), lov_uuid, 'FS_name', config_only = 1)
1325 # default stripe count controls default inode_size
1326 if (lov.stripe_cnt > 0):
1327 stripe_count = lov.stripe_cnt
1330 if stripe_count > 77:
1331 self.inode_size = 512
1332 elif stripe_count > 34:
1333 self.inode_size = 2048
1334 elif stripe_count > 13:
1335 self.inode_size = 1024
1336 #elif stripe_count < 3:
1337 # self.inode_size = 256
1339 self.inode_size = 512
1340 debug('stripe_count ', stripe_count,' inode_size ',self.inode_size)
1342 self.target_dev_uuid = self.uuid
1343 self.uuid = target_uuid
1347 self.add_lustre_module('quota', 'lquota')
1348 self.add_lustre_module('mdc', 'mdc')
1349 self.add_lustre_module('osc', 'osc')
1350 self.add_lustre_module('lov', 'lov')
1351 self.add_lustre_module('mds', 'mds')
1352 if self.fstype == 'ldiskfs':
1353 self.add_lustre_module('ldiskfs', 'ldiskfs')
1355 self.add_lustre_module('lvfs', 'fsfilt_%s' % (self.fstype))
1357 def load_module(self):
1359 Module.load_module(self)
1362 if is_prepared(self.name):
1365 debug(self.uuid, "not active")
1368 # run write_conf automatically, if --reformat used
1370 self.info(self.devpath, self.fstype, self.size, self.format)
1371 # never reformat here
1372 blkdev = block_dev(self.devpath, self.size, self.fstype, 0,
1373 self.format, self.journal_size, self.inode_size,
1375 if not is_prepared('MDT'):
1376 lctl.newdev("mdt", 'MDT', 'MDT_UUID', setup ="")
1378 mountfsoptions = def_mount_options(self.fstype, 'mds', blkdev)
1380 if config.mountfsoptions:
1382 mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
1384 mountfsoptions = config.mountfsoptions
1385 if self.mountfsoptions:
1386 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1388 if self.mountfsoptions:
1390 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1392 mountfsoptions = self.mountfsoptions
1394 print 'MDS mount options: ' + mountfsoptions
1396 lctl.newdev("mds", self.name, self.uuid,
1397 setup ="%s %s %s %s %s" %(blkdev, self.fstype, self.name,
1398 mountfsoptions, self.quota))
1399 self.group_upcall = self.db.get_val('group_upcall','')
1400 sys_set_group_upcall(self.name, self.group_upcall)
1402 except CommandError, e:
1404 panic("MDS failed to start. Check the syslog for details." +
1405 " (May need to run lconf --write-conf)")
1409 def write_conf(self):
1410 if is_prepared(self.name):
1412 self.info(self.devpath, self.fstype, self.format)
1413 blkdev = block_dev(self.devpath, self.size, self.fstype,
1414 config.reformat, self.format, self.journal_size,
1415 self.inode_size, self.mkfsoptions)
1416 lctl.newdev("mds", self.name, self.uuid,
1417 setup ="%s %s" %(blkdev, self.fstype))
1419 # record logs for the MDS lov
1420 for uuid in self.filesystem_uuids:
1421 log("recording clients for filesystem:", uuid)
1422 fs = self.db.lookup(uuid)
1423 obd_uuid = fs.get_first_ref('obd')
1424 client_uuid = generate_client_uuid(self.name)
1425 client = VOSC(self.db.lookup(obd_uuid), client_uuid, self.name,
1428 lctl.clear_log(self.name, self.name)
1429 lctl.record(self.name, self.name)
1431 lctl.mount_option(self.name, client.get_name(), "")
1435 # record logs for each client
1437 config_options = "--ldapurl " + config.ldapurl + " --config " + config.config
1439 config_options = CONFIG_FILE
1441 for node_db in self.db.lookup_class('node'):
1442 client_name = node_db.getName()
1443 for prof_uuid in node_db.get_refs('profile'):
1444 prof_db = node_db.lookup(prof_uuid)
1445 # refactor this into a function to test "clientness" of a node.
1446 for ref_class, ref_uuid in prof_db.get_all_refs():
1447 if ref_class in ('mountpoint','echoclient'):
1448 thing = self.db.lookup(ref_uuid);
1449 fs_uuid = thing.get_first_ref('filesystem')
1450 if not fs_uuid in self.filesystem_uuids:
1453 log("Recording log", client_name, "on", self.name)
1454 old_noexec = config.noexec
1456 noexec_opt = ('', '-n')
1457 ret, out = run (sys.argv[0],
1458 noexec_opt[old_noexec == 1],
1459 " -v --record --nomod --old_conf",
1460 "--record_log", client_name,
1461 "--record_device", self.name,
1462 "--node", client_name,
1465 lctl.clear_log(self.name, client_name)
1468 panic("Record client log %s on %s failed" %(
1469 client_name, self.name))
1471 for s in out: log("record> ", string.strip(s))
1472 config.noexec = old_noexec
1474 lctl.cleanup(self.name, self.uuid, config.force, config.failover)
1475 except CommandError, e:
1476 log(self.module_name, "cleanup failed: ", self.name)
1479 Module.cleanup(self)
1480 clean_loop(self.devpath)
1482 #change the mtime of LLOG to match the XML creation time
1483 if toplustreDB.get_mtime():
1484 mtime = toplustreDB.get_mtime()
1485 debug("changing mtime of LOGS to %s" %mtime)
1486 ret, mktemp = runcmd("mktemp /tmp/lustre-cmd.XXXXXXXX")
1488 log(self.module_name, "create mtime LOGS cmdfile failed: ", self.name)
1490 mtimecmdfile = string.split(mktemp[0])[0]
1491 fd = os.open(mtimecmdfile, os.O_RDWR | os.O_CREAT)
1492 os.write(fd, "\n\n\n\n\n%s\n\n" %mtime)
1494 cmd = "debugfs -w -R \"mi /LOGS\" <%s %s" %(mtimecmdfile, self.devpath)
1495 ret, outs = runcmd(cmd)
1496 os.remove(mtimecmdfile)
1498 print "Can not change mtime of LOGS by debugfs."
# NOTE(review): this listing is missing interior lines here (the loop header
# that binds `s`, presumably "for s in out:", and the return statements) --
# recover the full body from version control before editing.
1500 def mds_remaining(self):
    # Ask lctl for the live device list and look for any device whose
    # type field is 'mds' (field [2]); field [1] appears to be the state.
1501 out = lctl.device_list()
1503 if string.split(s)[2] in ('mds',):
1504 if string.split(s)[1] in ('ST',):
# NOTE(review): body of safe_to_clean (line 1509/1510) is missing from this
# listing; likely "return self.active" -- confirm against the original file.
1508 def safe_to_clean(self):
    # Module cleanup is only safe once no MDS devices remain configured.
1511 def safe_to_clean_modules(self):
1512 return not self.mds_remaining()
1516 debug(self.uuid, "not active")
1519 if is_prepared(self.name):
1521 lctl.cleanup(self.name, self.uuid, config.force,
1523 except CommandError, e:
1524 log(self.module_name, "cleanup failed: ", self.name)
1527 Module.cleanup(self)
1528 if not self.mds_remaining() and is_prepared('MDT'):
1530 lctl.cleanup("MDT", "MDT_UUID", config.force,
1532 except CommandError, e:
1533 print "cleanup failed: ", self.name
1536 clean_loop(self.devpath)
1539 def __init__(self, db):
1540 Module.__init__(self, 'OSD', db)
1541 self.osdtype = self.db.get_val('osdtype')
1542 self.devpath = self.db.get_val('devpath', '')
1543 self.size = self.db.get_val_int('devsize', 0)
1544 self.journal_size = self.db.get_val_int('journalsize', 0)
1546 # now as we store fids in EA on OST we need to make inode bigger
1547 self.inode_size = self.db.get_val_int('inodesize', 0)
1548 if self.inode_size == 0:
1549 self.inode_size = 256
1550 self.mkfsoptions = self.db.get_val('mkfsoptions', '')
1551 # Allocate fewer inodes on large OST devices. Most filesystems
1552 # can be much more aggressive than this, but by default we can't.
1553 if self.size > 1000000:
1554 self.mkfsoptions = '-i 16384 ' + self.mkfsoptions
1555 self.mountfsoptions = self.db.get_val('mountfsoptions', '')
1557 self.quota = config.quota
1559 self.quota = self.db.get_val('quota', '')
1561 self.fstype = self.db.get_val('fstype', '')
1562 if sys_get_branch() == '2.4' and self.fstype == 'ldiskfs':
1563 self.fstype = 'ext3'
1564 elif sys_get_branch() == '2.6' and self.fstype == 'ext3':
1565 self.fstype = 'ldiskfs'
1567 self.nspath = self.db.get_val('nspath', '')
1568 target_uuid = self.db.get_first_ref('target')
1569 ost = self.db.lookup(target_uuid)
1570 self.name = ost.getName()
1571 self.format = self.db.get_val('autoformat', 'yes')
1572 if ost.get_val('failover', '1') != '0':
1573 self.failover_ost = 'f'
1575 self.failover_ost = 'n'
1577 active_uuid = get_active_target(ost)
1579 panic("No target device found:", target_uuid)
1580 if active_uuid == self.uuid:
1584 if self.active and config.group and config.group != ost.get_val('group', ost.get_val('name')):
1587 self.target_dev_uuid = self.uuid
1588 self.uuid = target_uuid
1591 self.add_lustre_module('quota', 'lquota')
1592 self.add_lustre_module('ost', 'ost')
1593 # FIXME: should we default to ext3 here?
1594 if self.fstype == 'ldiskfs':
1595 self.add_lustre_module('ldiskfs', 'ldiskfs')
1597 self.add_lustre_module('lvfs' , 'fsfilt_%s' % (self.fstype))
1598 self.add_lustre_module(self.osdtype, self.osdtype)
1600 def load_module(self):
1602 Module.load_module(self)
1604 # need to check /proc/mounts and /etc/mtab before
1605 # formatting anything.
1606 # FIXME: check if device is already formatted.
1608 if is_prepared(self.name):
1611 debug(self.uuid, "not active")
1613 self.info(self.osdtype, self.devpath, self.size, self.fstype,
1614 self.format, self.journal_size, self.inode_size)
1615 if self.osdtype == 'obdecho':
1618 blkdev = block_dev(self.devpath, self.size, self.fstype,
1619 config.reformat, self.format, self.journal_size,
1620 self.inode_size, self.mkfsoptions)
1622 mountfsoptions = def_mount_options(self.fstype, 'ost', blkdev)
1624 if config.mountfsoptions:
1626 mountfsoptions = mountfsoptions + ',' + config.mountfsoptions
1628 mountfsoptions = config.mountfsoptions
1629 if self.mountfsoptions:
1630 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1632 if self.mountfsoptions:
1634 mountfsoptions = mountfsoptions + ',' + self.mountfsoptions
1636 mountfsoptions = self.mountfsoptions
1638 print 'OST mount options: ' + mountfsoptions
1640 lctl.newdev(self.osdtype, self.name, self.uuid,
1641 setup ="%s %s %s %s %s" %(blkdev, self.fstype,
1642 self.failover_ost, mountfsoptions,
1644 if not is_prepared('OSS'):
1645 lctl.newdev("ost", 'OSS', 'OSS_UUID', setup ="")
# NOTE(review): interior lines are missing (the "for s in out:" header,
# the returns, and the body of safe_to_clean) -- recover from VCS.
1647 def osd_remaining(self):
    # Look for any remaining OST-side devices (obdfilter or obdecho)
    # in lctl's device list; field [2] is the obd type.
1648 out = lctl.device_list()
1650 if string.split(s)[2] in ('obdfilter', 'obdecho'):
1653 def safe_to_clean(self):
    # Unloading modules is only safe once no OSD devices remain.
1656 def safe_to_clean_modules(self):
1657 return not self.osd_remaining()
1661 debug(self.uuid, "not active")
1663 if is_prepared(self.name):
1666 lctl.cleanup(self.name, self.uuid, config.force,
1668 except CommandError, e:
1669 log(self.module_name, "cleanup failed: ", self.name)
1672 if not self.osd_remaining() and is_prepared('OSS'):
1674 lctl.cleanup("OSS", "OSS_UUID", config.force,
1676 except CommandError, e:
1677 print "cleanup failed: ", self.name
1680 if not self.osdtype == 'obdecho':
1681 clean_loop(self.devpath)
1683 # Generic client module, used by OSC and MDC
1684 class Client(Module):
1685 def __init__(self, tgtdb, uuid, module, fs_name, self_name=None,
1687 self.target_name = tgtdb.getName()
1688 self.target_uuid = tgtdb.getUUID()
1690 self.backup_targets = []
1692 self.tgt_dev_uuid = get_active_target(tgtdb)
1693 if not self.tgt_dev_uuid:
1694 panic("No target device found for target:", self.target_name)
1696 self.kmod = kmod(config.lustre, config.portals)
1700 self.module = module
1701 self.module_name = string.upper(module)
1703 self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
1704 self.target_name, fs_name)
1706 self.name = self_name
1708 self.lookup_server(self.tgt_dev_uuid)
1709 self.lookup_backup_targets()
1710 self.fs_name = fs_name
1713 self.add_lustre_module(module_dir, module)
def lookup_server(self, srv_uuid):
    """Resolve and cache the network endpoints of server *srv_uuid*.

    Stores the result in self._server_nets (read back via get_servers());
    panics when the configuration yields no endpoint at all.
    """
    self._server_nets = get_ost_net(self.db, srv_uuid)
    if not self._server_nets:
        panic("Unable to find a server for:", srv_uuid)
def get_servers(self):
    """Return the server network list cached by lookup_server()."""
    return self._server_nets
1724 def lookup_backup_targets(self):
1725 """ Lookup alternative network information """
1726 prof_list = toplustreDB.get_refs('profile')
1727 for prof_uuid in prof_list:
1728 prof_db = toplustreDB.lookup(prof_uuid)
1730 panic("profile:", prof_uuid, "not found.")
1731 for ref_class, ref_uuid in prof_db.get_all_refs():
1732 if ref_class in ('osd', 'mdsdev'):
1733 devdb = toplustreDB.lookup(ref_uuid)
1734 uuid = devdb.get_first_ref('target')
1735 if self.target_uuid == uuid and self.tgt_dev_uuid != ref_uuid:
1736 debug("add backup target", ref_uuid)
1737 self.backup_targets.append(ref_uuid)
1739 def prepare(self, ignore_connect_failure = 0):
1740 self.info(self.target_uuid)
1741 if is_prepared(self.name):
1744 srv_list = self.get_servers()
1745 debug('dbg CLIENT __prepare__:', self.target_uuid, srv_list)
1746 for srv in srv_list:
1748 if len(srv_list) == 0:
1749 panic("no servers for ", self.target_uuid)
1750 except CommandError, e:
1751 if not ignore_connect_failure:
1756 if self.target_uuid in config.inactive and self.permits_inactive():
1757 debug("%s inactive" % self.target_uuid)
1758 inactive_p = "inactive"
1760 debug("%s active" % self.target_uuid)
1762 lctl.newdev(self.module, self.name, self.uuid,
1763 setup ="%s %s %s" % (self.target_uuid, srv.nid_uuid,
1766 panic("Unable to create OSC for ", self.target_uuid)
1768 for tgt_dev_uuid in self.backup_targets:
1769 this_nets = get_ost_net(toplustreDB, tgt_dev_uuid)
1770 if len(this_nets) == 0:
1771 panic ("Unable to find a backup server for:", tgt_dev_uuid)
1773 for srv in this_nets:
1776 lctl.add_conn(self.name, srv.nid_uuid);
1780 if is_prepared(self.name):
1781 Module.cleanup(self)
1782 srv_list = self.get_servers()
1783 for srv in srv_list:
1784 lctl.disconnect(srv)
1785 for tgt_dev_uuid in self.backup_targets:
1786 this_nets = get_ost_net(toplustreDB, tgt_dev_uuid)
1787 if len(this_nets) == 0:
1788 panic ("Unable to find a backup server for:", tgt_dev_uuid)
1790 for srv in this_nets:
1791 lctl.disconnect(srv)
# MDC: metadata client -- thin specialization of the generic Client.
1794 def __init__(self, db, uuid, fs_name):
1795 Client.__init__(self, db, uuid, 'mdc', fs_name)
    # NOTE(review): the return of permits_inactive (line 1798, presumably
    # a constant) is missing from this listing -- confirm against VCS.
1797 def permits_inactive(self):
# OSC: object storage client -- thin specialization of the generic Client.
1801 def __init__(self, db, uuid, fs_name):
1802 Client.__init__(self, db, uuid, 'osc', fs_name)
    # NOTE(review): the return of permits_inactive is missing from this
    # listing -- confirm against VCS.
1804 def permits_inactive(self):
def __init__(self, db):
    """Cache OBD: pairs a real OBD with a caching OBD.

    The two backing-device uuids come straight from the config db; the
    cobd kernel module is registered for later loading.
    """
    Module.__init__(self, 'COBD', db)
    self.add_lustre_module('cobd', 'cobd')
    self.real_uuid = self.db.get_first_ref('realobd')
    self.cache_uuid = self.db.get_first_ref('cacheobd')
1814 # need to check /proc/mounts and /etc/mtab before
1815 # formatting anything.
1816 # FIXME: check if device is already formatted.
1818 if is_prepared(self.name):
1820 self.info(self.real_uuid, self.cache_uuid)
1821 lctl.newdev("cobd", self.name, self.uuid,
1822 setup ="%s %s" %(self.real_uuid, self.cache_uuid))
1825 # virtual interface for OSC and LOV
1827 def __init__(self, db, uuid, fs_name, name_override = None, quota = None):
1828 Module.__init__(self, 'VOSC', db)
1830 self.add_lustre_module('quota', 'lquota')
1831 if db.get_class() == 'lov':
1832 self.osc = LOV(db, uuid, fs_name, name_override)
1834 self.osc = get_osc(db, uuid, fs_name)
1836 return self.osc.uuid
1838 return self.osc.name
    # Delegate module load/unload to the wrapped client; cleanup mirrors
    # setup in reverse order.
1843 def load_module(self):
1844 Module.load_module(self)
1845 self.osc.load_module()
1846 def cleanup_module(self):
1847 self.osc.cleanup_module()
1848 Module.cleanup_module(self)
1851 class ECHO_CLIENT(Module):
def __init__(self,db):
    """Echo client for testing: wraps the referenced obd in a VOSC.

    Registers the obdecho module, generates a fresh client uuid from this
    module's name, and builds the VOSC that prepare() will drive.
    """
    Module.__init__(self, 'ECHO_CLIENT', db)
    self.add_lustre_module('obdecho', 'obdecho')
    self.obd_uuid = self.db.get_first_ref('obd')
    obd_db = self.db.lookup(self.obd_uuid)
    self.uuid = generate_client_uuid(self.name)
    self.osc = VOSC(obd_db, self.uuid, self.name)
1861 if is_prepared(self.name):
1863 self.osc.prepare() # XXX This is so cheating. -p
1864 self.info(self.obd_uuid)
1866 lctl.newdev("echo_client", self.name, self.uuid,
1867 setup = self.osc.get_name())
1870 if is_prepared(self.name):
1871 Module.cleanup(self)
    # Load the wrapped client's modules first, then our own; cleanup
    # tears down in the opposite order.
1874 def load_module(self):
1875 self.osc.load_module()
1876 Module.load_module(self)
1878 def cleanup_module(self):
1879 Module.cleanup_module(self)
1880 self.osc.cleanup_module()
# Build a pseudo-random client uuid, truncated to the 36-character limit.
# NOTE(review): line 1885 is missing from this listing -- the format string
# has four slots but only three random args are visible; the missing arg
# presumably supplies `name` for the %.19s slot. Confirm against VCS.
1883 def generate_client_uuid(name):
1884 client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
1886 int(random.random() * 1048576),
1887 int(random.random() * 1048576))
1888 return client_uuid[:36]
# NOTE(review): the try statement, the successful-return path, and the
# fallback loop body are missing from this listing -- recover from VCS.
1891 def my_rstrip(s, chars):
1892 """my_rstrip(s, chars) -> strips any instances of the characters
1893 found in chars from the right side of string s"""
1894 # XXX required because python versions pre 2.2.3 don't allow
1895 #string.rstrip() to take alternate char lists
1899 ns = string.rstrip(s, '/')
1900 except TypeError, e:
    # Fallback for old Pythons: walk backwards from the end of the string.
1901 for i in range(len(s) - 1, 0, -1):
1910 class Mountpoint(Module):
1911 def __init__(self,db):
1912 Module.__init__(self, 'MTPT', db)
1913 self.path = my_rstrip(self.db.get_val('path'), '/')
1914 self.clientoptions = self.db.get_val('clientoptions', '')
1915 self.fs_uuid = self.db.get_first_ref('filesystem')
1916 fs = self.db.lookup(self.fs_uuid)
1917 self.mds_uuid = fs.get_first_ref('mds')
1918 mds_db = self.db.lookup(self.mds_uuid)
1920 quota = config.quota
1922 quota = mds_db.get_val('quota', config.quota)
1923 self.obd_uuid = fs.get_first_ref('obd')
1924 obd = self.db.lookup(self.obd_uuid)
1925 client_uuid = generate_client_uuid(self.name)
1926 self.vosc = VOSC(obd, client_uuid, self.name, quota=quota)
1927 self.mdc = get_mdc(db, client_uuid, self.name, self.mds_uuid)
1929 self.add_lustre_module('mdc', 'mdc')
1930 self.add_lustre_module('llite', 'llite')
1933 if fs_is_mounted(self.path):
1934 log(self.path, "already mounted.")
1938 mdc_name = self.mdc.name
1940 self.info(self.path, self.mds_uuid, self.obd_uuid)
1941 if config.record or config.lctl_dump:
1942 lctl.mount_option(local_node_name, self.vosc.get_name(), mdc_name)
1945 if config.clientoptions:
1946 if self.clientoptions:
1947 self.clientoptions = self.clientoptions + ',' + config.clientoptions
1949 self.clientoptions = config.clientoptions
1950 if self.clientoptions:
1951 self.clientoptions = ',' + self.clientoptions
1952 # Linux kernel will deal with async and not pass it to ll_fill_super,
1953 # so replace it with Lustre async
1954 self.clientoptions = string.replace(self.clientoptions, "async", "lasync")
1956 cmd = "mount -t lustre_lite -o osc=%s,mdc=%s%s %s %s" % \
1957 (self.vosc.get_name(), mdc_name, self.clientoptions, config.config, self.path)
1958 run("mkdir", self.path)
1963 panic("mount failed:", self.path, ":", string.join(val))
1966 self.info(self.path, self.mds_uuid,self.obd_uuid)
1968 if config.record or config.lctl_dump:
1969 lctl.del_mount_option(local_node_name)
1971 if fs_is_mounted(self.path):
1973 (rc, out) = run("umount", "-f", self.path)
1975 (rc, out) = run("umount", self.path)
1977 raise CommandError('umount', out, rc)
1979 if fs_is_mounted(self.path):
1980 panic("fs is still mounted:", self.path)
    # Load the stacked client's modules before our own (llite etc.);
    # cleanup unwinds in the opposite order.
1985 def load_module(self):
1986 self.vosc.load_module()
1987 Module.load_module(self)
1989 def cleanup_module(self):
1990 Module.cleanup_module(self)
1991 self.vosc.cleanup_module()
1994 # ============================================================
1995 # misc query functions
1997 def get_ost_net(self, osd_uuid):
2001 osd = self.lookup(osd_uuid)
2002 node_uuid = osd.get_first_ref('node')
2003 node = self.lookup(node_uuid)
2005 panic("unable to find node for osd_uuid:", osd_uuid,
2006 " node_ref:", node_uuid)
2007 for net_uuid in node.get_networks():
2008 db = node.lookup(net_uuid)
2009 net = Network(db, node_uuid)
2010 srv_list.append(net)
2014 # the order of initialization is based on level.
2015 def getServiceLevel(self):
2016 type = self.get_class()
2018 if type in ('network',):
2020 elif type in ('ldlm',):
2022 elif type in ('osd', 'cobd'):
2024 elif type in ('mdsdev',):
2026 elif type in ('mountpoint', 'echoclient'):
2029 panic("Unknown type: ", type)
2031 if ret < config.minlevel or ret > config.maxlevel:
2036 # return list of services in a profile. list is a list of tuples
2037 # [(level, db_object),]
2038 def getServices(self):
2040 for ref_class, ref_uuid in self.get_all_refs():
2041 servdb = self.lookup(ref_uuid)
2043 level = getServiceLevel(servdb)
2045 list.append((level, servdb))
2047 panic('service not found: ' + ref_uuid)
2053 ############################################################
2055 # FIXME: clean this mess up!
2057 # OSC is no longer in the xml, so we have to fake it.
2058 # this is getting ugly and begging for another refactoring
# Fabricate an OSC for a target (OSC is no longer present in the XML).
# NOTE(review): the "return osc" line is missing from this listing.
2059 def get_osc(ost_db, uuid, fs_name):
2060 osc = OSC(ost_db, uuid, fs_name)
# Fabricate an MDC for the given MDS; panics when the MDS uuid does not
# resolve. NOTE(review): the "if not mds_db:" guard and the return line
# are missing from this listing -- recover from VCS.
2063 def get_mdc(db, uuid, fs_name, mds_uuid):
2064 mds_db = db.lookup(mds_uuid);
2066 panic("no mds:", mds_uuid)
2067 mdc = MDC(mds_db, uuid, fs_name)
# Pick the device uuid serving a target: when --select names a node for
# this target, use that node's device; otherwise fall back to the
# target's 'active' reference. NOTE(review): the if/else scaffolding and
# the return line are missing from this listing.
2070 def get_active_target(db):
2071 target_uuid = db.getUUID()
2072 target_name = db.getName()
2073 node_name = get_select(target_name)
2075 tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
2077 tgt_dev_uuid = db.get_first_ref('active')
# Scan all network entries for the one matching nid_uuid.
# NOTE(review): the line constructing `net` from `n` (2082) and the
# return statements are missing from this listing -- recover from VCS.
2080 def get_server_by_nid_uuid(db, nid_uuid):
2081 for n in db.lookup_class("network"):
2083 if net.nid_uuid == nid_uuid:
2087 ############################################################
2091 type = db.get_class()
2092 debug('Service:', type, db.getName(), db.getUUID())
2097 n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
2098 elif type == 'network':
2102 elif type == 'cobd':
2104 elif type == 'mdsdev':
2106 elif type == 'mountpoint':
2108 elif type == 'echoclient':
2111 panic("unknown service type:", type)
2115 # Prepare the system to run lustre using a particular profile
2116 # in a the configuration.
2117 # * load & the modules
2118 # * setup networking for the current node
2119 # * make sure partitions are in place and prepared
2120 # * initialize devices with lctl
2121 # Levels is important, and needs to be enforced.
2122 def for_each_profile(db, prof_list, operation):
2123 for prof_uuid in prof_list:
2124 prof_db = db.lookup(prof_uuid)
2126 panic("profile:", prof_uuid, "not found.")
2127 services = getServices(prof_db)
2130 def doWriteconf(services):
2135 if s[1].get_class() == 'mdsdev':
2136 n = newService(s[1])
2140 panic("Cannot find mds device, please run --write_conf on the mds node.")
2143 def doSetup(services):
2147 n = newService(s[1])
2150 def doModules(services):
2154 n = newService(s[1])
2157 def doCleanup(services):
2162 n = newService(s[1])
2163 if n.safe_to_clean():
2166 def doUnloadModules(services):
2171 n = newService(s[1])
2172 if n.safe_to_clean_modules():
2175 def doMakeServiceScript(services):
2179 os.makedirs(config.service_scripts)
2181 if e[0] != errno.EEXIST:
2182 panic("Couldn't create scripts dir " + config.service_scripts + ": " + e[1])
2185 if s[1].get_class() != 'osd' and s[1].get_class() != 'mdsdev':
2188 target_uuid = s[1].get_first_ref('target')
2189 target = toplustreDB.lookup(target_uuid)
2190 target_symlink = config.service_scripts + "/" + target.getName()
2194 os.unlink(target_symlink)
2196 print "Removed " + target_symlink
2198 if e[0] != errno.EISDIR:
2200 os.rmdir(target_symlink)
2202 print "Removed " + target_symlink
2204 if e[0] != errno.ENOENT:
2205 panic("Error removing " + target_symlink + ": " + e[1])
2208 os.symlink("/etc/init.d/lustre", target_symlink)
2210 print "Created service link " + target_symlink + " to /etc/init.d/lustre"
2213 if e[0] == errno.EEXIST:
2214 extra_error = " (use --force option to remove existing files)"
2217 panic("Error creating " + target_symlink + ": " + e[1] + extra_error)
2219 # Check mtime of config logs
2220 def doCheckMtime(lustreDB, hosts):
2222 node_db = lustreDB.lookup_name(h, 'node')
2229 prof_list = node_db.get_refs('profile')
2230 for prof_uuid in prof_list:
2231 prof_db = node_db.lookup(prof_uuid)
2233 services = getServices(prof_db)
2235 if s[1].get_class() == 'mdsdev':
2239 if mdsdb and lustreDB.get_mtime():
2240 debug("Checking XML modification time")
2241 devpath = mdsdb.get_val('devpath','')
2242 xmtime = string.atol(lustreDB.get_mtime())
2243 cmd = "debugfs -c -R 'stat /LOGS' %s 2>&1 | grep mtime" %devpath
2244 ret, kmtimes = runcmd(cmd)
2246 log("Can not get mtime info of MDS LOGS directory")
2248 kmtime = string.atoi(string.split(kmtimes[0])[1], 0)
2250 debug('xmtime ', xmtime, '> kmtime', kmtime)
2252 log("Warning: MDS startup logs are older than config %s."
2253 " Please run --write_conf on stopped MDS to update."
2256 panic("Error: MDS startup logs are older than config %s."
2257 " Please run --write_conf on stopped MDS to update."
2258 " Use '--old_conf' to start anyways." %CONFIG_FILE)
2263 def doHost(lustreDB, hosts):
2264 global local_node_name, tgt_select
2267 node_db = lustreDB.lookup_name(h, 'node')
2270 tgt_select[config.service] = h
2271 config.group = config.service
2274 panic('No host entry found.')
2276 local_node_name = node_db.get_val('name', 0)
2277 lustre_upcall = node_db.get_val('lustreUpcall', '')
2278 portals_upcall = node_db.get_val('portalsUpcall', '')
2279 timeout = node_db.get_val_int('timeout', 0)
2280 ptldebug = node_db.get_val('ptldebug', '')
2281 subsystem = node_db.get_val('subsystem', '')
2283 # Two step process: (1) load modules, (2) setup lustre
2284 # if not cleaning, load modules first.
2285 prof_list = node_db.get_refs('profile')
2287 if config.make_service_scripts:
2288 for_each_profile(node_db, prof_list, doMakeServiceScript)
2291 elif config.write_conf:
2292 for_each_profile(node_db, prof_list, doModules)
2293 for_each_profile(node_db, prof_list, doWriteconf)
2294 for_each_profile(node_db, prof_list, doUnloadModules)
2297 elif config.recover:
2298 if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
2299 raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
2300 "--client_uuid <UUID> --conn_uuid <UUID>")
2301 doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
2303 elif config.cleanup:
2304 if not mod_loaded('lnet'):
2307 # ugly hack, only need to run lctl commands for --dump
2308 if config.lctl_dump or config.record:
2309 for_each_profile(node_db, prof_list, doCleanup)
2312 sys_set_ptldebug(ptldebug)
2313 sys_set_subsystem(subsystem)
2314 sys_set_lustre_upcall(lustre_upcall)
2315 sys_set_portals_upcall(portals_upcall)
2317 for_each_profile(node_db, prof_list, doCleanup)
2318 for_each_profile(node_db, prof_list, doUnloadModules)
2322 # ugly hack, only need to run lctl commands for --dump
2323 if config.lctl_dump or config.record:
2324 sys_set_timeout(timeout)
2325 sys_set_lustre_upcall(lustre_upcall)
2326 for_each_profile(node_db, prof_list, doSetup)
2329 if PLATFORM == 'LINUX':
2330 sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
2331 sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
2333 for_each_profile(node_db, prof_list, doModules)
2335 if PLATFORM == 'LINUX':
2336 # XXX need to be fixed for Darwin
2337 sys_set_debug_path()
2338 sys_set_ptldebug(ptldebug)
2339 sys_set_subsystem(subsystem)
2340 script = config.gdb_script
2341 run(lctl.lctl, ' modules >', script)
2343 log ("The GDB module script is in", script)
2344 # pause, so user has time to break and
2347 sys_set_timeout(timeout)
2348 sys_set_lustre_upcall(lustre_upcall)
2349 sys_set_portals_upcall(portals_upcall)
2351 for_each_profile(node_db, prof_list, doSetup)
2354 def add_clumanager_node(node_db, nodes, services):
2356 node_name = node_db.getUUID()
2357 nodes[node_name] = []
2359 for prof_uuid in node_db.get_refs('profile'):
2360 prof_db = toplustreDB.lookup(prof_uuid)
2361 for ref_class, ref_uuid in prof_db.get_all_refs():
2362 if ref_class not in ('osd', 'mdsdev'):
2364 devdb = toplustreDB.lookup(ref_uuid)
2365 tgt_uuid = devdb.get_first_ref('target')
2367 nodes[node_name].append(ref_uuid)
2369 if not services.has_key(tgt_uuid):
2371 print "New service: " + tgt_uuid + " (originally found on " + node_name + ")"
2372 new_services.append(tgt_uuid)
2373 services[tgt_uuid] = []
2374 services[tgt_uuid].append(ref_uuid)
2378 def add_clumanager_services(new_services, nodes, dev_list):
2380 for devdb in dev_list:
2381 tgt_uuid = devdb.get_first_ref('target')
2382 if tgt_uuid in new_services:
2383 node_uuid = devdb.get_first_ref('node')
2385 if not (nodes.has_key(node_uuid) or node_uuid in new_nodes):
2387 print "New node: " + node_uuid + " for service " + tgt_uuid
2388 new_nodes.append(node_uuid)
2392 def doClumanager(lustreDB, hosts):
2398 for dev_uuid in toplustreDB.get_refs('osd') + toplustreDB.get_refs('mdsdev'):
2399 dev_list.append(lustreDB.lookup(dev_uuid))
2403 node_db = lustreDB.lookup_name(h, 'node')
2406 new_services = add_clumanager_node(node_db, nodes, services)
2410 panic('No host entry found.')
2413 if len(new_services) == 0:
2416 new_nodes = add_clumanager_services(new_services, nodes, dev_list)
2417 if len(new_nodes) == 0:
2420 if len(new_nodes) + len(nodes.keys()) > 8:
2421 panic("CluManager only supports 8 nodes per failover \"cluster.\"")
2424 for node_uuid in new_nodes:
2425 node_db = lustreDB.lookup(node_uuid)
2427 panic("No node entry for " + node_uuid + " was found.")
2429 new_services.append(add_clumanager_node(node_db, nodes, services))
2432 for node in nodes.keys():
2433 nodedb = lustreDB.lookup(node)
2434 nodenames.append(nodedb.getName())
2437 print """<?xml version="1.0"?>
2438 <cluconfig version="3.0">
2439 <clumembd broadcast="no" interval="750000" loglevel="5" multicast="yes" multicast_ipaddress="225.0.0.11" thread="yes" tko_count="20"/>
2440 <cluquorumd loglevel="5" pinginterval="2"/>
2441 <clurmtabd loglevel="5" pollinterval="4"/>
2442 <clusvcmgrd loglevel="5"/>
2443 <clulockd loglevel="5"/>
2444 <cluster config_viewnumber="1" name="%s"/>
2445 <sharedstate driver="libsharedraw.so" rawprimary="%s" rawshadow="%s" type="raw"/>
2446 <members> """ % (string.join(nodenames), config.rawprimary, config.rawsecondary)
2450 for node in nodenames:
2451 print " <member id=\"%d\" name=\"%s\" watchdog=\"yes\"/>" % (i, node)
2454 print " </members>\n <failoverdomains>"
2456 servicekeys = services.keys()
2460 for service in servicekeys:
2461 svcdb = lustreDB.lookup(service)
2462 print " <failoverdomain id=\"%d\" name=\"%s\" ordered=\"yes\" restricted=\"yes\">" % (i, svcdb.getName())
2466 active_uuid = get_active_target(svcdb)
2467 for svc_uuid in [active_uuid] + services[service]:
2468 if svc_uuid == active_uuid and j > 0:
2470 svcdb = lustreDB.lookup(svc_uuid)
2472 svc_node_uuid = svcdb.get_first_ref('node')
2473 svc_nodedb = lustreDB.lookup(svc_node_uuid)
2475 print " <failoverdomainnode id=\"%d\" name=\"%s\"/>" % (j, svc_nodedb.getName())
2478 print " </failoverdomain>"
2480 print " </failoverdomains>\n <services>"
2483 for service in servicekeys:
2484 svcdb = lustreDB.lookup(service)
2485 active_uuid = get_active_target(svcdb)
2486 activedb = lustreDB.lookup(active_uuid)
2488 svc_node_uuid = activedb.get_first_ref('node')
2489 svc_nodedb = lustreDB.lookup(svc_node_uuid)
2491 print " <service checkinterval=\"30\" failoverdomain=\"%s\" id=\"%d\" name=\"%s\" userscript=\"%s/%s\">" \
2492 % ( svcdb.getName(), i, svcdb.getName(), config.service_scripts, svcdb.getName())
2493 print " <service_ipaddresses/>\n </service>"
2496 print " </services>\n</cluconfig>"
2498 def doRecovery(lustreDB, lctl, tgt_uuid, client_uuid, nid_uuid):
2499 tgt = lustreDB.lookup(tgt_uuid)
2501 raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
2502 new_uuid = get_active_target(tgt)
2504 raise Lustre.LconfError("doRecovery: no active target found for: " +
2506 srv_list = find_local_servers(get_ost_net(lustreDB, new_uuid))
2508 raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
2510 oldsrv = get_server_by_nid_uuid(lustreDB, nid_uuid)
2513 for srv in srv_list:
2514 if oldsrv.net_type != srv.net_type:
2517 log("Reconnecting", tgt_uuid, "to", srv.nid_uuid)
2519 lctl.recover(client_uuid, srv.nid_uuid)
# Derive config.lustre / config.portals module search paths from the
# location of the running command (objdir layout in development mode).
# NOTE(review): lines 2529 and 2535 are missing from this listing
# (presumably "if config.portals:" and a blank/comment line).
2522 def setupModulePath(cmd, portals_dir = PORTALS_DIR):
2523 base = os.path.dirname(cmd)
2524 if development_mode():
2525 if not config.lustre:
2526 debug('using objdir module paths')
2527 config.lustre = (os.path.join(base, ".."))
2528 # normalize the portals dir, using command line arg if set
2530 portals_dir = config.portals
2531 dir = os.path.join(config.lustre, portals_dir)
2532 config.portals = dir
2533 debug('config.portals', config.portals)
2534 elif config.lustre and config.portals:
2536 # if --lustre and --portals, normalize portals
2537 # can ignore PORTALS_DIR here, since it is probably useless here
2538 config.portals = os.path.join(config.lustre, config.portals)
2539 debug('config.portals B', config.portals)
# Write a value to a /proc/sys entry. NOTE(review): lines 2543-2545 and
# 2547+ (likely a noexec guard, try/except, write and close) are missing
# from this listing -- recover from VCS.
2541 def sysctl(path, val):
2542 debug("+ sysctl", path, val)
2546 fp = open(os.path.join('/proc/sys', path), 'w')
# Point the kernel debug-log dump path at config.debug_path.
2553 def sys_set_debug_path():
2554 sysctl('lnet/debug_path', config.debug_path)
# Warn (do not fail) about suspicious upcall settings: anything other
# than DEFAULT/NONE should be an existing, executable script.
# NOTE(review): lines 2557, 2559 and 2563 (likely blank/pass/else) are
# missing from this listing.
2556 def validate_upcall(upcall):
2558 if upcall in ('DEFAULT','NONE'):
2560 elif os.path.exists(upcall):
2561 if not os.access(upcall, os.X_OK):
2562 print "WARNING upcall script not executable: %s" % upcall
2564 print "WARNING invalid upcall script specified: %s" % upcall
# Install the lustre recovery upcall via lctl; --lustre_upcall (or the
# generic --upcall) on the command line beats the node config value.
# NOTE(review): the "elif config.upcall:" line (2570) and line 2572 are
# missing from this listing.
2566 def sys_set_lustre_upcall(upcall):
2567 # the command line overrides the value in the node config
2568 if config.lustre_upcall:
2569 upcall = config.lustre_upcall
2571 upcall = config.upcall
2573 validate_upcall(upcall)
2574 lctl.set_lustre_upcall(upcall)
# Install the portals upcall via sysctl; --portals_upcall (or the
# generic --upcall) on the command line beats the node config value.
# NOTE(review): the "elif config.upcall:" line (2580) and line 2582 are
# missing from this listing.
2576 def sys_set_portals_upcall(upcall):
2577 # the command line overrides the value in the node config
2578 if config.portals_upcall:
2579 upcall = config.portals_upcall
2581 upcall = config.upcall
2583 validate_upcall(upcall)
2584 sysctl('lnet/upcall', upcall)
# Write the group upcall for a specific MDS into its procfs knob.
# NOTE(review): lines 2587-2588, 2592 and 2597+ (guards, blank lines,
# the fp.write/close) are missing from this listing -- recover from VCS.
2586 def sys_set_group_upcall(mds, upcall):
2589 # the command line overrides the value in the MDS config
2590 if config.group_upcall:
2591 upcall = config.group_upcall
2593 validate_upcall(upcall)
2594 debug("setting MDS", mds, "upcall to:", upcall)
2595 path = "/proc/fs/lustre/mds/" + mds + "/group_upcall"
2596 fp = open(path, 'w')
def sys_set_timeout(timeout):
    # A timeout given on the command line wins over the node config value.
    if config.timeout and config.timeout > 0:
        timeout = config.timeout
    # Only push a positive, non-None timeout down to lctl.
    if timeout is None or timeout <= 0:
        return
    lctl.set_timeout(timeout)
def sys_tweak_socknal ():
    # --single_socket asks socknal to use one socket per peer
    # rather than the typed socket bundle.
    if not config.single_socket:
        return
    sysctl("socknal/typed", 0)
def sys_optimize_elan ():
    """Enable interrupt punt loops on any Quadrics Elan NICs present.

    The `for p in procfiles:` loop header was lost in extraction
    (leaving the os.access test dangling); restored here.
    """
    procfiles = ["/proc/elan/config/eventint_punt_loops",
                 "/proc/qsnet/elan3/config/eventint_punt_loops",
                 "/proc/qsnet/elan4/config/elan4_mainint_punt_loops"]
    for p in procfiles:
        if os.access(p, os.W_OK):
            run ("echo 1 > " + p)
2619 def sys_set_ptldebug(ptldebug):
2621 ptldebug = config.ptldebug
2624 val = eval(ptldebug, ptldebug_names)
2625 val = "0x%x" % (val)
2626 sysctl('lnet/debug', val)
2627 except NameError, e:
2630 def sys_set_subsystem(subsystem):
2631 if config.subsystem:
2632 subsystem = config.subsystem
2635 val = eval(subsystem, subsystem_names)
2636 val = "0x%x" % (val)
2637 sysctl('lnet/subsystem_debug', val)
2638 except NameError, e:
def sys_set_netmem_max(path, max):
    """Raise the /proc tunable at <path> to at least <max>; never shrink it.

    The read-current-value/compare logic and the file closes were lost
    in extraction; restored here.  Also avoids shadowing the builtin
    str with a local.
    """
    debug("setting", path, "to at least", max)
    if config.noexec:
        return
    # read the current setting; only grow it
    fp = open(path)
    cur = int(fp.readline())
    fp.close()
    if max > cur:
        fp = open(path, 'w')
        fp.write('%d\n' %(max))
        fp.close()
# Add dir to the global PATH, if not already there.
def add_to_path(new_dir):
    """Append new_dir to os.environ['PATH'] unless already present.

    The early return (lost in extraction) is restored: without it the
    directory would be appended even when already on the PATH.
    """
    syspath = os.environ['PATH'].split(':')
    if new_dir in syspath:
        return
    os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
def default_debug_path():
    """Return the default debug-dump file, preferring the /r root if present.

    The return statements were lost in extraction and are restored.
    """
    path = '/tmp/lustre-log'
    if os.path.isdir('/r'):
        return '/r' + path
    return path
def default_gdb_script():
    """Return the default gdb module-script path, preferring /r if present.

    The fall-through return (lost in extraction) is restored.
    """
    script = '/tmp/ogdb'
    if os.path.isdir('/r'):
        return '/r' + script
    return script
DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
# ensure basic elements are in the system path
def sanitise_path():
    """Make sure the standard sbin/bin directories are on PATH.

    The loop body (lost in extraction) is restored.
    """
    for dir in DEFAULT_PATH:
        add_to_path(dir)
# global hack for the --select handling
tgt_select = {}
def init_select(args):
    """Populate the global tgt_select map from --select arguments.

    args = [service=nodeA,service2=nodeB service3=nodeC]
    --service <service> is analagous to:
    --group <service> --select <service>=<node>
    this is handled in doHost()

    The tgt_select initializer, the global declaration, and both loop
    headers were lost in extraction and are restored here.
    """
    global tgt_select
    for arg in args:
        for entry in arg.split(','):
            srv, node = entry.split('=')
            tgt_select[srv] = node
def get_select(srv):
    """Return the node selected for service srv, or None when no
    --select override was recorded for it.

    dict.get replaces the Python-2-only has_key double lookup and
    restores the implicit-None fall-through lost in extraction.
    """
    return tgt_select.get(srv)
# ---------------------------------------------------------------------------
# Command line option table consumed by Lustre.Options below in main().
# Each tuple is (name[,short-flag], help-text[, kind[, default]]).
# NOTE(review): this region was garbled during extraction -- the
# `lconf_options = [` opener, the closing bracket, and several entries'
# trailing kind/default arguments (e.g. after 'force,f', 'single_socket',
# 'failover', 'dump', 'record_device', 'minlevel', 'maxlevel', 'inactive')
# are missing, and each line still carries a fused original line number.
# Reconstruct against the upstream lconf before editing.
# ---------------------------------------------------------------------------
2702 FLAG = Lustre.Options.FLAG
2703 PARAM = Lustre.Options.PARAM
2704 INTPARAM = Lustre.Options.INTPARAM
2705 PARAMLIST = Lustre.Options.PARAMLIST
2707 ('verbose,v', "Print system commands as they are run"),
2708 ('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM),
2709 ('config', "Cluster config name used for LDAP query", PARAM),
2710 ('select', "service=nodeA,service2=nodeB ", PARAMLIST),
2711 ('service', "shorthand for --group <service> --select <service>=<node>", PARAM),
2712 ('node', "Load config for <nodename>", PARAM),
2713 ('cleanup,d', "Cleans up config. (Shutdown)"),
2714 ('force,f', "Forced unmounting and/or obd detach during cleanup",
2716 ('single_socket', "socknal option: only use one socket instead of bundle",
2718 ('failover',"""Used to shut down without saving state.
2719 This will allow this node to "give up" a service to a
2720 another node for failover purposes. This will not
2721 be a clean shutdown.""",
2723 ('abort_recovery',"""Used to start a service when you know recovery
2724 will not succeed. This will skip the recovery
2725 timeout period."""),
2726 ('gdb', """Prints message after creating gdb module script
2727 and sleeps for 5 seconds."""),
2728 ('noexec,n', """Prints the commands and steps that will be run for a
2729 config without executing them. This can used to check if a
2730 config file is doing what it should be doing"""),
2731 ('nomod', "Skip load/unload module step."),
2732 ('nosetup', "Skip device setup/cleanup step."),
2733 ('reformat', "Reformat all devices (without question)"),
2734 ('mkfsoptions', "Additional options for the mk*fs command line", PARAM),
2735 ('mountfsoptions', "Additional options for mount fs command line", PARAM),
2736 ('clientoptions', "Additional options for Lustre", PARAM),
2737 ('dump', "Dump the kernel debug log to file before portals is unloaded",
2739 ('write_conf', "Save all the client config information on mds."),
2740 ('old_conf', "Start up service even though config logs appear outdated."),
2741 ('record', "Write config information on mds."),
2742 ('record_log', "Name of config record log.", PARAM),
2743 ('record_device', "MDS device name that will record the config commands",
2745 ('minlevel', "Minimum level of services to configure/cleanup",
2747 ('maxlevel', """Maximum level of services to configure/cleanup
2748 Levels are aproximatly like:
2753 70 - mountpoint, echo_client, osc, mdc, lov""",
2755 ('lustre', """Base directory of lustre sources. This parameter will
2756 cause lconf to load modules from a source tree.""", PARAM),
2757 ('portals', """Portals source directory. If this is a relative path,
2758 then it is assumed to be relative to lustre. """, PARAM),
2759 ('timeout', "Set recovery timeout", INTPARAM),
2760 ('upcall', "Set both portals and lustre upcall script", PARAM),
2761 ('lustre_upcall', "Set lustre upcall script", PARAM),
2762 ('portals_upcall', "Set portals upcall script", PARAM),
2763 ('group_upcall', "Set supplementary group upcall program", PARAM),
2764 ('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
2765 ('ptldebug', "Set the portals debug level", PARAM),
2766 ('subsystem', "Set the portals debug subsystem", PARAM),
2767 ('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
2768 ('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
2769 ('allow_unprivileged_port', "Allow connections from unprivileged ports"),
2770 ('clumanager', "Generate CluManager config file for this node's cluster"),
2771 ('rawprimary', "For clumanager, device of the primary quorum", PARAM, "/dev/raw/raw1"),
2772 ('rawsecondary', "For clumanager, device of the secondary quorum", PARAM, "/dev/raw/raw2"),
2773 ('service_scripts', "For clumanager, directory containing per-service scripts", PARAM, "/etc/lustre/services"),
2774 ('make_service_scripts', "Create per-service symlinks for use with clumanager"),
2775 # Client recovery options
2776 ('recover', "Recover a device"),
2777 ('group,g', "The group of devices to configure or cleanup", PARAM),
2778 ('tgt_uuid', "The failed target (required for recovery)", PARAM),
2779 ('client_uuid', "The failed client (required for recovery)", PARAM),
2780 ('conn_uuid', "The failed connection (required for recovery)", PARAM),
2782 ('inactive', """The name of an inactive service, to be ignored during
2783 mounting (currently OST-only). Can be repeated.""",
2785 ('user_xattr', """Enable user_xattr support on MDS""", FLAG, 0),
2786 ('acl', """Enable ACL support on MDS""", FLAG, 0),
2787 ('quota', "Enable quota support for client file system", PARAM),
# ---------------------------------------------------------------------------
# Body of the top-level driver (the enclosing `def main():` header is not
# visible in this chunk -- presumably just above; confirm against upstream).
# Sequence: parse options, seed the PRNG from /dev/urandom, load the cluster
# config (local XML file, HTTP URL, or LDAP), validate it, then dispatch to
# doClumanager()/doHost().  NOTE(review): extraction dropped many lines here
# (try: headers, else: branches, sys.exit calls), so this block is documented
# rather than rewritten; each line still carries a fused original line number.
# ---------------------------------------------------------------------------
2791 global lctl, config, toplustreDB, CONFIG_FILE
2793 # in the upcall this is set to SIG_IGN
2794 signal.signal(signal.SIGCHLD, signal.SIG_DFL)
2796 cl = Lustre.Options("lconf", "config.xml", lconf_options)
2798 config, args = cl.parse(sys.argv[1:])
2799 except Lustre.OptionError, e:
2803 setupModulePath(sys.argv[0])
2805 host = socket.gethostname()
# Seed the PRNG from /dev/urandom: time()-based seeding is poor for nodes
# booted simultaneously in a time-synchronized cluster.
2807 # the PRNG is normally seeded with time(), which is not so good for starting
2808 # time-synchronized clusters
2809 input = open('/dev/urandom', 'r')
2811 print 'Unable to open /dev/urandom!'
2813 seed = input.read(32)
2819 init_select(config.select)
# --- Locate and parse the cluster configuration --------------------------
2822 # allow config to be fetched via HTTP, but only with python2
2823 if sys.version[0] != '1' and args[0].startswith('http://'):
2826 config_file = urllib2.urlopen(args[0])
2827 except (urllib2.URLError, socket.error), err:
2828 if hasattr(err, 'args'):
2830 print "Could not access '%s': %s" %(args[0], err)
2832 elif not os.access(args[0], os.R_OK):
2833 print 'File not found or readable:', args[0]
2837 config_file = open(args[0], 'r')
2839 dom = xml.dom.minidom.parse(config_file)
2841 panic("%s does not appear to be a config file." % (args[0]))
2842 sys.exit(1) # make sure to die here, even in debug mode.
2844 CONFIG_FILE = args[0]
2845 lustreDB = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
# Derive a config name from the file name when --config was not given.
2846 if not config.config:
2847 config.config = os.path.basename(args[0])# use full path?
2848 if config.config[-4:] == '.xml':
2849 config.config = config.config[:-4]
2850 elif config.ldapurl:
2851 if not config.config:
2852 panic("--ldapurl requires --config name")
2853 dn = "config=%s,fs=lustre" % (config.config)
2854 lustreDB = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
2855 elif config.ptldebug or config.subsystem:
2856 sys_set_ptldebug(None)
2857 sys_set_subsystem(None)
2860 print 'Missing config file or ldap URL.'
2861 print 'see lconf --help for command summary'
2864 if config.reformat and config.cleanup:
2865 panic("Options \"reformat\" and \"cleanup\" are incompatible. "+
2866 "Please specify only one.")
# --- Validate config version and pick the node(s) to configure -----------
2868 toplustreDB = lustreDB
2870 ver = lustreDB.get_version()
2872 panic("No version found in config data, please recreate.")
2873 if ver != Lustre.CONFIG_VERSION:
2874 panic("Config version", ver, "does not match lconf version",
2875 Lustre.CONFIG_VERSION)
2879 node_list.append(config.node)
2882 node_list.append(host)
2883 # node_list.append('localhost')
2885 debug("configuring for host: ", node_list)
# Per-host suffixes keep dumps from different nodes apart on shared storage.
2888 config.debug_path = config.debug_path + '-' + host
2889 config.gdb_script = config.gdb_script + '-' + host
2891 lctl = LCTLInterface('lctl')
2893 if config.lctl_dump:
2894 lctl.use_save_file(config.lctl_dump)
2896 if not (config.reformat or config.write_conf or config.cleanup):
2897 doCheckMtime(lustreDB, node_list)
# --record requires both the log name and the device that stores it.
2900 if not (config.record_device and config.record_log):
2901 panic("When recording, both --record_log and --record_device must be specified.")
2902 lctl.clear_log(config.record_device, config.record_log)
2903 lctl.record(config.record_device, config.record_log)
2905 if config.clumanager:
2906 doClumanager(lustreDB, node_list)
2908 doHost(lustreDB, node_list)
# Script entry point: run the driver, translating LconfError/CommandError
# into a nonzero exit.  NOTE(review): the main() call and the two handler
# bodies were lost in extraction; only the guard skeleton remains here.
2913 if __name__ == "__main__":
2916 except Lustre.LconfError, e:
2918 # traceback.print_exc(file=sys.stdout)
2920 except CommandError, e:
# Propagate the first cleanup failure as the process exit status.
2927 if first_cleanup_error:
2928 sys.exit(first_cleanup_error)