3 # Copyright (C) 2002 Cluster File Systems, Inc.
4 # Author: Robert Read <rread@clusterfs.com>
5 # This file is part of Lustre, http://www.lustre.org.
7 # Lustre is free software; you can redistribute it and/or
8 # modify it under the terms of version 2 of the GNU General Public
9 # License as published by the Free Software Foundation.
11 # Lustre is distributed in the hope that it will be useful,
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 # GNU General Public License for more details.
16 # You should have received a copy of the GNU General Public License
17 # along with Lustre; if not, write to the Free Software
18 # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 # lconf - lustre configuration tool
22 # lconf is the main driver script for starting and stopping
23 # lustre filesystem services.
25 # Based in part on the XML obdctl modifications done by Brian Behlendorf
27 import sys, getopt, types
28 import string, os, stat, popen2, socket, time, random, fcntl, select
29 import re, exceptions, signal
30 import xml.dom.minidom
32 if sys.version[0] == '1':
33 from FCNTL import F_GETFL, F_SETFL
35 from fcntl import F_GETFL, F_SETFL
37 PYMOD_DIR = "/usr/lib/lustre/python"
39 def development_mode():
40 base = os.path.dirname(sys.argv[0])
41 if os.access(base+"/Makefile.am", os.R_OK):
45 if not development_mode():
46 sys.path.append(PYMOD_DIR)
52 DEFAULT_TCPBUF = 1048576
54 # Maximum number of devices to search for.
55 # (the /dev/loop* nodes need to be created beforehand)
56 MAX_LOOP_DEVICES = 256
57 PORTALS_DIR = 'portals'
60 # Please keep these uptodate with the values in portals/kp30.h
72 "warning" : (1 << 10),
76 "portals" : (1 << 14),
78 "dlmtrace" : (1 << 16),
82 "rpctrace" : (1 << 20),
83 "vfstrace" : (1 << 21),
87 "undefined" : (0 << 24),
96 "ext2obd" : (9 << 24),
97 "portals" : (10 << 24),
98 "socknal" : (11 << 24),
99 "qswnal" : (12 << 24),
100 "pinger" : (13 << 24),
101 "filter" : (14 << 24),
102 "trace" : (15 << 24),
106 "gmnal" : (19 << 24),
107 "ptlrouter" : (20 << 24),
109 "ptlbd" : (22 << 24),
# Latched record of the first non-zero cleanup failure; later errors are
# deliberately ignored so the original cause is what gets reported.
first_cleanup_error = 0

def cleanup_error(rc):
    """Remember *rc* as the overall cleanup status, keeping only the
    first error seen."""
    global first_cleanup_error
    if first_cleanup_error == 0:
        first_cleanup_error = rc
119 # ============================================================
120 # debugging and error funcs
def fixme(msg = "this feature"):
    """Abort by raising LconfError for a not-yet-implemented feature *msg*.

    Fixes the "implmemented" typo in the user-visible message, and uses the
    call-style raise (valid in both Python 2 and 3) instead of the old
    ``raise Class, arg`` statement form.
    """
    raise Lustre.LconfError(msg + ' not implemented yet.')
126 msg = string.join(map(str,args))
127 if not config.noexec:
128 raise Lustre.LconfError(msg)
133 msg = string.join(map(str,args))
138 print string.strip(s)
142 msg = string.join(map(str,args))
146 # ack, python's builtin int() does not support '0x123' syntax.
147 # eval can do it, although what a hack!
151 return eval(s, {}, {})
154 except SyntaxError, e:
155 raise ValueError("not a number")
157 raise ValueError("not a number")
159 # ============================================================
160 # locally defined exceptions
# NOTE(review): this listing is an extraction with the original line numbers
# prepended and some lines missing (165-168 here are absent; they presumably
# assigned self.rc and opened the dump() method whose body follows) --
# confirm against the full source before relying on this block.
# Exception carrying a failed command's name, its error output (either a
# single string or a list of output lines), and an optional return code.
161 class CommandError (exceptions.Exception):
162     def __init__(self, cmd_name, cmd_err, rc=None):
163         self.cmd_name = cmd_name
164         self.cmd_err = cmd_err
# String-form error: include the numeric rc in the message when present.
169         if type(self.cmd_err) == types.StringType:
171             print "! %s (%d): %s" % (self.cmd_name, self.rc, self.cmd_err)
173             print "! %s: %s" % (self.cmd_name, self.cmd_err)
# List-form error: print a one-line header, then each stripped output line
# prefixed with "> ".
174         elif type(self.cmd_err) == types.ListType:
176             print "! %s (error %d):" % (self.cmd_name, self.rc)
178             print "! %s:" % (self.cmd_name)
179         for s in self.cmd_err:
180             print "> %s" %(string.strip(s))
185 # ============================================================
186 # handle daemons, like the acceptor
188 """ Manage starting and stopping a daemon. Assumes daemon manages
189 it's own pid file. """
191 def __init__(self, cmd):
197 log(self.command, "already running.")
199 self.path = find_prog(self.command)
201 panic(self.command, "not found.")
202 ret, out = runcmd(self.path +' '+ self.command_line())
204 raise CommandError(self.path, out, ret)
208 pid = self.read_pidfile()
210 log ("killing process", pid)
212 #time.sleep(1) # let daemon die
214 log("unable to kill", self.command, e)
216 log("unable to kill", self.command)
219 pid = self.read_pidfile()
229 def read_pidfile(self):
231 fp = open(self.pidfile(), 'r')
238 def clean_pidfile(self):
239 """ Remove a stale pidfile """
240 log("removing stale pidfile:", self.pidfile())
242 os.unlink(self.pidfile())
244 log(self.pidfile(), e)
246 class AcceptorHandler(DaemonHandler):
247 def __init__(self, port, net_type, send_mem, recv_mem, irq_aff, nid_xchg):
248 DaemonHandler.__init__(self, "acceptor")
251 self.send_mem = send_mem
252 self.recv_mem = recv_mem
254 if net_type == 'toe':
255 self.flags = self.flags + ' -N 4'
257 self.flags = self.flags + ' -i'
259 self.flags = self.flags + ' -x'
262 return "/var/run/%s-%d.pid" % (self.command, self.port)
264 def command_line(self):
265 return string.join(map(str,('-s', self.send_mem, '-r', self.recv_mem, self.flags, self.port)))
269 # start the acceptors
271 for port in acceptors.keys():
272 daemon = acceptors[port]
273 if not daemon.running():
def run_one_acceptor(port):
    """Start the acceptor daemon registered for *port* if it is not already
    running; panic if no acceptor is configured for that port.

    The truncated listing had lost the ``daemon.start()`` call (and the
    ``else:``), so the visible code looked a daemon up and did nothing with
    it; this restores the start path.
    """
    if acceptors.has_key(port):
        daemon = acceptors[port]
        if not daemon.running():
            daemon.start()
    else:
        panic("run_one_acceptor: No acceptor defined for port:", port)
284 def stop_acceptor(port):
285 if acceptors.has_key(port):
286 daemon = acceptors[port]
291 # ============================================================
292 # handle lctl interface
295 Manage communication with lctl
298 def __init__(self, cmd):
300 Initialize close by finding the lctl binary.
302 self.lctl = find_prog(cmd)
306 debug('! lctl not found')
309 raise CommandError('lctl', "unable to find lctl binary.")
311 def use_save_file(self, file):
312 self.save_file = file
def set_nonblock(self, fd):
    """Switch file descriptor *fd* into non-blocking (O_NDELAY) mode."""
    # Read the current status flags, then write them back with the
    # non-blocking bit OR-ed in, leaving the other flags untouched.
    flags = fcntl.fcntl(fd, F_GETFL)
    fcntl.fcntl(fd, F_SETFL, flags | os.O_NDELAY)
321 the cmds are written to stdin of lctl
322 lctl doesn't return errors when run in script mode, so
324 should modify command line to accept multiple commands, or
325 create complex command line options
329 cmds = '\n dump ' + self.save_file + cmds
331 debug("+", cmd_line, cmds)
332 if config.noexec: return (0, [])
334 child = popen2.Popen3(cmd_line, 1) # Capture stdout and stderr from command
335 child.tochild.write(cmds + "\n")
336 child.tochild.close()
338 # From "Python Cookbook" from O'Reilly
339 outfile = child.fromchild
340 outfd = outfile.fileno()
341 self.set_nonblock(outfd)
342 errfile = child.childerr
343 errfd = errfile.fileno()
344 self.set_nonblock(errfd)
346 outdata = errdata = ''
349 ready = select.select([outfd,errfd],[],[]) # Wait for input
350 if outfd in ready[0]:
351 outchunk = outfile.read()
352 if outchunk == '': outeof = 1
353 outdata = outdata + outchunk
354 if errfd in ready[0]:
355 errchunk = errfile.read()
356 if errchunk == '': erreof = 1
357 errdata = errdata + errchunk
358 if outeof and erreof: break
359 # end of "borrowed" code
362 if os.WIFEXITED(ret):
363 rc = os.WEXITSTATUS(ret)
366 if rc or len(errdata):
367 raise CommandError(self.lctl, errdata, rc)
370 def runcmd(self, *args):
372 run lctl using the command line
374 cmd = string.join(map(str,args))
375 debug("+", self.lctl, cmd)
376 rc, out = run(self.lctl, cmd)
378 raise CommandError(self.lctl, out, rc)
382 def network(self, net, nid):
383 """ initialized network and add "self" """
387 quit """ % (net, nid)
390 # create a new connection
391 def connect(self, srv):
392 cmds = "\n add_uuid %s %s %s" % (srv.uuid, srv.nid, srv.net_type)
393 if srv.net_type in ('tcp', 'toe') and not config.lctl_dump:
403 connect %s %d %s""" % (cmds, srv.net_type,
406 srv.hostaddr, srv.port, flags )
408 cmds = cmds + "\n quit"
412 def recover(self, dev_name, new_conn):
416 recover %s""" %(dev_name, new_conn)
419 # add a route to a range
420 def add_route(self, net, gw, lo, hi):
429 def del_route(self, net, gw, lo, hi):
437 # add a route to a host
438 def add_route_host(self, net, uuid, gw, tgt):
448 # add a route to a range
449 def del_route_host(self, net, uuid, gw, tgt):
455 quit """ % (net, uuid, tgt)
458 # disconnect one connection
459 def disconnect(self, net, nid, port, servuuid):
465 quit""" % (net, nid, servuuid)
468 def del_uuid(self, servuuid):
472 quit""" % (servuuid,)
476 def disconnectAll(self, net):
484 # create a new device with lctl
485 def newdev(self, attach, setup = ""):
490 quit""" % (attach, setup)
494 def cleanup(self, name, uuid, force, failover = 0):
495 if failover: force = 1
501 quit""" % (name, ('', 'force')[force],
502 ('', 'failover')[failover])
506 def lov_setconfig(self, uuid, mdsuuid, stripe_cnt, stripe_sz, stripe_off,
511 lov_setconfig %s %d %d %d %s %s
512 quit""" % (mdsuuid, uuid, stripe_cnt, stripe_sz, stripe_off, pattern, devlist)
516 def dump(self, dump_file):
519 quit""" % (dump_file)
522 # get list of devices
523 def device_list(self):
525 rc, out = self.runcmd('device_list')
526 except CommandError, e:
534 def lustre_version(self):
535 rc, out = self.runcmd('version')
539 def mount_option(self, option):
544 # ============================================================
545 # Various system-level functions
546 # (ideally moved to their own module)
548 # Run a command and return the output and status.
549 # stderr is sent to /dev/null, could use popen3 to
550 # save it if necessary
553 if config.noexec: return (0, [])
554 f = os.popen(cmd + ' 2>&1')
564 cmd = string.join(map(str,args))
567 # Run a command in the background.
568 def run_daemon(*args):
569 cmd = string.join(map(str,args))
571 if config.noexec: return 0
572 f = os.popen(cmd + ' 2>&1')
580 # Determine full path to use for an external command
581 # searches dirname(argv[0]) first, then PATH
583 syspath = string.split(os.environ['PATH'], ':')
584 cmdpath = os.path.dirname(sys.argv[0])
585 syspath.insert(0, cmdpath);
587 syspath.insert(0, os.path.join(config.portals, 'utils/'))
589 prog = os.path.join(d,cmd)
590 if os.access(prog, os.X_OK):
594 # Recursively look for file starting at base dir
def do_find_file(base, mod):
    """Recursively search for a file named *mod* starting at directory *base*.

    Returns the full path of the first readable match, or None when no
    match exists.  The truncated listing had lost the return statements
    (a hit could never be reported); this restores them.
    """
    fullname = os.path.join(base, mod)
    if os.access(fullname, os.R_OK):
        return fullname
    for d in os.listdir(base):
        dir = os.path.join(base, d)
        if os.path.isdir(dir):
            module = do_find_file(dir, mod)
            if module:
                return module
    return None
606 def find_module(src_dir, dev_dir, modname):
607 mod = '%s.o' % (modname)
608 module = src_dir +'/'+ dev_dir +'/'+ mod
610 if os.access(module, os.R_OK):
616 # is the path a block device?
623 return stat.S_ISBLK(s[stat.ST_MODE])
625 # build fs according to type
# NOTE(review): truncated extraction -- lines 628-631 (size sanity check and
# presumably ``jopt = ''`` initialization), 636, 645 (``else:``), 647, 649
# (the ``if ret:`` guards) and 655 are missing here; verify against the full
# source.  Builds a filesystem of *fstype* on *dev*; *devsize* is in 1k
# blocks, *jsize* is the journal size (MB for ext3, blocks for reiserfs).
627 def mkfs(dev, devsize, fstype,jsize):
632 panic("size of filesystem on '%s' must be larger than 8MB, but is set to %s"%
634 # devsize is in 1k, and fs block count is in 4k
635 block_cnt = devsize/4
637 if fstype in ('ext3', 'extN'):
638 # ext3 journal size is in megabytes
639 if jsize: jopt = "-J size=%d" %(jsize,)
640 mkfs = 'mkfs.ext2 -j -b 4096 -F '
641 elif fstype == 'reiserfs':
642 # reiserfs journal size is in blocks
643 if jsize: jopt = "--journal_size %d" %(jsize,)
644 mkfs = 'mkreiserfs -ff'
# Unsupported fs types fall through here (presumably followed by a return
# or panic in the missing lines) -- TODO confirm.
646 print 'unsupported fs type: ', fstype
648 (ret, out) = run (mkfs, jopt, dev, block_cnt)
650 panic("Unable to build fs:", dev, string.join(out))
651 # enable hash tree indexing on fsswe
652 if fstype in ('ext3', 'extN'):
653 htree = 'echo "feature FEATURE_C5" | debugfs -w'
654 (ret, out) = run (htree, dev)
656 panic("Unable to enable htree:", dev)
658 # some systems use /dev/loopN, some /dev/loop/N
662 if not os.access(loop + str(0), os.R_OK):
664 if not os.access(loop + str(0), os.R_OK):
665 panic ("can't access loop devices")
668 # find loop device assigned to thefile
671 for n in xrange(0, MAX_LOOP_DEVICES):
673 if os.access(dev, os.R_OK):
674 (stat, out) = run('losetup', dev)
675 if out and stat == 0:
676 m = re.search(r'\((.*)\)', out[0])
677 if m and file == m.group(1):
683 # create file if necessary and assign the first free loop device
684 def init_loop(file, size, fstype, journal_size):
685 dev = find_loop(file)
687 print 'WARNING file:', file, 'already mapped to', dev
689 if config.reformat or not os.access(file, os.R_OK | os.W_OK):
691 panic("size of loopback file '%s' must be larger than 8MB, but is set to %s" % (file,size))
692 (ret, out) = run("dd if=/dev/zero bs=1k count=0 seek=%d of=%s" %(size,
695 panic("Unable to create backing store:", file)
696 mkfs(file, size, fstype, journal_size)
699 # find next free loop
700 for n in xrange(0, MAX_LOOP_DEVICES):
702 if os.access(dev, os.R_OK):
703 (stat, out) = run('losetup', dev)
705 run('losetup', dev, file)
708 print "out of loop devices"
710 print "out of loop devices"
713 # undo loop assignment
714 def clean_loop(file):
715 dev = find_loop(file)
717 ret, out = run('losetup -d', dev)
719 log('unable to clean loop device:', dev, 'for file:', file)
722 # determine if dev is formatted as a <fstype> filesystem
def need_format(fstype, dev):
    """Return true if *dev* must be formatted as *fstype* before use.

    Detection is not implemented (see FIXME), so this stub always reports
    0 -- i.e. formatting only ever happens via --reformat.  The truncated
    listing had lost the ``return 0``; this restores it.
    """
    # FIXME don't know how to implement this
    return 0
727 # initialize a block device if needed
728 def block_dev(dev, size, fstype, format, journal_size):
729 if config.noexec: return dev
730 if not is_block(dev):
731 dev = init_loop(dev, size, fstype, journal_size)
732 elif config.reformat or (need_format(fstype, dev) and format == 'yes'):
733 mkfs(dev, size, fstype, journal_size)
736 # panic("device:", dev,
737 # "not prepared, and autoformat is not set.\n",
738 # "Rerun with --reformat option to format ALL filesystems")
743 """lookup IP address for an interface"""
744 rc, out = run("/sbin/ifconfig", iface)
747 addr = string.split(out[1])[1]
748 ip = string.split(addr, ':')[1]
751 def get_local_nid(net_type, wildcard):
752 """Return the local nid."""
754 if os.access('/proc/elan/device0/position', os.R_OK):
755 local = get_local_address('elan', '*')
757 local = get_local_address(net_type, wildcard)
760 def get_local_address(net_type, wildcard):
761 """Return the local address for the network type."""
763 if net_type in ('tcp', 'toe'):
765 iface, star = string.split(wildcard, ':')
766 local = if2addr(iface)
768 panic ("unable to determine ip for:", wildcard)
770 host = socket.gethostname()
771 local = socket.gethostbyname(host)
772 elif net_type == 'elan':
773 # awk '/NodeId/ { print $2 }' '/proc/elan/device0/position'
775 fp = open('/proc/elan/device0/position', 'r')
776 lines = fp.readlines()
785 elif net_type == 'gm':
786 fixme("automatic local address for GM")
787 elif net_type == 'scimac':
788 scinode="/opt/scali/sbin/scinode"
789 if os.path.exists(scinode):
790 (rc,local) = run(scinode)
792 panic (scinode, " not found on node with scimac networking")
794 panic (scinode, " failed")
795 local=string.rstrip(local[0])
800 # XXX: instead of device_list, ask for $name and see what we get
801 def is_prepared(name):
802 """Return true if a device exists for the name"""
805 if config.noexec and config.cleanup:
808 # expect this format:
809 # 1 UP ldlm ldlm ldlm_UUID 2
810 out = lctl.device_list()
812 if name == string.split(s)[3]:
814 except CommandError, e:
818 def is_network_prepared():
819 """If the LDLM device exists, then assume that all networking
820 has been configured"""
821 return is_prepared('ldlm')
823 def fs_is_mounted(path):
824 """Return true if path is a mounted lustre filesystem"""
826 fp = open('/proc/mounts')
827 lines = fp.readlines()
831 if a[1] == path and a[2] == 'lustre_lite':
838 # ============================================================
839 # Classes to prepare and cleanup the various objects
842 """ Base class for the rest of the modules. The default cleanup method is
843 defined here, as well as some utilitiy funcs.
845 def __init__(self, module_name, db):
847 self.module_name = module_name
848 self.name = self.db.getName()
849 self.uuid = self.db.getUUID()
850 self.kmodule_list = []
854 def info(self, *args):
855 msg = string.join(map(str,args))
856 print self.module_name + ":", self.name, self.uuid, msg
859 """ default cleanup, used for most modules """
862 lctl.cleanup(self.name, self.uuid, config.force)
863 except CommandError, e:
864 log(self.module_name, "cleanup failed: ", self.name)
868 def add_portals_module(self, dev_dir, modname):
869 """Append a module to list of modules to load."""
870 self.kmodule_list.append((config.portals, dev_dir, modname))
872 def add_lustre_module(self, dev_dir, modname):
873 """Append a module to list of modules to load."""
874 self.kmodule_list.append((config.lustre, dev_dir, modname))
def mod_loaded(self, modname):
    """Check if a module is already loaded. Look in /proc/modules for it.

    Returns a (possibly empty) list of matches, so callers can use it as
    a boolean.  Closes the /proc handle via try/finally -- the visible
    listing leaked it -- and restores the lost ``return``.
    """
    fp = open('/proc/modules')
    try:
        lines = fp.readlines()
    finally:
        fp.close()
    # The first whitespace-separated token of each /proc/modules line is
    # the module name; keep only exact matches.  (Default-argument lambda
    # binds modname at definition time -- Python 1.5/2.x closure idiom.)
    ret = filter(lambda word, mod=modname: word == mod,
                 map(lambda line: string.split(line)[0], lines))
    return ret
886 def load_module(self):
887 """Load all the modules in the list in the order they appear."""
888 for src_dir, dev_dir, mod in self.kmodule_list:
889 # (rc, out) = run ('/sbin/lsmod | grep -s', mod)
890 if self.mod_loaded(mod) and not config.noexec:
892 log ('loading module:', mod, 'srcdir', src_dir, 'devdir', dev_dir)
894 module = find_module(src_dir, dev_dir, mod)
896 panic('module not found:', mod)
897 (rc, out) = run('/sbin/insmod', module)
899 raise CommandError('insmod', out, rc)
901 (rc, out) = run('/sbin/modprobe', mod)
903 raise CommandError('modprobe', out, rc)
905 def cleanup_module(self):
906 """Unload the modules in the list in reverse order."""
907 if not self.safe_to_clean():
909 rev = self.kmodule_list
911 for src_dir, dev_dir, mod in rev:
912 if not self.mod_loaded(mod) and not config.noexec:
915 if mod == 'portals' and config.dump:
916 lctl.dump(config.dump)
917 log('unloading module:', mod)
918 (rc, out) = run('/sbin/rmmod', mod)
920 log('! unable to unload module:', mod)
923 def safe_to_clean(self):
926 def safe_to_clean_modules(self):
927 return self.safe_to_clean()
929 class Network(Module):
930 def __init__(self,db):
931 Module.__init__(self, 'NETWORK', db)
932 self.net_type = self.db.get_val('nettype')
933 self.nid = self.db.get_val('nid', '*')
934 self.cluster_id = self.db.get_val('clusterid', "0")
935 self.port = self.db.get_val_int('port', 0)
936 self.send_mem = self.db.get_val_int('sendmem', DEFAULT_TCPBUF)
937 self.recv_mem = self.db.get_val_int('recvmem', DEFAULT_TCPBUF)
938 self.irq_affinity = self.db.get_val_int('irqaffinity', 0)
939 self.nid_exchange = self.db.get_val_int('nidexchange', 0)
942 if self.nid_exchange:
943 self.nid = get_local_nid(self.net_type, self.nid)
945 self.nid = get_local_address(self.net_type, self.nid)
947 panic("unable to set nid for", self.net_type, self.nid)
948 debug("nid:", self.nid)
950 self.hostaddr = self.db.get_val('hostaddr', self.nid)
951 if '*' in self.hostaddr:
952 self.hostaddr = get_local_address(self.net_type, self.hostaddr)
954 panic("unable to set nid for", self.net_type, self.hostaddr)
955 debug("hostaddr:", self.hostaddr)
957 self.add_portals_module("libcfs", 'portals')
958 if node_needs_router():
959 self.add_portals_module("router", 'kptlrouter')
960 if self.net_type == 'tcp':
961 self.add_portals_module("knals/socknal", 'ksocknal')
962 if self.net_type == 'toe':
963 self.add_portals_module("knals/toenal", 'ktoenal')
964 if self.net_type == 'elan':
965 self.add_portals_module("knals/qswnal", 'kqswnal')
966 if self.net_type == 'gm':
967 self.add_portals_module("knals/gmnal", 'kgmnal')
968 if self.net_type == 'scimac':
969 self.add_portals_module("knals/scimacnal", 'kscimacnal')
972 if is_network_prepared():
974 self.info(self.net_type, self.nid, self.port)
975 lctl.network(self.net_type, self.nid)
976 if self.port and node_is_router():
977 run_one_acceptor(self.port)
978 self.connect_peer_gateways()
980 def connect_peer_gateways(self):
981 for router in self.db.lookup_class('node'):
982 if router.get_val_int('router', 0):
983 # if this is a peer with a nid less than mine,
985 for netuuid in router.get_networks():
986 net = self.db.lookup(netuuid)
988 if (gw.cluster_id == self.cluster_id and
989 gw.net_type == self.net_type):
990 # hack: compare as numbers if possible, this should all
991 # go away once autoconnect is done.
992 # This also conveniently prevents us from connecting to ourself.
994 gw_nid = my_int(gw.nid)
995 self_nid = my_int(self.nid)
996 except ValueError, e:
999 if gw_nid < self_nid:
1002 def disconnect_peer_gateways(self):
1003 for router in self.db.lookup_class('node'):
1004 if router.get_val_int('router', 0):
1005 # if this is a peer with a nid less than mine,
1007 for netuuid in router.get_networks():
1008 net = self.db.lookup(netuuid)
1010 if (gw.cluster_id == self.cluster_id and
1011 gw.net_type == self.net_type):
1012 # hack: compare as numbers if possible, this should all
1013 # go away once autoconnect is done.
1014 # This also conveniently prevents us from connecting to ourself.
1016 gw_nid = my_int(gw.nid)
1017 self_nid = my_int(self.nid)
1018 except ValueError, e:
1021 if gw_nid < self_nid:
1023 lctl.disconnect(router.net_type, router.nid, router.port,
1025 except CommandError, e:
1026 print "disconnectAll failed: ", self.name
1030 def safe_to_clean(self):
1031 return not is_network_prepared()
1034 self.info(self.net_type, self.nid, self.port)
1036 stop_acceptor(self.port)
1037 if node_is_router():
1038 self.disconnect_peer_gateways()
1040 lctl.disconnectAll(self.net_type)
1041 except CommandError, e:
1042 print "disconnectAll failed: ", self.name
1046 class RouteTable(Module):
1047 def __init__(self,db):
1048 Module.__init__(self, 'ROUTES', db)
1050 if is_network_prepared():
1053 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1054 lctl.add_route(net_type, gw, lo, hi)
1055 if net_type in ('tcp', 'toe') and local_net_type(net_type) and lo == hi:
1056 srvdb = self.db.nid2server(lo, net_type)
1058 panic("no server for nid", lo)
1060 srv = Network(srvdb)
1063 def safe_to_clean(self):
1064 return not is_network_prepared()
1067 if is_network_prepared():
1068 # the network is still being used, don't clean it up
1070 for net_type, gw, gw_cluster_id, tgt_cluster_id, lo, hi in self.db.get_route_tbl():
1071 if net_type in ('tcp', 'toe') and local_net_type(net_type) and hi == '':
1072 srvdb = self.db.nid2server(lo, net_type)
1074 panic("no server for nid", lo)
1076 srv = Network(srvdb)
1078 lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
1079 except CommandError, e:
1080 print "disconnect failed: ", self.name
1084 lctl.del_route(net_type, gw, lo, hi)
1085 except CommandError, e:
1086 print "del_route failed: ", self.name
1091 def __init__(self,db):
1092 Module.__init__(self, 'LDLM', db)
1093 self.add_lustre_module('obdclass', 'obdclass')
1094 self.add_lustre_module('ptlrpc', 'ptlrpc')
1095 self.add_lustre_module('ldlm', 'ldlm')
1098 if is_prepared(self.name):
1101 lctl.newdev(attach="ldlm %s %s" % ('ldlm', 'ldlm_UUID'))
1103 def safe_to_clean(self):
1104 out = lctl.device_list()
1105 return len(out) <= 1
1108 if is_prepared(self.name):
1109 Module.cleanup(self)
1112 def __init__(self, db, uuid):
1113 Module.__init__(self, 'LOV', db)
1114 self.add_lustre_module('mdc', 'mdc')
1115 self.add_lustre_module('lov', 'lov')
1116 self.mds_uuid = self.db.get_first_ref('mds')
1117 mds= self.db.lookup(self.mds_uuid)
1118 self.mds_name = mds.getName()
1119 self.stripe_sz = self.db.get_val_int('stripesize', 65536)
1120 self.stripe_off = self.db.get_val_int('stripeoffset', 0)
1121 self.pattern = self.db.get_val_int('stripepattern', 0)
1122 self.devlist = self.db.get_refs('obd')
1123 self.stripe_cnt = self.db.get_val_int('stripecount', len(self.devlist))
1125 self.client_uuid = generate_client_uuid(self.name)
1127 self.mdc = get_mdc(db, self.client_uuid, self.name, self.mds_uuid)
1128 for obd_uuid in self.devlist:
1129 obd = self.db.lookup(obd_uuid)
1130 osc = get_osc(obd, self.client_uuid, self.name)
1132 self.osclist.append(osc)
1134 panic('osc not found:', obd_uuid)
1137 if is_prepared(self.name):
1139 for osc in self.osclist:
1141 # Only ignore connect failures with --force, which
1142 # isn't implemented here yet.
1143 osc.prepare(ignore_connect_failure=0)
1144 except CommandError, e:
1145 print "Error preparing OSC %s (inactive)\n" % osc.uuid
1148 self.mdc_name = self.mdc.name
1149 self.info(self.mds_uuid, self.stripe_cnt, self.stripe_sz,
1150 self.stripe_off, self.pattern, self.devlist, self.mds_name)
1151 lctl.newdev(attach="lov %s %s" % (self.name, self.uuid),
1152 setup ="%s" % (self.mdc_name))
1155 if is_prepared(self.name):
1156 Module.cleanup(self)
1157 for osc in self.osclist:
1159 mdc = get_mdc(self.db, self.client_uuid, self.name, self.mds_uuid)
1162 def load_module(self):
1163 for osc in self.osclist:
1166 Module.load_module(self)
1168 def cleanup_module(self):
1169 Module.cleanup_module(self)
1170 for osc in self.osclist:
1171 osc.cleanup_module()
1174 class LOVConfig(Module):
1175 def __init__(self,db):
1176 Module.__init__(self, 'LOVConfig', db)
1178 self.lov_uuid = self.db.get_first_ref('lov')
1179 l = self.db.lookup(self.lov_uuid)
1180 self.lov = LOV(l, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
1184 self.info(lov.mds_uuid, lov.stripe_cnt, lov.stripe_sz, lov.stripe_off,
1185 lov.pattern, lov.devlist, lov.mds_name)
1186 lctl.lov_setconfig(lov.uuid, lov.mds_name, lov.stripe_cnt,
1187 lov.stripe_sz, lov.stripe_off, lov.pattern,
1188 string.join(lov.devlist))
1194 class MDSDEV(Module):
1195 def __init__(self,db):
1196 Module.__init__(self, 'MDSDEV', db)
1197 self.devpath = self.db.get_val('devpath','')
1198 self.size = self.db.get_val_int('devsize', 0)
1199 self.journal_size = self.db.get_val_int('journalsize', 0)
1200 self.fstype = self.db.get_val('fstype', '')
1201 self.nspath = self.db.get_val('nspath', '')
1202 # overwrite the orignal MDSDEV name and uuid with the MDS name and uuid
1203 target_uuid = self.db.get_first_ref('target')
1204 mds = self.db.lookup(target_uuid)
1205 self.name = mds.getName()
1206 self.lovconfig_uuids = mds.get_refs('lovconfig')
1207 self.filesystem_uuids = mds.get_refs('filesystem')
1208 # FIXME: if fstype not set, then determine based on kernel version
1209 self.format = self.db.get_val('autoformat', "no")
1210 if mds.get_val('failover', 0):
1211 self.failover_mds = 'f'
1213 self.failover_mds = 'n'
1214 active_uuid = get_active_target(mds)
1216 panic("No target device found:", target_uuid)
1217 if active_uuid == self.uuid:
1221 if self.active and config.group and config.group != ost.get_val('group'):
1224 self.target_dev_uuid = self.uuid
1225 self.uuid = target_uuid
1227 self.add_lustre_module('mds', 'mds')
1229 self.add_lustre_module('obdclass', 'fsfilt_%s' % (self.fstype))
1231 def load_module(self):
1233 Module.load_module(self)
1236 if is_prepared(self.name):
1239 debug(self.uuid, "not active")
1241 self.info(self.devpath, self.fstype, self.format)
1243 blkdev = block_dev(self.devpath, self.size, self.fstype, self.format,
1245 if not is_prepared('MDT'):
1246 lctl.newdev(attach="mdt %s %s" % ('MDT', 'MDT_UUID'),
1249 run ("mkdir", self.nspath)
1250 lctl.newdev(attach="mds %s %s" % (self.name, self.uuid),
1251 setup ="%s %s %s" %(blkdev, self.fstype, self.nspath))
1252 for uuid in self.lovconfig_uuids:
1253 db = self.db.lookup(uuid)
1254 lovconfig = LOVConfig(db)
1256 if config.mds_ost_conn:
1257 for uuid in self.filesystem_uuids:
1258 log("open clients for filesystem:", uuid)
1259 fs = self.db.lookup(uuid)
1260 obd_uuid = fs.get_first_ref('obd')
1261 client_uuid = generate_client_uuid(self.name)
1262 client = VOSC(client_uuid, self.db.lookup(obd_uuid), self.name)
1266 def msd_remaining(self):
1267 out = lctl.device_list()
1269 if string.split(s)[2] in ('mds',):
1272 def safe_to_clean(self):
1275 def safe_to_clean_modules(self):
1276 return not self.msd_remaining()
1280 debug(self.uuid, "not active")
1282 if is_prepared(self.name):
1285 lctl.cleanup(self.name, self.uuid, config.force,
1287 except CommandError, e:
1288 log(self.module_name, "cleanup failed: ", self.name)
1291 Module.cleanup(self)
1292 if config.mds_ost_conn:
1293 for uuid in self.filesystem_uuids:
1294 log("clean clients for filesystem:", uuid)
1295 log("open clients for filesystem:", uuid)
1296 fs = self.db.lookup(uuid)
1297 obd_uuid = fs.get_first_ref('obd')
1298 client = VOSC(self.db.lookup(obd_uuid), self.name)
1300 if not self.msd_remaining() and is_prepared('MDT'):
1302 lctl.cleanup("MDT", "MDT_UUID", config.force,
1304 except CommandError, e:
1305 print "cleanup failed: ", self.name
1308 clean_loop(self.devpath)
1311 def __init__(self, db):
1312 Module.__init__(self, 'OSD', db)
1313 self.osdtype = self.db.get_val('osdtype')
1314 self.devpath = self.db.get_val('devpath', '')
1315 self.size = self.db.get_val_int('devsize', 0)
1316 self.journal_size = self.db.get_val_int('journalsize', 0)
1317 self.fstype = self.db.get_val('fstype', '')
1318 self.nspath = self.db.get_val('nspath', '')
1319 target_uuid = self.db.get_first_ref('target')
1320 ost = self.db.lookup(target_uuid)
1321 self.name = ost.getName()
1322 self.format = self.db.get_val('autoformat', 'yes')
1323 if ost.get_val('failover', 0):
1324 self.failover_ost = 'f'
1326 self.failover_ost = 'n'
1328 active_uuid = get_active_target(ost)
1330 panic("No target device found:", target_uuid)
1331 if active_uuid == self.uuid:
1335 if self.active and config.group and config.group != ost.get_val('group'):
1338 self.target_dev_uuid = self.uuid
1339 self.uuid = target_uuid
1341 self.add_lustre_module('ost', 'ost')
1342 # FIXME: should we default to ext3 here?
1344 self.add_lustre_module('obdclass' , 'fsfilt_%s' % (self.fstype))
1345 self.add_lustre_module(self.osdtype, self.osdtype)
1347 def load_module(self):
1349 Module.load_module(self)
1351 # need to check /proc/mounts and /etc/mtab before
1352 # formatting anything.
1353 # FIXME: check if device is already formatted.
1355 if is_prepared(self.name):
1358 debug(self.uuid, "not active")
1360 self.info(self.osdtype, self.devpath, self.size, self.fstype,
1361 self.format, self.journal_size)
1363 if self.osdtype == 'obdecho':
1366 blkdev = block_dev(self.devpath, self.size, self.fstype,
1367 self.format, self.journal_size)
1369 run ("mkdir", self.nspath)
1370 lctl.newdev(attach="%s %s %s" % (self.osdtype, self.name, self.uuid),
1371 setup ="%s %s %s %s" %(blkdev, self.fstype,
1372 self.failover_ost, self.nspath))
1373 if not is_prepared('OSS'):
1374 lctl.newdev(attach="ost %s %s" % ('OSS', 'OSS_UUID'),
1377 def osd_remaining(self):
1378 out = lctl.device_list()
1380 if string.split(s)[2] in ('obdfilter', 'obdecho'):
1383 def safe_to_clean(self):
1386 def safe_to_clean_modules(self):
1387 return not self.osd_remaining()
1391 debug(self.uuid, "not active")
1393 if is_prepared(self.name):
1396 lctl.cleanup(self.name, self.uuid, config.force,
1398 except CommandError, e:
1399 log(self.module_name, "cleanup failed: ", self.name)
1402 if not self.osd_remaining() and is_prepared('OSS'):
1404 lctl.cleanup("OSS", "OSS_UUID", config.force,
1406 except CommandError, e:
1407 print "cleanup failed: ", self.name
1410 if not self.osdtype == 'obdecho':
1411 clean_loop(self.devpath)
1413 # Generic client module, used by OSC and MDC
1414 class Client(Module):
1415 def __init__(self, tgtdb, uuid, module, owner):
1416 self.target_name = tgtdb.getName()
1417 self.target_uuid = tgtdb.getUUID()
1420 self.tgt_dev_uuid = get_active_target(tgtdb)
1421 if not self.tgt_dev_uuid:
1422 panic("No target device found for target:", self.target_name)
1424 self.kmodule_list = []
1428 self.module = module
1429 self.module_name = string.upper(module)
1430 self.name = '%s_%s_%s_%s' % (self.module_name, socket.gethostname(),
1431 self.target_name, owner)
1433 self.lookup_server(self.tgt_dev_uuid)
1434 self.add_lustre_module(module, module)
1436 def lookup_server(self, srv_uuid):
1437 """ Lookup a server's network information """
1438 self._server_nets = get_ost_net(self.db, srv_uuid)
1439 if len(self._server_nets) == 0:
1440 panic ("Unable to find a server for:", srv_uuid)
1442 def get_servers(self):
1443 return self._server_nets
    # Connect this client to its server: pick a local route or a gateway
    # route, then create the client device via lctl.
    # NOTE(review): extraction gaps — the branch structure (local vs.
    # routed connect, the try/except around the connect) is incomplete
    # below; indentation is reconstructed, verify against full source.
    def prepare(self, ignore_connect_failure = 0):
        self.info(self.target_uuid)
        if is_prepared(self.name):
        # prefer a server on a directly reachable network
        srv = choose_local_server(self.get_servers())
        # otherwise go through a portals router
        srv, r = find_route(self.get_servers())
        lctl.add_route_host(r[0], srv.uuid, r[1], r[3])
        panic ("no route to", self.target_uuid)
        except CommandError, e:
        if not ignore_connect_failure:
        lctl.newdev(attach="%s %s %s" % (self.module, self.name, self.uuid),
                    setup ="%s %s" %(self.target_uuid, srv.uuid))
        # --- remainder of Client.cleanup() (its `def` line is lost in this
        # extract): detach the device, disconnect from the server, and drop
        # any gateway route that was added in prepare().  Failures are
        # logged, not fatal.  Indentation reconstructed around gaps. ---
        if is_prepared(self.name):
            Module.cleanup(self)
            srv = choose_local_server(self.get_servers())
                lctl.disconnect(srv.net_type, srv.nid, srv.port, srv.uuid)
                srv, r = find_route(self.get_servers())
                lctl.del_route_host(r[0], srv.uuid, r[1], r[2])
            except CommandError, e:
                log(self.module_name, "cleanup failed: ", self.name)
    # __init__ of the metadata-client class (its `class ...(Client)` header
    # is outside this extract; presumably `class MDC(Client)`).
    def __init__(self, db, uuid, owner):
        """A Client speaking the 'mdc' protocol to an MDS target."""
        Client.__init__(self, db, uuid, 'mdc', owner)
    # __init__ of the object-storage-client class (its `class ...(Client)`
    # header is outside this extract; presumably `class OSC(Client)`).
    def __init__(self, db, uuid, owner):
        """A Client speaking the 'osc' protocol to an OST target."""
        Client.__init__(self, db, uuid, 'osc', owner)
    # __init__ of the cache-OBD class (class header line not in this
    # extract).  Order matters: Module.__init__ must run first so self.db
    # is available for the get_first_ref calls.
    def __init__(self, db):
        """COBD stacks a caching OBD in front of a real OBD; record the
        UUIDs of both halves and queue the 'cobd' kernel module."""
        Module.__init__(self, 'COBD', db)
        self.real_uuid = self.db.get_first_ref('realobd')
        self.cache_uuid = self.db.get_first_ref('cacheobd')
        self.add_lustre_module('cobd' , 'cobd')
    # need to check /proc/mounts and /etc/mtab before
    # formatting anything.
    # FIXME: check if device is already formatted.
    # NOTE(review): the `def prepare(self):` line (and the early-return
    # body of the is_prepared guard) are missing from this extract.
        if is_prepared(self.name):
        self.info(self.real_uuid, self.cache_uuid)
        # create the cobd device wiring real OBD and cache OBD together
        lctl.newdev(attach="cobd %s %s" % (self.name, self.uuid),
                    setup ="%s %s" %(self.real_uuid, self.cache_uuid))
# virtual interface for OSC and LOV
# NOTE(review): the `class VOSC(Module):` header and several method `def`
# lines (get_uuid, get_name, need_mdc presumably) are missing from this
# extract; the orphaned `return` lines below belong to those methods.
    def __init__(self, db, uuid, owner):
        """Wrap either a LOV (striped) or a plain OSC behind one interface."""
        Module.__init__(self, 'VOSC', db)
        if db.get_class() == 'lov':
            self.osc = LOV(db, uuid)
            # (else-branch line missing) plain OSC case:
            self.osc = get_osc(db, uuid, owner)
        # body of the missing get_uuid() def:
        return self.osc.uuid
        # body of the missing get_name() def:
        return self.osc.name
    def load_module(self):
        self.osc.load_module()
    def cleanup_module(self):
        self.osc.cleanup_module()
        # body of the missing need_mdc() def — LOV carries its own MDC:
        return self.db.get_class() != 'lov'
    def get_mdc_name(self):
        # only meaningful for the LOV case; non-LOV branch missing here
        if self.db.get_class() == 'lov':
            return self.osc.mdc_name
class ECHO_CLIENT(Module):
    """Echo client device for testing: drives an obdecho/OSC stack.

    NOTE(review): the `def prepare` and `def cleanup` lines are missing
    from this extract; the orphaned statements below belong to them.
    """
    def __init__(self,db):
        Module.__init__(self, 'ECHO_CLIENT', db)
        self.add_lustre_module('obdecho', 'obdecho')
        self.obd_uuid = self.db.get_first_ref('obd')
        obd = self.db.lookup(self.obd_uuid)
        self.osc = VOSC(obd, self.uuid, self.name)
        # --- body of the missing prepare() ---
        if is_prepared(self.name):
        self.osc.prepare() # XXX This is so cheating. -p
        self.info(self.obd_uuid)
        lctl.newdev(attach="echo_client %s %s" % (self.name, self.uuid),
                    setup = self.osc.get_name())
        # --- body of the missing cleanup() ---
        if is_prepared(self.name):
            Module.cleanup(self)
    def load_module(self):
        self.osc.load_module()
        Module.load_module(self)
    def cleanup_module(self):
        # unload in reverse order of load
        Module.cleanup_module(self)
        self.osc.cleanup_module()
def generate_client_uuid(name):
    """Build a pseudo-unique client UUID embedding up to 19 chars of *name*.

    NOTE(review): the format string consumes four values but only three
    random ones are visible — the line passing `name` (matching %.19s) is
    missing from this extract; confirm against the full source.
    """
    client_uuid = '%05x_%.19s_%05x%05x' % (int(random.random() * 1048576),
                                           int(random.random() * 1048576),
                                           int(random.random() * 1048576))
    # UUIDs are capped at 36 characters
    return client_uuid[:36]
class Mountpoint(Module):
    """A client filesystem mountpoint (llite + VOSC [+ MDC]).

    NOTE(review): the `def prepare` and `def cleanup` lines, and several
    guard/else lines, are missing from this extract; orphaned statements
    below belong to those methods — indentation is reconstructed.
    """
    def __init__(self,db):
        Module.__init__(self, 'MTPT', db)
        self.path = self.db.get_val('path')
        self.fs_uuid = self.db.get_first_ref('filesystem')
        fs = self.db.lookup(self.fs_uuid)
        self.mds_uuid = fs.get_first_ref('mds')
        self.obd_uuid = fs.get_first_ref('obd')
        obd = self.db.lookup(self.obd_uuid)
        client_uuid = generate_client_uuid(self.name)
        self.vosc = VOSC(obd, client_uuid, self.name)
        # plain OSC needs its own MDC; a LOV carries one already
        if self.vosc.need_mdc():
            self.add_lustre_module('mdc', 'mdc')
            self.mdc = get_mdc(db, client_uuid, self.name, self.mds_uuid)
        self.add_lustre_module('llite', 'llite')
        # --- body of the missing prepare() ---
        if fs_is_mounted(self.path):
            log(self.path, "already mounted.")
        if self.vosc.need_mdc():
            mdc_name = self.mdc.name
            # (else branch) LOV case: MDC name comes from the LOV
            mdc_name = self.vosc.get_mdc_name()
            panic("Unable to determine MDC name. Probably need to cleanup before re-mounting.")
        self.info(self.path, self.mds_uuid, self.obd_uuid)
        if config.lctl_dump:
            cmd = "osc=%s,mdc=%s" % (self.vosc.get_name(), mdc_name)
            lctl.mount_option(cmd)
        cmd = "mount -t lustre_lite -o osc=%s,mdc=%s %s %s" % \
              (self.vosc.get_name(), mdc_name, config.config, self.path)
        run("mkdir", self.path)
        if self.vosc.need_mdc():
            panic("mount failed:", self.path, ":", string.join(val))
        # --- body of the missing cleanup() ---
        self.info(self.path, self.mds_uuid,self.obd_uuid)
        if fs_is_mounted(self.path):
            # forced unmount first when --force is in effect
            (rc, out) = run("umount", "-f", self.path)
            (rc, out) = run("umount", self.path)
            raise CommandError('umount', out, rc)
        if fs_is_mounted(self.path):
            panic("fs is still mounted:", self.path)
        if self.vosc.need_mdc():
    def load_module(self):
        self.vosc.load_module()
        Module.load_module(self)
    def cleanup_module(self):
        # unload in reverse order of load
        Module.cleanup_module(self)
        self.vosc.cleanup_module()
1644 # ============================================================
1645 # misc query functions
1647 def get_ost_net(self, osd_uuid):
1651 osd = self.lookup(osd_uuid)
1652 node_uuid = osd.get_first_ref('node')
1653 node = self.lookup(node_uuid)
1655 panic("unable to find node for osd_uuid:", osd_uuid,
1656 " node_ref:", node_uuid)
1657 for net_uuid in node.get_networks():
1658 db = node.lookup(net_uuid)
1659 srv_list.append(Network(db))
# the order of initialization is based on level.
# Map a service class to its startup level; services are configured in
# ascending level order and cleaned up in descending order.
# NOTE(review): the numeric `ret = ...` lines for each branch and the
# final return are missing from this extract (see the --maxlevel help
# text below for the approximate level values).
def getServiceLevel(self):
    type = self.get_class()
    if type in ('network',):
    elif type in ('routetbl',):
    elif type in ('ldlm',):
    elif type in ('osd', 'cobd'):
    elif type in ('mdsdev',):
    elif type in ('mountpoint', 'echoclient'):
        panic("Unknown type: ", type)
    # services outside the requested --minlevel/--maxlevel window get
    # filtered out here (the adjusted return value line is missing)
    if ret < config.minlevel or ret > config.maxlevel:
# return list of services in a profile. list is a list of tuples
# [(level, db_object),]
# NOTE(review): list initialization, the sort, and the return are missing
# from this extract.
def getServices(self):
    for ref_class, ref_uuid in self.get_all_refs():
        servdb = self.lookup(ref_uuid)
            level = getServiceLevel(servdb)
            list.append((level, servdb))
            panic('service not found: ' + ref_uuid)
############################################################
# FIXME: clean this mess up!
# OSC is no longer in the xml, so we have to fake it.
# this is getting ugly and begging for another refactoring
# NOTE(review): the `return osc` / `return mdc` lines and the
# `if not mds_db:` guard are missing from this extract.
def get_osc(ost_db, uuid, owner):
    osc = OSC(ost_db, uuid, owner)
def get_mdc(db, uuid, owner, mds_uuid):
    mds_db = db.lookup(mds_uuid);
        panic("no mds:", mds_uuid)
    mdc = MDC(mds_db, uuid, owner)
############################################################
# routing ("rooting")
# list of (nettype, cluster_id)
# Record the (net_type, cluster_id) pairs this node belongs to, and set
# up an acceptor per listening port.
# NOTE(review): the line binding `srv` (presumably a Network built from
# `net`) and the `if srv.port:` guard are missing from this extract.
def find_local_clusters(node_db):
    global local_clusters
    for netuuid in node_db.get_networks():
        net = node_db.lookup(netuuid)
        debug("add_local", netuuid)
        local_clusters.append((srv.net_type, srv.cluster_id))
        # one acceptor per distinct TCP port
        if acceptors.has_key(srv.port):
            panic("duplicate port:", srv.port)
        acceptors[srv.port] = AcceptorHandler(srv.port, srv.net_type,
                                              srv.send_mem, srv.recv_mem,
# This node is a gateway.
# NOTE(review): the body is missing from this extract; presumably it
# returns the module-level `is_router` flag — confirm against full source.
def node_is_router():
# If there are any routers found in the config, then this will be true
# and all nodes will load kptlrouter.
def node_needs_router():
    """kptlrouter must be loaded either when the config contains routers
    (needs_router) or when this node is itself a gateway (is_router)."""
    result = needs_router or is_router
    return result
# list of (nettype, gw, tgt_cluster_id, lo, hi)
# Currently, these local routes are only added to kptlrouter route
# table if they are needed to connect to a specific server.  This
# should be changed so all available routes are loaded, and the
# ptlrouter can make all the decisions.
# NOTE(review): the loop over `list` binding `router`, and the lines
# setting needs_router, are missing from this extract.
def find_local_routes(lustre):
    """Scan the lustre config looking for routers.  Build the list of
    routes reachable from this node's local clusters."""
    global local_routes, needs_router
    list = lustre.lookup_class('node')
    if router.get_val_int('router', 0):
        for (local_type, local_cluster_id) in local_clusters:
            for netuuid in router.get_networks():
                db = router.lookup(netuuid)
                # the router must sit on one of our local networks
                if (local_type == db.get_val('nettype') and
                    local_cluster_id == db.get_val('clusterid')):
                    gw = db.get_val('nid')
    debug("find_local_routes: gw is", gw)
    for route in router.get_local_routes(local_type, gw):
        local_routes.append(route)
    debug("find_local_routes:", local_routes)
# Pick the first server reachable on a directly attached network.
# NOTE(review): the `return srv` / `return None` lines are missing from
# this extract.
def choose_local_server(srv_list):
    for srv in srv_list:
        if local_net_type(srv.net_type):
# True when this node has a network of the given type.
# NOTE(review): the `return 1` / `return 0` lines are missing.
def local_net_type(net_type):
    for cluster in local_clusters:
        if net_type == cluster[0]:
# Find a (server, route) pair through a portals gateway: the server's
# address must fall inside a known route's [lo, hi] NID range and match
# its target cluster id.
# NOTE(review): the `return srv, r` and failure-return lines are missing
# from this extract.
def find_route(srv_list):
    frm_type = local_clusters[0][0]
    for srv in srv_list:
        debug("find_route: srv:", srv.hostaddr, "type: ", srv.net_type)
        to_type = srv.net_type
        to = srv.hostaddr  # XXX should this be hostaddr, or nid?
        cluster_id = srv.cluster_id
        debug ('looking for route to', to_type, to)
        for r in local_routes:
            debug("find_route: ", r)
            # r = (nettype, gw, tgt_cluster_id, lo, hi)
            if (r[3] <= to and to <= r[4]) and cluster_id == r[2]:
# Resolve which device actually serves a target: an explicit --select
# choice wins, otherwise the config's 'active' reference.
# NOTE(review): the if/else around the two assignments and the final
# return of tgt_dev_uuid are missing from this extract.
def get_active_target(db):
    target_uuid = db.getUUID()
    target_name = db.getName()
    node_name = get_select(target_name)
    tgt_dev_uuid = db.get_node_tgt_dev(node_name, target_uuid)
    tgt_dev_uuid = db.get_first_ref('active')
############################################################
# Service-class -> Module-object factory.
# NOTE(review): the `def newService(db):` line, the first if branch, and
# the per-branch constructor calls are missing from this extract; also
# note `type` shadows the builtin of the same name.
    type = db.get_class()
    debug('Service:', type, db.getName(), db.getUUID())
    n = LOV(db, "YOU_SHOULD_NEVER_SEE_THIS_UUID")
    elif type == 'network':
    elif type == 'routetbl':
    elif type == 'cobd':
    elif type == 'mdsdev':
    elif type == 'mountpoint':
    elif type == 'echoclient':
        panic ("unknown service type:", type)
# Prepare the system to run lustre using a particular profile
# in the configuration.
#  * load the modules
#  * setup networking for the current node
#  * make sure partitions are in place and prepared
#  * initialize devices with lctl
# Levels are important, and need to be enforced.
# NOTE(review): the `if not prof_db:` guard and the `operation(services)`
# call are missing from this extract.  Also NOTE(review): the panic below
# references `profile`, which is not bound anywhere visible — looks like
# it should be `prof_uuid`; confirm against the full source.
def for_each_profile(db, prof_list, operation):
    for prof_uuid in prof_list:
        prof_db = db.lookup(prof_uuid)
            panic("profile:", profile, "not found.")
        services = getServices(prof_db)
# The four per-profile operations passed to for_each_profile().
# NOTE(review): the config.nosetup/nomod early returns, the loops over
# `services` binding `s`, and the prepare/cleanup/load/unload calls on
# `n` are missing from this extract.
def doSetup(services):
    n = newService(s[1])
def doModules(services):
    n = newService(s[1])
def doCleanup(services):
    # cleanup runs in reverse service order (loop line missing)
    n = newService(s[1])
    if n.safe_to_clean():
def doUnloadModules(services):
    n = newService(s[1])
    if n.safe_to_clean_modules():
# Drive the whole configure/cleanup/recovery flow for the first hostname
# in *hosts* that has a node entry in the config.
# NOTE(review): heavily gapped in this extract — the loop over `hosts`
# binding `h`, the recovery/cleanup/setup branch keywords, and various
# guards (--nosetup, --nomod, net_type == 'tcp', config.gdb) are missing;
# indentation below is reconstructed, verify against the full source.
def doHost(lustreDB, hosts):
    node_db = lustreDB.lookup_name(h, 'node')
        print 'No host entry found.'
    # per-node settings, overridable from the command line later
    is_router = node_db.get_val_int('router', 0)
    lustre_upcall = node_db.get_val('lustreUpcall', '')
    portals_upcall = node_db.get_val('portalsUpcall', '')
    timeout = node_db.get_val_int('timeout', 0)
    find_local_clusters(node_db)
    find_local_routes(lustreDB)
    # Two step process: (1) load modules, (2) setup lustre
    # if not cleaning, load modules first.
    prof_list = node_db.get_refs('profile')
    # --- recovery branch ---
    if not (config.tgt_uuid and config.client_uuid and config.conn_uuid):
        raise Lustre.LconfError( "--recovery requires --tgt_uuid <UUID> " +
                                 "--client_uuid <UUID> --conn_uuid <UUID>")
    doRecovery(lustreDB, lctl, config.tgt_uuid, config.client_uuid,
    # --- cleanup branch ---
    elif config.cleanup:
        # the command line can override this value
        # ugly hack, only need to run lctl commands for --dump
        if config.lctl_dump:
            for_each_profile(node_db, prof_list, doCleanup)
        sys_set_timeout(timeout)
        sys_set_lustre_upcall(lustre_upcall)
        sys_set_portals_upcall(portals_upcall)
        for_each_profile(node_db, prof_list, doCleanup)
        for_each_profile(node_db, prof_list, doUnloadModules)
    # --- setup branch ---
        # ugly hack, only need to run lctl commands for --dump
        if config.lctl_dump:
            for_each_profile(node_db, prof_list, doSetup)
        # bump socket buffer ceilings for TCP nets
        sys_set_netmem_max('/proc/sys/net/core/rmem_max', MAXTCPBUF)
        sys_set_netmem_max('/proc/sys/net/core/wmem_max', MAXTCPBUF)
        for_each_profile(node_db, prof_list, doModules)
        sys_set_debug_path()
        script = config.gdb_script
        run(lctl.lctl, ' modules >', script)
        log ("The GDB module script is in", script)
        # pause, so user has time to break and
        sys_set_timeout(timeout)
        sys_set_lustre_upcall(lustre_upcall)
        sys_set_portals_upcall(portals_upcall)
        for_each_profile(node_db, prof_list, doSetup)
# Reconnect a failed client connection to the currently active target.
# NOTE(review): the `if not tgt:` / `if not new_uuid:` / `if not net:`
# guard lines preceding the raises are missing from this extract.
def doRecovery(db, lctl, tgt_uuid, client_uuid, conn_uuid):
    tgt = db.lookup(tgt_uuid)
        raise Lustre.LconfError("doRecovery: "+ tgt_uuid +" not found.")
    # failover may have moved the target; find where it lives now
    new_uuid = get_active_target(tgt)
        raise Lustre.LconfError("doRecovery: no active target found for: " +
    net = choose_local_server(get_ost_net(db, new_uuid))
        raise Lustre.LconfError("Unable to find a connection to:" + new_uuid)
    # XXX, better to do a full disconnect here
    log("Reconnecting", tgt_uuid, " to ", net.uuid);
    # drop the stale connection, then point the client at the new server
    lctl.del_uuid(conn_uuid)
    lctl.recover(client_uuid, net.uuid)
# Normalize config.lustre / config.portals so modules can be loaded from
# a source tree when running in development mode.
# NOTE(review): the `if config.portals:` guard before the command-line
# override is missing from this extract.
def setupModulePath(cmd, portals_dir = PORTALS_DIR):
    base = os.path.dirname(cmd)
    if development_mode():
        if not config.lustre:
            config.lustre = (os.path.join(base, ".."))
        # normalize the portals dir, using command line arg if set
        portals_dir = config.portals
        dir = os.path.join(config.lustre, portals_dir)
        config.portals = dir
        debug('config.portals', config.portals)
    elif config.lustre and config.portals:
        # if --lustre and --portals, normalize portals
        # can ignore PORTALS_DIR here, since it is probably useless here
        config.portals = os.path.join(config.lustre, config.portals)
        debug('config.portals B', config.portals)
# Write *val* to /proc/sys/<path>.
# NOTE(review): the --noexec guard, the try/except around the open, the
# fp.write and fp.close lines are missing from this extract.
def sysctl(path, val):
    debug("+ sysctl", path, val)
    fp = open(os.path.join('/proc/sys', path), 'w')
def sys_set_debug_path():
    """Point the kernel's crash-dump log location at the configured
    --debug_path value via /proc/sys/portals/debug_path."""
    destination = config.debug_path
    sysctl('portals/debug_path', destination)
def sys_set_lustre_upcall(upcall):
    # the command overrides the value in the node config
    # NOTE(review): the `elif config.upcall:` line and the final
    # `if upcall:` guard are missing from this extract — the two
    # assignments below are NOT both unconditional in the full source.
    if config.lustre_upcall:
        upcall = config.lustre_upcall
    upcall = config.upcall
    sysctl('lustre/upcall', upcall)
def sys_set_portals_upcall(upcall):
    # the command overrides the value in the node config
    # NOTE(review): same gaps as sys_set_lustre_upcall above — the
    # `elif config.upcall:` line and a final `if upcall:` guard are
    # missing from this extract.
    if config.portals_upcall:
        upcall = config.portals_upcall
    upcall = config.upcall
    sysctl('portals/upcall', upcall)
def sys_set_timeout(timeout):
    """Set the lustre recovery timeout (seconds) via /proc sysctl.

    The --timeout command line option overrides the node-config value
    passed in; nothing is written unless a positive timeout is known.
    """
    # the command overrides the value in the node config
    if config.timeout > 0:
        timeout = config.timeout
    # idiom fix: compare to None with `is not None` rather than `!=`
    if timeout is not None and timeout > 0:
        sysctl('lustre/timeout', timeout)
# Evaluate the --ptldebug expression against the ptldebug_names mask
# table and write the result to /proc/sys/portals/debug.
# NOTE(review): the `try:` line and the except body are missing from
# this extract.  eval() here runs an admin-supplied command-line string;
# acceptable for a root-run config tool, but not for untrusted input.
def sys_set_ptldebug():
    if config.ptldebug != None:
        val = eval(config.ptldebug, ptldebug_names)
        val = "0x%x" % (val,)
        sysctl('portals/debug', val)
    except NameError, e:
2060 def sys_set_subsystem():
2061 if config.subsystem != None:
2063 val = eval(config.ptldebug, ptldebug_names)
2064 val = "0x%x" % (val,)
2065 sysctl('portals/subsystem_debug', val)
2066 except NameError, e:
# Raise a /proc/sys/net/core/*mem_max ceiling to at least *max* bytes.
# NOTE(review): the lines reading the current value (and skipping the
# write when it is already large enough), plus fp.close(), are missing
# from this extract.  Also note the parameter `max` shadows the builtin.
def sys_set_netmem_max(path, max):
    debug("setting", path, "to at least", max)
    fp = open(path, 'w')
    fp.write('%d\n' %(max))
def sys_make_devices():
    """Create the portals and obd character device nodes (major 10,
    minors 240/241) when they do not already exist."""
    for node, minor in (('/dev/portals', 240), ('/dev/obd', 241)):
        if not os.access(node, os.R_OK):
            run('mknod %s c 10 %d' % (node, minor))
# Add dir to the global PATH, if not already there.
# NOTE(review): the early `return` inside the membership check is missing
# from this extract (without it the append would be unconditional).
def add_to_path(new_dir):
    syspath = string.split(os.environ['PATH'], ':')
    if new_dir in syspath:
    os.environ['PATH'] = os.environ['PATH'] + ':' + new_dir
def default_debug_path():
    """Return the default debug-dump file path, preferring the /r
    ramdisk prefix when that directory exists."""
    path = '/tmp/lustre-log'
    if os.path.isdir('/r'):
        return '/r' + path
    else:
        # FIX: the visible code fell off the end here and returned None
        # when /r is absent; mirror default_gdb_script's structure.
        return path
def default_gdb_script():
    """Return the default gdb module-script path, preferring the /r
    ramdisk prefix when that directory exists."""
    script = '/tmp/ogdb'
    if os.path.isdir('/r'):
        return '/r' + script
    else:
        # FIX: supply the fallback return so callers never receive None
        return script
# Directories every invocation should be able to find tools in.
DEFAULT_PATH = ('/sbin', '/usr/sbin', '/bin', '/usr/bin')
# ensure basic elements are in the system path
# NOTE(review): the loop body (presumably add_to_path(dir)) is missing
# from this extract.
def sanitise_path():
    for dir in DEFAULT_PATH:
# global hack for the --select handling
# Parse "service=nodeA,service2=nodeB" into the tgt_select mapping.
# NOTE(review): the `global tgt_select` declaration, an `if arg:` guard,
# and the `for entry in list:` loop line are missing from this extract;
# also note `list` shadows the builtin.
def init_select(arg):
    # arg = "service=nodeA,service2=nodeB"
    list = string.split(arg, ',')
    srv, node = string.split(entry, '=')
    tgt_select[srv] = node
def get_select(srv):
    """Return the node chosen for service *srv* via --select, or None
    when no selection was made."""
    return tgt_select.get(srv)
# Shorthands for the option-kind constants used in the table below.
PARAM = Lustre.Options.PARAM
INTPARAM = Lustre.Options.INTPARAM
# NOTE(review): the `lconf_options = [` opener, the closing bracket, and
# several option lines/defaults are missing from this extract.  Each entry
# is (name[,short], help[, kind[, default]]).  Help strings are runtime
# output and are reproduced here byte-for-byte (typos included).
('verbose,v', "Print system commands as they are run"),
('ldapurl',"LDAP server URL, eg. ldap://localhost", PARAM),
('config', "Cluster config name used for LDAP query", PARAM),
('select', "service=nodeA,service2=nodeB ", PARAM),
('node', "Load config for <nodename>", PARAM),
('cleanup,d', "Cleans up config. (Shutdown)"),
('force,f', "Forced unmounting and/or obd detach during cleanup",
 Lustre.Options.FLAG, 0),
('mds_ost_conn', "Open connections to OSTs on the MDS"),
('failover',"""Used to shut down without saving state.
               This will allow this node to "give up" a service to a
               another node for failover purposes. This will not
               be a clean shutdown.""",
 Lustre.Options.FLAG, 0),
('gdb', """Prints message after creating gdb module script
           and sleeps for 5 seconds."""),
('noexec,n', """Prints the commands and steps that will be run for a
                config without executing them. This can used to check if a
                config file is doing what it should be doing"""),
('nomod', "Skip load/unload module step."),
('nosetup', "Skip device setup/cleanup step."),
('reformat', "Reformat all devices (without question)"),
('dump', "Dump the kernel debug log to file before portals is unloaded",
('minlevel', "Minimum level of services to configure/cleanup",
('maxlevel', """Maximum level of services to configure/cleanup
                Levels are aproximatly like:
                70 - mountpoint, echo_client, osc, mdc, lov""",
('lustre', """Base directory of lustre sources. This parameter will
              cause lconf to load modules from a source tree.""", PARAM),
('portals', """Portals source directory.  If this is a relative path,
               then it is assumed to be relative to lustre. """, PARAM),
('timeout', "Set recovery timeout", PARAM),
('upcall', "Set both portals and lustre upcall script", PARAM),
('lustre_upcall', "Set lustre upcall script", PARAM),
('portals_upcall', "Set portals upcall script", PARAM),
('lctl_dump', "Save lctl ioctls to the dumpfile argument", PARAM),
('ptldebug', "Set the portals debug level", PARAM),
('subsystem', "Set the portals debug subsystem", PARAM),
('gdb_script', "Fullname of gdb debug script", PARAM, default_gdb_script()),
('debug_path', "Path to save debug dumps", PARAM, default_debug_path()),
# Client recovery options
('recover', "Recover a device"),
('group', "The group of devices to configure or cleanup", PARAM),
('tgt_uuid', "The failed target (required for recovery)", PARAM),
('client_uuid', "The failed client (required for recovery)", PARAM),
('conn_uuid', "The failed connection (required for recovery)", PARAM),
# --- main-program flow.  NOTE(review): heavily gapped in this extract —
# the enclosing `def main():` line (if any), several try/except/else
# keywords, the node_list initialization, and the call protected by the
# __main__ guard are missing; indentation is reconstructed.  Verify
# against the full source before editing. ---
# in the upcall this is set to SIG_IGN
signal.signal(signal.SIGCHLD, signal.SIG_DFL)
# parse command line against the option table above
cl = Lustre.Options("lconf", "config.xml", lconf_options)
config, args = cl.parse(sys.argv[1:])
except Lustre.OptionError, e:
setupModulePath(sys.argv[0])
host = socket.gethostname()
# the PRNG is normally seeded with time(), which is not so good for starting
# time-synchronized clusters
input = open('/dev/urandom', 'r')
print 'Unable to open /dev/urandom!'
seed = input.read(32)
init_select(config.select)
# --- XML config-source branch ---
if not os.access(args[0], os.R_OK):
    print 'File not found or readable:', args[0]
dom = xml.dom.minidom.parse(args[0])
panic("%s does not appear to be a config file." % (args[0]))
sys.exit(1) # make sure to die here, even in debug mode.
db = Lustre.LustreDB_XML(dom.documentElement, dom.documentElement)
if not config.config:
    config.config = os.path.basename(args[0])# use full path?
    if config.config[-4:] == '.xml':
        config.config = config.config[:-4]
# --- LDAP config-source branch ---
elif config.ldapurl:
    if not config.config:
        panic("--ldapurl requires --config name")
    dn = "config=%s,fs=lustre" % (config.config)
    db = Lustre.LustreDB_LDAP('', {}, base=dn, url = config.ldapurl)
# refuse to run against a config written by a different lconf version
ver = db.get_version()
panic("No version found in config data, please recreate.")
if ver != Lustre.CONFIG_VERSION:
    panic("Config version", ver, "does not match lconf version",
          Lustre.CONFIG_VERSION)
# candidate node names to configure: --node, else hostname + localhost
node_list.append(config.node)
node_list.append(host)
node_list.append('localhost')
debug("configuring for host: ", node_list)
# per-host suffix keeps dumps from clobbering each other
config.debug_path = config.debug_path + '-' + host
config.gdb_script = config.gdb_script + '-' + host
lctl = LCTLInterface('lctl')
if config.lctl_dump:
    lctl.use_save_file(config.lctl_dump)
doHost(db, node_list)
if __name__ == "__main__":
except Lustre.LconfError, e:
except CommandError, e:
# during --force cleanup, errors are deferred; report the first one now
if first_cleanup_error:
    sys.exit(first_cleanup_error)