loris_crontab \
loris_test \
lpcc \
- lpcc_cleanup \
- lpcc_test \
pyltest_import_check
EXTRA_DIST= \
pylipe/.pylintrc \
pylipe/*.py \
pyloris/*.py \
- pylpcc/*.py \
pylustre/*.py \
pyltest/*.py \
scripts/*.sh \
.pylintrc
PYLTEST_FILES = $(wildcard pyltest/*.py)
-PYTHON_LIB_FILES = $(wildcard pyclownfish/*.py pylustre/*.py pyloris/*.py pylhsm/*.py pylpcc/*.py)
+PYTHON_LIB_FILES = $(wildcard pyclownfish/*.py pylustre/*.py pyloris/*.py pylhsm/*.py)
PYTHON_LIB_FILES += $(PYLTEST_FILES)
PYTHON_FILES = $(PYTHON_LIB_FILES) $(PYTHON_COMMANDS)
PYTHON_CHECKS = $(PYTHON_FILES:%=%.python_checked)
python2 -m py_compile pylhsm/*.py
python2 -m py_compile pylipe/*.py
python2 -m py_compile pyloris/*.py
-python2 -m py_compile pylpcc/*.py
python2 -m py_compile pyltest/*.py
-find pyclownfish pylustre pylhsm pylipe pyloris pylpcc pyltest -maxdepth 1 -type f -a -name "*.python_checked" -o -name "*.py" | xargs rm -f
+find pyclownfish pylustre pylhsm pylipe pyloris pyltest -maxdepth 1 -type f -a -name "*.python_checked" -o -name "*.py" | xargs rm -f
%install
rm -rf $RPM_BUILD_ROOT
mkdir -p $RPM_BUILD_ROOT%{python2_sitelib}
mkdir -p $RPM_BUILD_ROOT%{_mandir}/man1
mkdir -p $RPM_BUILD_ROOT%{_mandir}/man5
+mkdir -p $RPM_BUILD_ROOT%{_mandir}/man8
mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/yum.repos.d
cp \
ldsync \
loris_crontab \
loris_test \
lpcc \
- lpcc_cleanup \
- lpcc_test \
+ src/lpcc_purge \
src/ext4_inode2path \
src/lcreatemany \
src/ldumpstripe \
cp -a pylhsm $RPM_BUILD_ROOT%{python2_sitelib}
cp -a pylipe $RPM_BUILD_ROOT%{python2_sitelib}
cp -a pyloris $RPM_BUILD_ROOT%{python2_sitelib}
-cp -a pylpcc $RPM_BUILD_ROOT%{python2_sitelib}
cp -a pylustre $RPM_BUILD_ROOT%{python2_sitelib}
cp -a pyltest $RPM_BUILD_ROOT%{python2_sitelib}
mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}
example_configs/clownfish/seperate_mgs/lipe_virt.conf \
lpcc.conf \
$RPM_BUILD_ROOT%{_sysconfdir}
+
%if %{with laudit}
mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/laudit
cp -a laudit.conf.example $RPM_BUILD_ROOT%{_sysconfdir}/laudit
install -m 0644 man/lipe_scan.1 $RPM_BUILD_ROOT%{_mandir}/man1/
install -m 0644 man/lipe_find.1 $RPM_BUILD_ROOT%{_mandir}/man1/
install -m 0644 man/lfill.1 $RPM_BUILD_ROOT%{_mandir}/man1/
+install -m 0644 man/lpcc.8 $RPM_BUILD_ROOT%{_mandir}/man8/
+install -m 0644 man/lpcc-start.8 $RPM_BUILD_ROOT%{_mandir}/man8/
+install -m 0644 man/lpcc-stop.8 $RPM_BUILD_ROOT%{_mandir}/man8/
+install -m 0644 man/lpcc-status.8 $RPM_BUILD_ROOT%{_mandir}/man8/
+install -m 0644 man/lpcc.conf.5 $RPM_BUILD_ROOT%{_mandir}/man5/
%if %{with laudit}
install -m 0644 man/laudit.1 $RPM_BUILD_ROOT%{_mandir}/man1/
install -m 0644 man/laudit-report.1 $RPM_BUILD_ROOT%{_mandir}/man1/
%files lpcc
%defattr(-,root,root)
-%{python2_sitelib}/pylpcc
%{_bindir}/lpcc
-%{_bindir}/lpcc_cleanup
-%{_bindir}/lpcc_test
+%{_bindir}/lpcc_purge
%config(noreplace) %{_sysconfdir}/lpcc.conf
%if %{with systemd}
%{_unitdir}/lpcc.service
%else
%{_sysconfdir}/rc.d/init.d/lpcc
%endif
+%{_mandir}/man8/lpcc.8*
+%{_mandir}/man8/lpcc-start.8*
+%{_mandir}/man8/lpcc-stop.8*
+%{_mandir}/man8/lpcc-status.8*
+%{_mandir}/man5/lpcc.conf.5*
+
+
%files hsm
%defattr(-,root,root)
-#!/usr/bin/python2 -u
-# Copyright (c) 2017 DataDirect Networks, Inc.
+#!/usr/bin/env python3
+# Copyright (c) 2021 DataDirect Networks, Inc.
# All Rights Reserved.
-# Author: lixi@ddn.com
+# Author: flei@ddn.com
+
"""
-LPCC(Lustre Persistent Client Cache)
+Manage all PCC devices and services
"""
-from pylpcc import lpcc
+
+import argparse
+import errno
+import json
+import os
+import select
+import signal
+import socket
+import subprocess
+import sys
+import time
+import yaml
+
+
+LISTEN_SOCK_FN = "/var/run/lpcc.sock"
+LISTEN_SOCK = None
+
+def eprint(*args, **kwargs):
+ """print something to stderr"""
+ print(*args, file=sys.stderr, **kwargs)
+
+class LpccService:
+ """
+ Class to manage single instance of lpcc
+ """
+ copytool_prog = 'lhsmtool_posix'
+ lpcc_purge_prog = 'lpcc_purge'
+
+ lpcc_config = None
+
+ lpcc_mount = None
+ lpcc_cache = None
+ lpcc_roid = None
+ lpcc_autocache = None
+
+ lpcc_purge_high_usage = 90
+ lpcc_purge_low_usage = 75
+ lpcc_purge_interval = 5
+ lpcc_purge_scan_threads = 1
+
+ def __init__(self, lpcc_config):
+ self.lpcc_config = lpcc_config
+ self.lpcc_mount = lpcc_config['mount']
+ self.lpcc_cache = lpcc_config['cache']
+ self.lpcc_autocache = lpcc_config['autocache']
+ self.lpcc_roid = lpcc_config['roid']
+
+ lpcc_purge_obj = lpcc_config.get('purge')
+ if lpcc_purge_obj is not None:
+ self.lpcc_purge_high_usage = lpcc_purge_obj.get('high_usage', 90)
+ self.lpcc_purge_low_usage = lpcc_purge_obj.get('low_usage', 75)
+ self.lpcc_purge_interval = lpcc_purge_obj.get('interval', 30)
+ self.lpcc_purge_scan_threads = lpcc_purge_obj.get('scan_threads', 1)
+
+ @staticmethod
+ def _check_process_by_pidfile(procname, pidfile):
+ """
+ Check existence of process with the pid in pidfile.
+ If pidfile does not exist or is not valid, return False.
+ """
+ cmdline = ['pkill', '--signal', '0', '--pidfile', pidfile, '--', procname]
+ eprint(cmdline)
+ cproc = subprocess.run(cmdline, check=False, stderr=subprocess.DEVNULL)
+ if cproc.returncode != 0:
+ return False
+
+ return True
+
+ @staticmethod
+ def _wait_process_by_pidfile(procname, pidfile, secs=5):
+ """
+ Wait for at most secs seconds for the existence of pid in pidfile
+ """
+ for i in range(secs):
+ if LpccService._check_process_by_pidfile(procname, pidfile):
+ return True
+ else:
+ time.sleep(1)
+
+ return False
+
+ @staticmethod
+ def _kill_process_by_pidfile(procname, pidfile):
+ """
+ Kill a process with given pid in pidfile
+ """
+ cmdline = ['pkill', '--pidfile', pidfile, '--', procname]
+ eprint(cmdline)
+ cproc = subprocess.run(cmdline, check=False)
+ return cproc.returncode
+
+ def _add_pcc(self):
+ eprint("Adding PCC...")
+
+ param = '%s roid=%d ropcc=1' % (self.lpcc_autocache, self.lpcc_roid)
+ cmdline = ['lctl', 'pcc', 'add', self.lpcc_mount, self.lpcc_cache, \
+ '--param', param]
+ eprint(cmdline)
+ cproc = subprocess.run(cmdline, check=False)
+ return cproc.returncode
+
+ def _del_pcc(self):
+ eprint("Deleting PCC...")
+ cmdline = ['lctl', 'pcc', 'del', self.lpcc_mount, self.lpcc_cache]
+ eprint(cmdline)
+ cproc = subprocess.run(cmdline, check=False)
+ return cproc.returncode
+
+ def _start_lpcc_purge(self):
+ eprint("Starting lpcc_purge...")
+
+ pidfile = '/var/run/lpcc_purge-%d.pid' % self.lpcc_roid
+ cmdline = [self.lpcc_purge_prog, \
+ '--mount', self.lpcc_mount, \
+ '--cache', self.lpcc_cache, \
+ '--roid', str(self.lpcc_roid), \
+ '--high-usage', str(self.lpcc_purge_high_usage), \
+ '--low-usage', str(self.lpcc_purge_low_usage), \
+ '--interval', str(self.lpcc_purge_interval), \
+ '--scan-threads', str(self.lpcc_purge_scan_threads), \
+ '--pidfile', pidfile]
+
+ eprint(cmdline)
+ subprocess.Popen(cmdline)
+
+ succ = LpccService._wait_process_by_pidfile(self.lpcc_purge_prog, pidfile)
+ if not succ:
+ eprint("lpcc_purge did not start successfully!")
+ return 1
+
+ return 0
+
+ def _stop_lpcc_purge(self):
+ eprint("Stopping lpcc_purge...")
+ pidfile = '/var/run/lpcc_purge-%d.pid' % self.lpcc_roid
+ self._kill_process_by_pidfile(self.lpcc_purge_prog, pidfile)
+
+ def _dump_config(self):
+ eprint("========== Config ==========")
+ yaml.safe_dump(self.lpcc_config, sys.stdout, default_flow_style=False)
+ eprint("============================")
+
+ def start(self):
+ """
+ Start a PCC device and related services
+ """
+ eprint("Start PCC...")
+ self._dump_config()
+
+ retcode = self._add_pcc()
+ if retcode != 0:
+ return retcode
+
+ retcode = self._start_lpcc_purge()
+ if retcode != 0:
+ self._del_pcc()
+ return retcode
+
+ eprint("Done")
+ eprint()
+ return 0
+
+ def stop(self):
+ """
+ Start a PCC device and related services
+ """
+ eprint("Stop PCC...")
+ self._dump_config()
+
+ self._stop_lpcc_purge()
+ self._del_pcc()
+
+ eprint("Done")
+ eprint()
+ return 0
+
+ def status(self):
+ """
+ Get the status of PCC and service
+ """
+ result = {}
+ result['mount'] = self.lpcc_mount
+ result['cache'] = self.lpcc_cache
+
+ cmdline = ['lctl', 'pcc', 'list', self.lpcc_mount]
+ try:
+ output = subprocess.check_output(cmdline)
+ except subprocess.CalledProcessError as err:
+ result['status'] = "error"
+ result['error_msg'] = os.strerror(err.returncode)
+ return result
+
+ result['status'] = "stopped"
+ pcclist = yaml.load(output)
+ if pcclist is not None and 'pcc' in pcclist:
+ for pcc in pcclist['pcc']:
+ if pcc['pccpath'] == self.lpcc_cache:
+ result['status'] = "running"
+ result['roid'] = pcc['roid']
+ result['autocache'] = pcc['autocache']
+ break
+
+ if result['status'] != "running":
+ return result
+
+ # Now check lpcc_purge process
+ pidfile = '/var/run/lpcc_purge-%d.pid' % self.lpcc_roid
+ succ = LpccService._check_process_by_pidfile(self.lpcc_purge_prog, pidfile)
+ if succ:
+ result['purge'] = "running"
+ else:
+ result['purge'] = "stopped"
+ result['error_msg'] = "lpcc_purge is not running!"
+
+ return result
+
+ def is_running(self):
+ """
+ Check the status of PCC, return True if PCC is started, or False
+ """
+ pcc_status = self.status()
+ if pcc_status.get('status') == "running":
+ return True
+
+ return False
+
+ def is_stopped(self):
+ """
+ Check the status of PCC, return True if PCC is started, or False
+ """
+ pcc_status = self.status()
+ if pcc_status.get('status') == "stopped":
+ return True
+
+ return False
+
+
+class LpccMonitor:
+ """
+ Class to monitor mounted fs and start pcc if it's configurated
+ """
+
+ config_obj = None
+
+ def __init__(self, config_file):
+ try:
+ with open(config_file, "r") as file_handle:
+ self.config_obj = yaml.safe_load(file_handle)
+ # if config_obj is None, it means the config file is empty but still valid
+ if self.config_obj is None:
+ eprint("Config file '%s' is empty, the service won't do any real work!" % \
+ config_file)
+ self.config_obj = []
+ except FileNotFoundError:
+ # if config file does not exist, it's the same as an empty config file
+ eprint("Config file '%s' does not exist, the service won't do any real work!" % \
+ config_file)
+ self.config_obj = []
+ else:
+ if not self._check_config():
+ # None means invalid config file or information
+ self.config_obj = None
+
+ def _check_config(self):
+ if not isinstance(self.config_obj, list):
+ eprint("Config information is not valid!")
+ return False
+ return True
+
+ def _scan_start_pcc(self):
+ for lpcc_config in self.config_obj:
+ lpcc_service = LpccService(lpcc_config)
+
+ if bool(lpcc_config.get('disabled')):
+ continue
+ if not os.path.ismount(lpcc_config['mount']):
+ continue
+ if lpcc_service.is_stopped():
+ lpcc_service.start()
+
+ return 0
+
+ def _start_pcc(self, request):
+ count = 0
+ response = {}
+ mount = request.get('mount')
+ cache = request.get('cache')
+
+ for lpcc_config in self.config_obj:
+ if mount is not None and mount != lpcc_config['mount']:
+ continue
+ if cache is not None and cache != lpcc_config['cache']:
+ continue
+
+ count = count + 1
+ lpcc_service = LpccService(lpcc_config)
+ if 'disabled' in lpcc_config:
+ del lpcc_config['disabled']
+ if lpcc_service.is_stopped():
+ lpcc_service.start()
+
+ if count == 0 and mount is not None:
+ response['retcode'] = errno.ENOENT
+ response['error_msg'] = "No matched configuration for mount='%s' cache='%s'" \
+ % (mount, cache)
+ else:
+ response['retcode'] = 0
+
+ response['count'] = count
+ return response
+
+ def _stop_pcc(self, request):
+ count = 0
+ response = {}
+ mount = request.get('mount')
+ cache = request.get('cache')
+
+ for lpcc_config in self.config_obj:
+ if mount is not None and mount != lpcc_config['mount']:
+ continue
+ if cache is not None and cache != lpcc_config['cache']:
+ continue
+
+ count = count + 1
+ lpcc_service = LpccService(lpcc_config)
+ if not lpcc_service.is_stopped():
+ lpcc_service.stop()
+ if not bool(request.get('keep-enabled')):
+ lpcc_config['disabled'] = True
+
+ if count == 0 and mount is not None:
+ response['retcode'] = errno.ENOENT
+ response['error_msg'] = "No matched configuration for mount='%s' cache='%s'" \
+ % (mount, cache)
+ else:
+ response['retcode'] = 0
+
+ response['count'] = count
+ return response
+
+ def _stop_all_pcc(self):
+ request = {}
+ request['action'] = 'stop-all'
+ return self._stop_pcc(request)
+
+ def _status_pcc(self, request):
+ response = {}
+ mount = request.get('mount')
+ cache = request.get('cache')
+
+ status_list = []
+ for lpcc_config in self.config_obj:
+ if mount is not None and mount != lpcc_config['mount']:
+ continue
+ if cache is not None and cache != lpcc_config['cache']:
+ continue
+
+ lpcc_service = LpccService(lpcc_config)
+ lpcc_status = lpcc_service.status()
+ if bool(lpcc_config.get('disabled')):
+ lpcc_status['disabled'] = True
+ status_list.append(lpcc_status)
+
+ response['retcode'] = 0
+ response['status_list'] = status_list
+ return response
+
+ def _process_cmd(self, request):
+ response = {}
+
+ if request['action'] == "start" or request['action'] == "start-all":
+ response = self._start_pcc(request)
+ elif request['action'] == "stop" or request['action'] == "stop-all":
+ response = self._stop_pcc(request)
+ elif request['action'] == 'status' or request['action'] == 'status-all':
+ response = self._status_pcc(request)
+ else:
+ response['retcode'] = -1
+
+ response['request'] = request
+ return response
+
+ def _serve_cmd(self):
+ try:
+ conn, _ = LISTEN_SOCK.accept()
+ request_str = conn.makefile().readline()
+ request = json.loads(request_str)
+ except Exception as ex:
+ eprint(ex)
+
+ eprint("Request:", request)
+ response = self._process_cmd(request)
+ eprint("Response:", response)
+
+ try:
+ conn.send(bytes(json.dumps(response), encoding='utf-8'))
+ conn.close()
+ except Exception as ex:
+ eprint(ex)
+
+ def run(self):
+ """
+ Start monitor daemon, scan and start PCC in config file,
+ monitor /proc/self/mounts and listen on command socket
+ """
+ mounts_fh = open("/proc/self/mounts", "r")
+
+ if os.path.exists(LISTEN_SOCK_FN):
+ os.unlink(LISTEN_SOCK_FN)
+
+ global LISTEN_SOCK
+ LISTEN_SOCK = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+ LISTEN_SOCK.bind(LISTEN_SOCK_FN)
+ LISTEN_SOCK.listen(1)
+
+ self._scan_start_pcc()
+
+ while True:
+ try:
+ rset, _, eset = select.select([LISTEN_SOCK], [], [mounts_fh])
+ except OSError:
+ break
+ except ValueError:
+ break
+
+ if LISTEN_SOCK in rset:
+ self._serve_cmd()
+ if mounts_fh in eset:
+ self._scan_start_pcc()
+
+ eprint("Do cleaning...")
+ self._stop_all_pcc()
+ mounts_fh.close()
+ LISTEN_SOCK.close()
+ os.unlink(LISTEN_SOCK_FN)
+
+ return 0
+
+
+class LpccCli:
+ """
+ Class to get command from cli, communicate with monitor,
+ and show result
+ """
+
+ def __init__(self):
+ pass
+
+ def run_cmd(self, cmd):
+ """
+ Communicate with server and run a sub command
+ """
+ try:
+ sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+ sock.connect(LISTEN_SOCK_FN)
+ except FileNotFoundError:
+ eprint("Socket file '%s' does not exist, " % LISTEN_SOCK_FN + \
+ "please check whether the monitor service started!")
+ sys.exit(1)
+
+ sock.sendall(bytes(json.dumps(cmd), encoding='utf-8'))
+ sock.shutdown(socket.SHUT_WR)
+ response = sock.makefile().readline()
+ sock.close()
+
+ return json.loads(response)
+
+
+def sigint_handler(signum, frame):
+ """
+ SIGINT handler
+ """
+ #pylint: disable=unused-argument
+ # close the listen socket to notify the monitor service to exit
+ eprint("Received signal %s" % signal.Signals(signum).name)
+ LISTEN_SOCK.close()
+
+def main():
+ """
+ main function
+ """
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--config-file', default='/etc/lpcc.conf',
+ help='specify the config file')
+ subparsers = parser.add_subparsers(dest='action')
+ subparsers.add_parser('monitor', help='start the monitor process')
+
+ start_parser = subparsers.add_parser('start', help=\
+ 'start one LPCC of specfied lustre file system and cache dir,' +\
+ 'or all LPCCs based on specified lustre file system')
+ start_parser.add_argument('mount', nargs=1, help=\
+ 'the mount point of lustre file system')
+ start_parser.add_argument('cache', nargs='?', help=\
+ 'the cache dir of LPCC')
+
+ stop_parser = subparsers.add_parser('stop', help=\
+ 'stop one LPCC of specfied lustre file system and cache dir, ' +\
+ 'or all LPCCs based on specified lustre file system')
+ stop_parser.add_argument('mount', nargs=1, help=\
+ 'the mount point of lustre file system')
+ stop_parser.add_argument('cache', nargs='?', help=\
+ 'the cache dir of LPCC')
+ stop_parser.add_argument('--keep-enabled', action='store_true', help=\
+ 'keep the LPCC enabled, which means when the lustre file system is ' +\
+ 'mounted again, start all LPCCs based on it')
+
+ status_parser = subparsers.add_parser('status', help=\
+ 'get the status of one LPCC of specfied lustre file system and ' +\
+ 'cache dir, or all LPCCs based on specified lustre file system')
+ status_parser.add_argument('mount', nargs=1, help=\
+ 'the mount point of lustre file system')
+ status_parser.add_argument('cache', nargs='?', help=\
+ 'the cache dir of LPCC')
+
+ subparsers.add_parser('start-all', help='start all LPCCs')
+ subparsers.add_parser('stop-all', help='stop all LPCCs')
+ subparsers.add_parser('status-all', help='get the status of all LPCCs')
+
+ args = parser.parse_args()
+
+ if args.action == 'monitor':
+ signal.signal(signal.SIGINT, sigint_handler)
+ signal.signal(signal.SIGTERM, sigint_handler)
+
+ monitor = LpccMonitor(args.config_file)
+ if monitor.config_obj is None:
+ return 1
+
+ try:
+ retcode = monitor.run()
+ finally:
+ if os.path.exists(LISTEN_SOCK_FN):
+ os.unlink(LISTEN_SOCK_FN)
+ return retcode
+
+ if args.action == 'start' or args.action == 'stop':
+ request = {}
+ request['action'] = args.action
+ request['mount'] = args.mount[0]
+ request['cache'] = args.cache
+ if getattr(args, 'keep_enabled', False):
+ request['keep-enabled'] = True
+
+ response = LpccCli().run_cmd(request)
+
+ print(response)
+ return response['retcode']
+
+ if args.action == 'start-all' or args.action == 'stop-all':
+ request = {}
+ request['action'] = args.action
+
+ response = LpccCli().run_cmd(request)
+
+ print(response)
+ return response['retcode']
+
+ if args.action == 'status':
+ request = {}
+ request['action'] = args.action
+ request['mount'] = args.mount[0]
+ request['cache'] = args.cache
+
+ response = LpccCli().run_cmd(request)
+
+ print(json.dumps(response['status_list'], indent=4))
+ return response['retcode']
+
+ if args.action == 'status-all':
+ request = {}
+ request['action'] = args.action
+ response = LpccCli().run_cmd(request)
+
+ print(json.dumps(response['status_list'], indent=4))
+ return response['retcode']
+
+ eprint("Type 'lpcc -h' for more information.")
+ return 1
+
if __name__ == "__main__":
- lpcc.main()
+ ret = main()
+ sys.exit(ret)
-# Configuration file of Lustre Persistent Client Cache Management
-#
-# Configuration Guide:
-#
-# $fsname:
-# File system name of Lustre
-#
-# $ssh_hosts:
-# $ssh_hosts includes the informations of logining to the server hosts using
-# SSH connections. $host_id is the unique ID of the host. Two hosts shouldn't
-# share a same $host_id. $hostname is the host name to use when connecting to
-# the host using SSH. $host_id and $hostname could be different, because there
-# could multiple ways to connect to the same host. $ssh_identity_file is the
-# SSH key file used when connecting to the host. $ssh_identity_file could be
-# omitted if the default SSH identity file works.
-#
-# $mds_hosts:
-# $mds_hosts includes all the hosts that could be running MDT of this file
-# system. Multiple hosts can be configured to support failover.
-# "lctl set_param" commands will be run on the MDT to configure the system
-# properly for HSM.
-#
-# $lpcc_readwrite_datasets:
-# $lpcc_readwrite_datasets includes all the clients that needs to enable
-# readwrite LPCC. $host_id is the host with this client. $archive_id is the
-# HSM archive ID reserved for this client. $lpcc_root is the path of the LPCC
-# root directory, usually a mounted local file system on SSD.
-# $lustre_mount_point is the Lustre client mount point. $client_id is a unique
-# ID of the LPCC client. Two LPCC clients shouldn't share a same $client_id.
-#
-fsname: 969362ae # File system name of Lustre
-ssh_hosts: # Array of hosts
- - host_id: server17-el7-vm1 # ID of this SSH host
- hostname: server17-el7-vm1 # The host name
- ssh_identity_file: /root/.ssh/id_dsa # The SSH key to connect to the host
- - host_id: server17-el7-vm2
- hostname: server17-el7-vm2
- ssh_identity_file: /root/.ssh/id_dsa
- - host_id: server17-el7-vm3
- hostname: server17-el7-vm3
- ssh_identity_file: /root/.ssh/id_dsa
-mds_hosts: # Array of hosts that could have MDTs
- - host_id: server17-el7-vm1 # ID of the host running MDS
-lustre_clients:
- - host_id: server17-el7-vm2 # ID of the host running client
- lustre_mount_point: /mnt/lustre # Lustre mount point
- client_id: server17-el7-vm2 # ID of this Lustre client
- - host_id: server17-el7-vm3
- lustre_mount_point: /mnt/lustre
- client_id: server17-el7-vm3
-lpcc_readwrite_datasets: # Array of client with readwrite LPCC
- - client_id: server17-el7-vm2 # ID of the Lustre client
- archive_id: 1 # Archive number
- lpcc_root: /mnt/lpcc # LPCC root path
- project_id: 100 # Project ID of new files for automatic caching
- dataset_id: server17-el7-vm2_lpcc_rw # ID of this LPCC dataset
- - client_id: server17-el7-vm3
- archive_id: 2
- lpcc_root: /mnt/lpcc
- project_id: 101
- dataset_id: server17-el7-vm3_lpcc_rw
-lpcc_readonly_dataset_groups: # Array of client with readonly LPCC
- - group_id: 3 # Group ID
- lpcc_root: /mnt/lpcc_ro_g3 # LPCC root path
- project_id: 10 # Project ID of new files for automatic caching
- lustre_clients: # Array of Lustre clients to enable this dataset
- - client_id: server17-el7-vm2 # ID of Lustre client
- - client_id: server17-el7-vm3
- - group_id: 4
- lpcc_root: /mnt/lpcc_ro_g4
- lustre_mount_point: /mnt/lustre
- project_id: 11
- lustre_clients:
- - client_id: server17-el7-vm2
- - client_id: server17-el7-vm3
+#- mount: /mnt/lfs
+# cache: /mnt/pcc
+# roid: 2
+# autocache: projid={500 1000}&fname={*.h5},uid={1001}
+# purge:
+# high_usage: 90
+# low_usage: 75
+# scan_threads: 1
+# interval: 30
+#
+#- mount: /mnt/lfs
+# cache: /mnt/pcc2
+# roid: 3
+# autocache: projid={500}
+++ /dev/null
-#!/usr/bin/python2 -u
-# Copyright (c) 2017 DataDirect Networks, Inc.
-# All Rights Reserved.
-# Author: lixi@ddn.com
-"""
-LPCC(Lustre Persistent Client Cache) Cleanup
-"""
-from pylpcc import lpcc_cleanup
-
-if __name__ == "__main__":
- lpcc_cleanup.main()
+++ /dev/null
-#!/usr/bin/python2 -u
-# Copyright (c) 2017 DataDirect Networks, Inc.
-# All Rights Reserved.
-# Author: lixi@ddn.com
-"""
-Tests for LPCC(Lustre Persistent Cache Management)
-"""
-from pylpcc import lpcc_test
-
-if __name__ == "__main__":
- lpcc_test.main()
--- /dev/null
+.\" -*- nroff -*-
+.\" Copyright (c) 2021, DDN and/or its affiliates. All rights reserved.
+.\" This file may be copied under the terms of the GNU Public License, v2.
+.\"
+.TH lpcc-start 8 "2021 Jul 7" Lustre "configuration utilities"
+
+.SH NAME
+lpcc-start - lpcc start sub command
+
+.SH SYNOPSIS
+.BI "lpcc start MOUNT_POINT [CACHE_DIR]"
+.PP
+.BI "lpcc start-all"
+.PP
+
+.SH DESCRIPTION
+Start a specific LPCC if both \fBMOUNT_POINT\fR and \fBCACHE_DIR\fR
+are specified.
+Start all LPCCs based on a specific Lustre file system if only
+\fBMOUNT_POINT\fR is specified.
+The \fBstart-all\fR sub command starts all LPCCs in the config file.
+.PP
+.SH "SEE ALSO"
+.BR lpcc(8)
+.BR lpcc-stop(8)
+.BR lpcc-status(8)
--- /dev/null
+.\" -*- nroff -*-
+.\" Copyright (c) 2021, DDN and/or its affiliates. All rights reserved.
+.\" This file may be copied under the terms of the GNU Public License, v2.
+.\"
+.TH lpcc-status 8 "2021 Jul 7" Lustre "configuration utilities"
+
+.SH NAME
+lpcc-status - lpcc status sub command
+
+.SH SYNOPSIS
+.BI "lpcc status MOUNT_POINT [CACHE_DIR]"
+.PP
+.BI "lpcc status-all"
+.PP
+
+.SH DESCRIPTION
+Get the status of a specific LPCC if both \fBMOUNT_POINT\fR and \fBCACHE_DIR\fR
+are specified.
+Get the status of all LPCCs based on a specific Lustre file system if only
+\fBMOUNT_POINT\fR is specified.
+The \fBstatus-all\fR sub command gets the status of all LPCCs in the config file.
+.PP
+.SH "SEE ALSO"
+.BR lpcc(8)
+.BR lpcc-start(8)
+.BR lpcc-stop(8)
--- /dev/null
+.\" -*- nroff -*-
+.\" Copyright (c) 2021, DDN and/or its affiliates. All rights reserved.
+.\" This file may be copied under the terms of the GNU Public License, v2.
+.\"
+.TH lpcc-stop 8 "2021 Jul 7" Lustre "configuration utilities"
+
+.SH NAME
+lpcc-stop - lpcc stop sub command
+
+.SH SYNOPSIS
+.BI "lpcc stop MOUNT_POINT [CACHE_DIR] [OPTIONS]"
+.PP
+.BI "lpcc stop-all"
+.PP
+
+.SH DESCRIPTION
+Stop a specific LPCC if both \fBMOUNT_POINT\fR and \fBCACHE_DIR\fR
+are specified.
+Stop all LPCCs based on a specific Lustre file system if only
+\fBMOUNT_POINT\fR is specified.
+The \fBstop-all\fR sub command stops all LPCCs in the config file.
+.PP
+.SH OPTIONS
+.TP
+.BR --keep-enabled
+used by
+.BR umount.lustre (8)
+to notify monitor daemon that the LPCC should be started again if the lustre
+file system is mounted again.
+
+.SH "SEE ALSO"
+.BR lpcc(8)
+.BR lpcc-start(8)
+.BR lpcc-status(8)
--- /dev/null
+.\" -*- nroff -*-
+.\" Copyright (c) 2021, DDN and/or its affiliates. All rights reserved.
+.\" This file may be copied under the terms of the GNU Public License, v2.
+.\"
+.TH lpcc 8 "2021 Jul 7" Lustre "configuration utilities"
+
+.SH NAME
+lpcc - Management tool for Lustre Persistent Client Cache (LPCC)
+
+.SH SYNOPSIS
+.BI "lpcc -h|--help"
+.PP
+.BI "lpcc SUBCMD ARGS"
+.PP
+
+.SH DESCRIPTION
+To start/stop Lustre Persistent Client Cache (LPCC), there is a series of
+commands to be run correctly with consistent parameters. If there are multiple
+LPCCs on a client, it is even more complex.
+.PP
+.TP
+The \fBlpcc\fR tool helps to:
+.br
+\(bu configure all the LPCCs in a single file
+.br
+\(bu start/stop LPCCs automatically when system boots up/shutdown
+.br
+\(bu monitor the mounting/umounting of lustre file system and start/stop
+LPCCs based on the file system
+.br
+\(bu start/stop specific LPCC manually
+
+.PP
+To use \fBlpcc\fR tool, first prepare a configuration file.
+The file is \fB/etc/lpcc.conf\fR by default.
+.PP
+Then start the monitor daemon. Usually this work is done by a wrapper
+systemd service \fBlpcc.service\fR.
+It is highly recommended to enable the service so that it is started
+automatically when the system boots up.
+.PP
+When the monitor daemon starts, it checks all the LPCCs. If the base lustre
+file system of any LPCC has been mounted, the LPCC will be started
+automatically.
+.PP
+If a lustre file system is mounted later, the monitor daemon checks any LPCC
+based on that file system and starts it. If a lustre file system is unmounted,
+\fBumount.lustre (8)\fR checks any LPCC based on that file system, and stops
+it before doing the real umounting.
+.PP
+While the monitor daemon is running, user can manually start/stop one specific
+LPCC by \fImount_point\fR and \fIcache_dir\fR, or all LPCCs based on a
+specific \fImount_point\fR.
+.PP
+All these LPCCs will be stopped when the monitor daemon stops.
+.SH "SEE ALSO"
+.BR lpcc.conf(5)
+.BR lpcc-start(8)
+.BR lpcc-stop(8)
+.BR lpcc-status(8)
+.BR lctl-pcc(8)
--- /dev/null
+.\" -*- nroff -*-
+.\" Copyright (c) 2021, DDN and/or its affiliates. All rights reserved.
+.\" This file may be copied under the terms of the GNU Public License, v2.
+.\"
+.TH lpcc.conf 5 "2021 Jul 7" Lustre "File Formats Manual"
+
+.SH NAME
+lpcc.conf - configuration file for lpcc systemd service
+
+.SH DESCRIPTION
+The file \fB/etc/lpcc.conf\fR contains a list of Lustre Persistent Client Cache
+(LPCC). The whole file is an array in YAML. Each element of the array
+is the configuration of a LPCC.
+For each LPCC, the configuration is a dictionary with these items:
+.PP
+.TP
+.BR mount
+The mount point of lustre file system to be cached
+.TP
+.BR cache
+The dir for cached file
+.TP
+.BR roid
+The id of LPCC. It is a positive integer and must be unique on a single client.
+.TP
+.BR autocache
+The condition to cache file automatically.
+.TP
+.BR purge
+More configuration for lpcc_purge daemon. Since all the sub items under it have
+default value, this item is not necessary if it has no explicit sub item.
+.TP
+.BR purge.high_usage
+If the disk usage of cache device is higher than \fBpurge.high_usage\fR, start
+purging. It is 90 (means 90% disk/inode usage) by default.
+.TP
+.BR purge.low_usage
+If the disk usage of cache device is lower than \fBpurge.low_usage\fR, stop
+purging. It is 75 (means 75% disk/inode usage) by default.
+.TP
+.BR purge.interval
+The interval for lpcc_purge to check cache device usage, in seconds. It is 5
+seconds by default.
+.TP
+.BR purge.scan_threads
+How many threads are used to scan cache device in parallel. It is 1 thread by
+default.
+
+.SH AUTOCACHE CONDITION
+When a file in lustre file system is opened, the autocache condition will be
+checked against the file. If the condition is true, the file will be cached in
+the cache device automatically.
+.PP
+The rule is either a single compare expression, or several compare expressions
+connected with '&' or ','. Here '&' is logical operator AND, ',' is logical
+operator OR. '&' has a higher priority than ','.
+.PP
+Each compare expression has 3 parts: attribute, operator and target.
+Attribute is the attribute of file to be checked, e.g., projid, fname.
+Operator is '=', '<' or '>'. Target is either a single value or a value group
+(several single values separated by blankspace ' '), embraced by a pair of '{}'.
+.PP
+For example:
+.PP
+.TP
+.BR projid={500}
+projid is exactly 500.
+.TP
+.BR "projid={500 1000}"
+projid is either 500 or 1000.
+.TP
+.BR "projid={500 1000}&fname={*.h5},uid={1001}"
+Either case of:
+(a) projid is 500 or 1000 and fname matches *.h5;
+(b) process uid is exactly 1001.
+.PP
+These attributes are supported in compare expression:
+.PP
+.TP
+.BR projid
+The project id of file. It supports '=', '<' and '>' operators. '=' operator
+supports single value or value group. '<' and '>' support only single value.
+Each single value should be a number.
+.TP
+.BR fname
+The base name of file. It supports only '=' operator.
+Both single value and value group are supported.
+Each single value should be a precise file name string, or a pattern including
+wildcard '*'.
+.TP
+.BR uid
+The uid of the process to access the file. It supports '=', '<' and '>'
+operators. '=' operator supports single value or value group. '<' and '>'
+support only single value.
+Each single value should be a number.
+.TP
+.BR gid
+The gid of process to access the file. Similar to uid.
+.TP
+.BR size
+The size of file. It supports '=', '<' and '>' operators. '=' operator supports
+single value or value group. '<' and '>' support only single value.
+Each single value should be a number, or a number with unit.
+The unit could be: K, M, G, T, P, E.
+.TP
+.BR mtime
+The mtime of file. Actually it means the age, that is to say, the seconds of
+(current - mtime). It supports '=', '<' and '>' operators. '=' operator supports
+single value or value group. '<' and '>' support only single value.
+Each single value should be a number in seconds, or a number with unit.
+The unit could be: m(minute), h(hour), d(day), w(week), y(year). Here 1 year is
+exactly 52 weeks.
+For example, "mtime>{10m}" means the file was modified more than 10 minutes ago;
+"mtime<{1h30}" means the file was modified less than 1 hour and 30 seconds ago.
+
+.SH EXAMPLES
+.EX
+# sample /etc/lpcc.conf
+- mount: /mnt/lfs
+ cache: /mnt/pcc
+ roid: 2
+ autocache: projid={500 1000}&fname={*.h5},uid={1001}
+ purge:
+ high_usage: 85
+ low_usage: 70
+ scan_threads: 3
+ interval: 10
+- mount: /mnt/lfs2
+ cache: /mnt/pcc2
+ roid: 3
+ autocache: projid={500}
+.EE
+.SH "SEE ALSO"
+.BR lpcc(8)
+.BR lpcc-start(8)
+.BR lpcc-stop(8)
+.BR lpcc-status(8)
\ No newline at end of file
+++ /dev/null
-"""
-Python library for LPCC
-"""
-__all__ = ["lpcc",
- "lpcc_cleanup",
- "lpcc_test"]
+++ /dev/null
-# Copyright (c) 2017 DataDirect Networks, Inc.
-# All Rights Reserved.
-# Author: lixi@ddn.com
-
-"""
-Library for managing LPCC(Lustre Persistent Client Cache)
-"""
-
-import sys
-import traceback
-import os
-import shutil
-import signal
-import time
-import re
-import filelock
-import yaml
-
-# Local libs
-from pylustre import lustre
-from pylustre import time_util
-from pylustre import utils
-from pylustre import daemon
-from pylustre import hsm
-from pylustre import clog
-
-LPCC_CONFIG_FNAME = "lpcc.conf"
-LPCC_CONFIG = "/etc/" + LPCC_CONFIG_FNAME
-LPCC_LOG_DIR = "/var/log/lpcc"
-LPCC_STATE_PATTERN = (r"^type: (?P<pcc_type>\S+), "
- r"PCC file: (?P<pcc_file>\S+), "
- r"user number: (?P<user_number>\S+), "
- r"attr cached: (?P<attr_cached>\S+)$")
-LPCC_STATE_REGULAR = re.compile(LPCC_STATE_PATTERN)
-LPCC_TYPE_NONE = "none"
-LPCC_TYPE_READONLY = "readonly"
-LPCC_TYPE_READWRITE = "readwrite"
-
-STR_SSH_HOSTS = "ssh_hosts"
-STR_HOSTNAME = "hostname"
-STR_HOST_ID = "host_id"
-STR_SSH_IDENTITY_FILE = "ssh_identity_file"
-STR_MDS_HOSTS = "mds_hosts"
-STR_LUSTRE_CLIENTS = "lustre_clients"
-STR_LUSTRE_MOUNT_POINT = "lustre_mount_point"
-STR_CLIENT_ID = "client_id"
-STR_LPCC_READWRITE_DATASETS = "lpcc_readwrite_datasets"
-STR_DATASET_ID = "dataset_id"
-STR_LPCC_ROOT = "lpcc_root"
-STR_GROUP_ID = "group_id"
-STR_ROOT = "root"
-STR_ARCHIVE_ID = "archive_id"
-STR_PROJECT_ID = "project_id"
-STR_LPCC_READONLY_DATASET_GROUPS = "lpcc_readonly_dataset_groups"
-
-
-def usage():
- """
- Print usage string
- """
- utils.oprint("Usage: %s <config_file>" % sys.argv[0])
-
-
-class LPCCDataset(object):
- """
- Each SSH host has an object of this type
- """
- # pylint: disable=too-few-public-methods,too-many-arguments,too-many-instance-attributes
- def __init__(self, client_name, pcc_type, root, set_id, projid,
- lustre_client):
- self.lpccd_lustre_client = lustre_client
- self.lpccd_host = lustre_client.lc_host
- self.lpccd_id = set_id
- self.lpccd_projid = projid
- self.lpccd_root = root
- self.lpccd_client_name = client_name
- self.lpccd_pcc_type = pcc_type
- self.lpccd_lustre_mnt = lustre_client.lc_mnt
-
- def lpccd_stop(self, log):
- """
- Delete this dataset from LPCC
- """
- command = ("echo -n 'del %s' > /proc/fs/lustre/llite/%s/pcc" %
- (self.lpccd_root, self.lpccd_client_name))
- retval = self.lpccd_host.sh_run(log, command)
- if retval.cr_exit_status:
- log.cl_error("failed to run command [%s] on host [%s], "
- "ret = [%d], stdout = [%s], stderr = [%s]",
- command,
- self.lpccd_host.sh_hostname,
- retval.cr_exit_status,
- retval.cr_stdout,
- retval.cr_stderr)
- return -1
- return 0
-
- def lpccd_start(self, log):
- """
- Add this dataset to LPCC
- """
- command = ("echo -n 'add %s %s %s' > /proc/fs/lustre/llite/%s/pcc" %
- (self.lpccd_root, self.lpccd_id, self.lpccd_projid,
- self.lpccd_client_name))
- retval = self.lpccd_host.sh_run(log, command)
- if retval.cr_exit_status:
- log.cl_error("failed to run command [%s] on host [%s], "
- "ret = [%d], stdout = [%s], stderr = [%s]",
- command,
- self.lpccd_host.sh_hostname,
- retval.cr_exit_status,
- retval.cr_stdout,
- retval.cr_stderr)
- return -1
- return 0
-
-
-class LPCCRwDataset(LPCCDataset):
- """
- Each readwrite PCC client has an object of this type
- """
- # pylint: disable=too-few-public-methods,too-many-instance-attributes
- # pylint: disable=too-many-arguments
- def __init__(self, log, dataset_id, archive_id, project_id, lrwd_root,
- parent_directory, fsname, have_raolu, mdts, client_name,
- lustre_client):
- super(LPCCRwDataset, self).__init__(client_name,
- LPCC_TYPE_READWRITE,
- lrwd_root, archive_id,
- project_id,
- lustre_client)
- self.lrwd_workspace = parent_directory + "/" + dataset_id
- self.lrwd_copytool = hsm.HSMCopytool("copytool", lustre_client.lc_host, archive_id,
- lrwd_root, lustre_client.lc_mnt,
- self.lrwd_workspace)
- self.lrwd_parent_directory = parent_directory
- self.lrwd_have_raolu = have_raolu
- self.lrwd_removers = []
- if not have_raolu:
- for mdt in mdts:
- remover_id = "remover_" + mdt.ls_index_string
- remover = hsm.HSMRemover(log, remover_id, lustre_client.lc_host, fsname, mdt,
- lrwd_root, self.lrwd_workspace)
- self.lrwd_removers.append(remover)
-
- def lrwd_killall(self, log):
- """
- Kill all the process of this LPCC client
- """
- self.lrwd_copytool.hc_killall(log)
- for remover in self.lrwd_removers:
- remover.hr_killall(log)
- return 0
-
- def lpccd_stop(self, log):
- """
- Stop all the process of this LPCC client
- """
- self.lrwd_killall(log)
- self.lrwd_copytool.hc_thread.join()
- for remover in self.lrwd_removers:
- remover.hr_thread.join()
- remover.hr_fini(log)
- return super(LPCCRwDataset, self).lpccd_stop(log)
-
- def lpccd_start(self, log):
- """
- Start all the process of this LPCC client
- """
- ret = utils.mkdir(self.lrwd_workspace)
- if ret:
- log.cl_error("failed to create directory [%s] on local host",
- self.lrwd_workspace)
- return -1
-
- self.lrwd_killall(log)
- ret = super(LPCCRwDataset, self).lpccd_start(log)
- if ret:
- log.cl_error("failed to add dataset [%s] of mnt [%s] on host "
- "[%s]", self.lpccd_root, self.lpccd_host.sh_hostname,
- self.lpccd_lustre_mnt)
- return -1
-
- for remover in self.lrwd_removers:
- ret = remover.hr_thread_start(log)
- if ret:
- log.cl_error("failed to start remover thread")
- return -1
- ret = self.lrwd_copytool.hc_thread_start(log)
- if ret:
- log.cl_error("failed to start copytool thread for dataset [%s] of "
- "mnt [%s] on host [%s]", self.lpccd_root,
- self.lpccd_host.sh_hostname, self.lpccd_lustre_mnt)
- return -1
-
- return 0
-
-
-def find_lpcc_dataset_from_id(datasets, set_id):
- """
- Find LPCC from archive ID
- """
- # pylint: disable=unused-variable
- for dataset in datasets:
- if dataset.lpccd_id == set_id:
- return dataset
- return None
-
-
-class LPCCManager(object):
- """
- Each SSH host has an object of this type
- """
- # pylint: disable=too-few-public-methods,too-many-arguments,too-many-instance-attributes
- def __init__(self, workspace, config_fpath):
- self.lm_rw_dataset_dict = {}
- self.lm_rw_datasets = []
- self.lm_ro_dataset_groups = {}
- self.lm_ro_datasets = []
- self.lm_lustre_clients = {}
- self.lm_workspace = workspace
- self.lm_config_fpath = config_fpath
- self.lm_fsname = None
- self.lm_hosts = {}
- self.lm_mdt_hosts = []
-
- def lm_parse(self, log):
- """
- Parse the configuration
- """
- # pylint: disable=bare-except,too-many-locals,too-many-return-statements
- # pylint: disable=too-many-branches,too-many-statements,unused-variable
- config_fd = open(self.lm_config_fpath)
- ret = 0
- try:
- config = yaml.load(config_fd)
- except:
- log.cl_error("not able to load [%s] as yaml file: %s",
- self.lm_config_fpath, traceback.format_exc())
- ret = -1
- config_fd.close()
- if ret:
- return -1
-
- fsname = config["fsname"]
- host_configs = config[STR_SSH_HOSTS]
- for host_config in host_configs:
- hostname = host_config[STR_HOSTNAME]
- host_id = host_config[STR_HOST_ID]
- if STR_SSH_IDENTITY_FILE in host_config:
- ssh_identity_file = host_config[STR_SSH_IDENTITY_FILE]
- else:
- ssh_identity_file = None
- if host_id in self.lm_hosts:
- log.cl_error("multiple hosts with the same ID [%s]", host_id)
- return -1
- host = lustre.LustreServerHost(hostname,
- identity_file=ssh_identity_file,
- host_id=host_id)
- self.lm_hosts[host_id] = host
-
- mds_configs = config[STR_MDS_HOSTS]
- for mds_config in mds_configs:
- mds_host_id = mds_config[STR_HOST_ID]
- if mds_host_id not in self.lm_hosts:
- log.cl_error("no host with ID [%s] is configured", host_id)
- return -1
- host = self.lm_hosts[mds_host_id]
- self.lm_mdt_hosts.append(host)
-
- have_raolu = True
- mdts = []
- for host in self.lm_mdt_hosts:
- tmp_clients = {}
- tmp_osts = {}
- tmp_mdts = {}
- ret = host.lsh_lustre_detect_services(tmp_clients, tmp_osts, tmp_mdts)
- if ret:
- log.cl_error("failed to detect services on host [%s]",
- host.sh_hostname)
- return -1
- for mdt_index, mdt in tmp_mdts.iteritems():
- if mdt.lsi_service.ls_lustre_fs.lf_fsname != fsname:
- continue
- mdts.append(mdt)
- ret = mdt.mdti_enable_hsm_control(log)
- if ret:
- return -1
-
- ret = mdt.mdti_enable_raolu(log)
- if ret < 0:
- return -1
- elif ret == 1:
- have_raolu = False
-
- lustre_client_configs = config[STR_LUSTRE_CLIENTS]
- for lustre_client_config in lustre_client_configs:
- host_id = lustre_client_config[STR_HOST_ID]
- if host_id not in self.lm_hosts:
- log.cl_error("no host with ID [%s] is configured", host_id)
- return -1
- host = self.lm_hosts[host_id]
-
- lustre_mount_point = lustre_client_config[STR_LUSTRE_MOUNT_POINT]
- client_id = lustre_client_config[STR_CLIENT_ID]
- if client_id in self.lm_lustre_clients:
- log.cl_error("multiple Lustre client with the same ID [%s]",
- client_id)
- return -1
- lustre_fs = lustre.LustreFilesystem(fsname)
- lustre_client = lustre.LustreClient(log, lustre_fs, host, lustre_mount_point)
- self.lm_lustre_clients[client_id] = lustre_client
-
- if STR_LPCC_READWRITE_DATASETS in config:
- lpcc_rw_dataset_configs = config[STR_LPCC_READWRITE_DATASETS]
- else:
- lpcc_rw_dataset_configs = []
-
- for lpcc_rw_dataset_config in lpcc_rw_dataset_configs:
- client_id = lpcc_rw_dataset_config[STR_CLIENT_ID]
- if client_id not in self.lm_lustre_clients:
- log.cl_error("no Lustre client with ID [%s] is configured", client_id)
- return -1
- lustre_client = self.lm_lustre_clients[client_id]
- lustre_mount_point = lustre_client.lc_mnt
-
- host = lustre_client.lc_host
- dataset_id = lpcc_rw_dataset_config[STR_DATASET_ID]
- if dataset_id in self.lm_rw_dataset_dict:
- log.cl_error("multiple LPCC client with the same ID [%s]",
- dataset_id)
- return -1
- lpcc_root = lpcc_rw_dataset_config[STR_LPCC_ROOT]
- archive_id = lpcc_rw_dataset_config[STR_ARCHIVE_ID]
- lpcc_rw_dataset = find_lpcc_dataset_from_id(self.lm_rw_datasets, archive_id)
- if lpcc_rw_dataset is not None:
- log.cl_error("multiple LPCC client with the same archive ID [%s]",
- archive_id)
- return -1
-
- project_id = lpcc_rw_dataset_config[STR_PROJECT_ID]
-
- client_name = host.lsh_getname(log, lustre_mount_point)
- if client_name is None:
- log.cl_error("failed to get client name of path [%s] on host "
- "[%s]", lustre_mount_point, host.sh_hostname)
- return -1
- if not client_name.startswith(fsname + "-"):
- log.cl_error("client name [%s] of path [%s] on host [%s] "
- "doesn't have expected fsname [%s] ", client_name,
- lustre_mount_point, host.sh_hostname, fsname)
- return -1
- lpcc_rw_dataset = LPCCRwDataset(log, dataset_id, archive_id, project_id,
- lpcc_root, self.lm_workspace,
- fsname, have_raolu, mdts, client_name,
- lustre_client)
- self.lm_rw_dataset_dict[dataset_id] = lpcc_rw_dataset
- self.lm_rw_datasets.append(lpcc_rw_dataset)
-
- if STR_LPCC_READONLY_DATASET_GROUPS in config:
- group_configs = config[STR_LPCC_READONLY_DATASET_GROUPS]
- else:
- group_configs = []
- for group_config in group_configs:
- lpcc_root = group_config[STR_LPCC_ROOT]
- group_id = group_config[STR_GROUP_ID]
- if group_id in self.lm_ro_dataset_groups:
- log.cl_error("multiple LPCC readonly group with the same group ID [%s]",
- group_id)
- return -1
-
- project_id = group_config[STR_PROJECT_ID]
- client_configs = group_config[STR_LUSTRE_CLIENTS]
- group_datasets = []
- for client_config in client_configs:
- client_id = client_config[STR_CLIENT_ID]
- if client_id not in self.lm_lustre_clients:
- log.cl_error("no Lustre client with ID [%s] is configured", client_id)
- return -1
- lustre_client = self.lm_lustre_clients[client_id]
-
- fsname = lustre_client.lc_lustre_fs.lf_fsname
- host = lustre_client.lc_host
- lustre_mount_point = lustre_client.lc_mnt
- client_name = host.lsh_getname(log, lustre_mount_point)
- if client_name is None:
- log.cl_error("failed to get client name of path [%s] on host "
- "[%s]", lustre_mount_point, host.sh_hostname)
- return -1
- if not client_name.startswith(fsname + "-"):
- log.cl_error("client name [%s] of path [%s] on host [%s] "
- "doesn't have expected fsname [%s] ", client_name,
- lustre_mount_point, host.sh_hostname, fsname)
- return -1
-
- dataset = LPCCDataset(client_name, LPCC_TYPE_READONLY,
- lpcc_root, group_id, project_id,
- lustre_client)
- self.lm_ro_datasets.append(dataset)
- group_datasets.append(dataset)
- self.lm_ro_dataset_groups[group_id] = group_datasets
-
- def lm_start(self, log):
- """
- Start LPCC manager
- """
- # pylint: disable=unused-variable
- for host_id, host in self.lm_hosts.iteritems():
- clients = lustre.detect_lustre_clients(log, host)
- for client in clients:
- ret = lpcc_dataset_stop(log, host, client.lc_mnt)
- if ret:
- return -1
-
- for dataset in self.lm_rw_datasets:
- ret = dataset.lpccd_start(log)
- if ret:
- log.cl_error("failed to start readwrite dataset")
- return -1
-
- for dataset in self.lm_ro_datasets:
- ret = dataset.lpccd_start(log)
- if ret:
- log.cl_error("failed to start readonly dataset")
- return -1
-
- return 0
-
- def lm_stop(self, log):
- """
- Stop LPCC manager
- """
- for lpcc_rw_dataset in self.lm_rw_datasets:
- lpcc_rw_dataset.lpccd_stop(log)
-
- for lpcc_rw_dataset in self.lm_ro_datasets:
- lpcc_rw_dataset.lpccd_stop(log)
-
-
-def manage_lpcc_locked(log, workspace, config_fpath):
- """
- Manage LPCC clients holding the lock
- """
- manager = LPCCManager(workspace, config_fpath)
- ret = manager.lm_parse(log)
- if ret:
- return ret
-
- ret = manager.lm_start(log)
- if ret:
- return ret
-
- while not daemon.SHUTTING_DOWN:
- time.sleep(1)
-
- ret = manager.lm_stop(log)
-
- return 0
-
-
-def manage_lpcc(log, workspace, config_fpath):
- """
- Manage LPCC clients
- """
- # pylint: disable=bare-except
- lock_file = config_fpath + ".lock"
- lock = filelock.FileLock(lock_file)
- try:
- with lock.acquire(timeout=0):
- try:
- ret = manage_lpcc_locked(log, workspace, config_fpath)
- except:
- ret = -1
- log.cl_error("exception: %s", traceback.format_exc())
- lock.release()
- except filelock.Timeout:
- ret = -1
- log.cl_error("someone else is holding lock of file [%s], aborting "
- "to prevent conflicts", lock_file)
- return ret
-
-
-def lpcc_dataset_list(log, host, lustre_mount_point):
- """
- List the datasets on a Lustre mount point
- """
- # pylint: disable=too-many-locals
- client_name = host.lsh_getname(log, lustre_mount_point)
- if client_name is None:
- return None
-
- fsname = client_name.split('-')[0]
- lustre_fs = lustre.LustreFilesystem(fsname)
- lustre_client = lustre.LustreClient(log, lustre_fs, host, lustre_mount_point)
-
- command = ("cat /proc/fs/lustre/llite/%s/pcc" % client_name)
- retval = host.sh_run(log, command)
- if retval.cr_exit_status:
- log.cl_error("failed to run command [%s] on host [%s], "
- "ret = [%d], stdout = [%s], stderr = [%s]",
- command,
- host.sh_hostname,
- retval.cr_exit_status,
- retval.cr_stdout,
- retval.cr_stderr)
- return None
-
- dataset_pattern = (r"^(?P<root>\S+) (?P<archive_id>\S+) (?P<projid>\S+)$")
- dataset_regular = re.compile(dataset_pattern)
- datasets = []
- for line in retval.cr_stdout.splitlines():
- log.cl_debug("parsing line [%s] to get dataset", line)
- match = dataset_regular.match(line)
- if match:
- root = match.group("root")
- archive_id = match.group(STR_ARCHIVE_ID)
- projid = match.group("projid")
- dataset = LPCCDataset(client_name, LPCC_TYPE_NONE, root,
- archive_id, projid, lustre_client)
- datasets.append(dataset)
- log.cl_debug("LPCC dataset [%s] configured on dir [%s] of host "
- "[%s]", root, lustre_mount_point, host.sh_hostname)
- else:
- reason = ("failed to parse line [%s] to get dataset" % line)
- log.cl_error(reason)
- raise Exception(reason)
- return datasets
-
-
-def lpcc_dataset_stop(log, host, lustre_mount_point):
- """
- Stop the datasets on a Lustre mount point
- """
- datasets = lpcc_dataset_list(log, host, lustre_mount_point)
- for dataset in datasets:
- ret = dataset.lpccd_stop(log)
- if ret:
- return ret
- return 0
-
-
-class PCCState(object):
- """
- The HSM state
- """
- # pylint: disable=too-few-public-methods
- def __init__(self, pcc_type, pcc_file=None, user_number=None, attr_cached=None):
- self.ps_type = pcc_type
- self.ps_pcc_file = pcc_file
- self.ps_user_number = user_number
- self.ps_attr_cached = attr_cached
-
-
-def lfs_pcc_state(log, fpath, host=None):
- """
- PCC state
- """
- command = ("lfs pcc_state %s" % (fpath))
- extra_string = ""
- if host is None:
- retval = utils.run(command)
- else:
- retval = host.sh_run(log, command)
- extra_string = ("on host [%s]" % host.sh_hostname)
- if retval.cr_exit_status != 0:
- log.cl_error("failed to run command [%s]%s, "
- "ret = [%d], stdout = [%s], stderr = [%s]",
- command, extra_string,
- retval.cr_exit_status, retval.cr_stdout,
- retval.cr_stderr)
- return None
-
- file_part = "file: %s, " % fpath
-
- output = retval.cr_stdout.strip()
- if not output.startswith(file_part):
- log.cl_error("unexpected output [%s]", output)
- return None
-
- fpath_len = len(file_part)
- output = output[fpath_len:]
-
- type_none = "type: none"
- if output == type_none:
- return PCCState("none")
-
- match = LPCC_STATE_REGULAR.match(output)
- if not match:
- log.cl_error("output [%s] doesn't mather pattern [%s]",
- output, LPCC_STATE_PATTERN)
- return None
-
- pcc_type = match.group("pcc_type")
- pcc_file = match.group("pcc_file")
- user_number = match.group("user_number")
- attr_cached = match.group("attr_cached")
- return PCCState(pcc_type, pcc_file=pcc_file, user_number=user_number,
- attr_cached=attr_cached)
-
-
-def main():
- """
- Run LPCC manager
- """
- # pylint: disable=unused-variable,not-callable
- if sys.version[0] == '2':
- reload(sys)
- if hasattr(sys, "setdefaultencoding"):
- set_encoding = getattr(sys, "setdefaultencoding", None)
- set_encoding('UTF-8')
- else:
- os.environ["PYTHONIOENCODING"] = 'UTF-8'
- config_fpath = LPCC_CONFIG
-
- if len(sys.argv) == 2:
- config_fpath = sys.argv[1]
- elif len(sys.argv) > 2:
- usage()
- sys.exit(-1)
-
- identity = time_util.local_strftime(time_util.utcnow(), "%Y-%m-%d-%H_%M_%S")
- workspace = LPCC_LOG_DIR + "/" + identity
-
- if not os.path.exists(LPCC_LOG_DIR):
- ret = utils.mkdir(LPCC_LOG_DIR)
- if ret:
- utils.eprint("failed to create directory [%s]" % LPCC_LOG_DIR)
- sys.exit(-1)
- elif not os.path.isdir(LPCC_LOG_DIR):
- utils.eprint("[%s] is not a directory" % LPCC_LOG_DIR)
- sys.exit(-1)
-
- if not os.path.exists(workspace):
- ret = utils.mkdir(workspace)
- if ret:
- utils.eprint("failed to create directory [%s]" % workspace)
- sys.exit(-1)
- elif not os.path.isdir(workspace):
- utils.eprint("[%s] is not a directory" % workspace)
- sys.exit(-1)
-
- log = clog.get_log(resultsdir=workspace)
- log.cl_info("started LPCC manager using config [%s], please check [%s] for "
- "more log" % (config_fpath, workspace))
- signal.signal(signal.SIGINT, daemon.signal_handler)
- signal.signal(signal.SIGTERM, daemon.signal_handler)
-
- save_fpath = workspace + "/" + LPCC_CONFIG_FNAME
- log.cl_debug("copying config file from [%s] to [%s]", config_fpath,
- save_fpath)
- shutil.copyfile(config_fpath, save_fpath)
- ret = manage_lpcc(log, workspace, config_fpath)
- sys.exit(ret)
+++ /dev/null
-# Copyright (c) 2017 DataDirect Networks, Inc.
-# All Rights Reserved.
-# Author: lixi@ddn.com
-
-"""
-Library for cleanup LPCC(Lustre Persistent Client Cache) storage
-"""
-
-import sys
-import os
-import getopt
-
-# Local libs
-from pylustre import utils
-from pylustre import time_util
-from pylustre import clog
-from pylustre import hsm_check
-
-LPCC_CLEANUP_LOG_DIR = "/var/log/lpcc_cleanup"
-
-
-def usage():
- """
- Print usage string
- """
- utils.oprint("Usage: %s <--pcc_root pcc_root> <--lustre_mnt lustre_mnt> " %
- sys.argv[0])
-
-
-def lfs_pcc_detach_fid(log, lustre_mnt, fid):
- """
- Transfer FID to fpath
- """
- command = ("lfs pcc_detach_fid %s %s" % (lustre_mnt, fid))
- retval = utils.run(command)
- if retval.cr_exit_status != 0:
- log.cl_error("failed to run command [%s], "
- "ret = [%d], stdout = [%s], stderr = [%s]",
- command,
- retval.cr_exit_status, retval.cr_stdout,
- retval.cr_stderr)
- return None
-
- return retval.cr_exit_status
-
-
-def pcc_fid_detach(log, lustre_mnt, fid_name):
- """
- Detach FID exists from PCC
- """
- ret = lfs_pcc_detach_fid(log, lustre_mnt, fid_name)
- if ret:
- log.cl_error("failed to detach FID [%s] on Lustre file system [%s] from PCC",
- fid_name, lustre_mnt)
- return -1
- return 0
-
-
-def main():
- """
- Cleanup LPCC
- """
- # pylint: disable=unused-variable,not-callable
- if sys.version[0] == '2':
- reload(sys)
- if hasattr(sys, "setdefaultencoding"):
- set_encoding = getattr(sys, "setdefaultencoding", None)
- set_encoding('UTF-8')
- else:
- os.environ["PYTHONIOENCODING"] = 'UTF-8'
-
- options, remainder = getopt.getopt(sys.argv[1:],
- "h",
- ["help",
- "pcc_root=",
- "lustre_mnt="])
-
- pcc_root = None
- lustre_mnt = None
- for opt, arg in options:
- if opt == "--pcc_root":
- pcc_root = arg.rstrip('/')
- elif opt == "--lustre_mnt":
- lustre_mnt = arg
- elif opt == '-h' or opt == "--help":
- usage()
- sys.exit(0)
- if pcc_root is None or lustre_mnt is None:
- usage()
- sys.exit(-1)
-
- identity = time_util.local_strftime(time_util.utcnow(), "%Y-%m-%d-%H_%M_%S")
- workspace = LPCC_CLEANUP_LOG_DIR + "/" + identity
-
- if not os.path.exists(LPCC_CLEANUP_LOG_DIR):
- ret = utils.mkdir(LPCC_CLEANUP_LOG_DIR)
- if ret:
- sys.stderr.write("failed to create directory [%s]" % LPCC_CLEANUP_LOG_DIR)
- sys.exit(-1)
- elif not os.path.isdir(LPCC_CLEANUP_LOG_DIR):
- sys.stderr.write("[%s] is not a directory" % LPCC_CLEANUP_LOG_DIR)
- sys.exit(-1)
-
- if not os.path.exists(workspace):
- ret = utils.mkdir(workspace)
- if ret:
- sys.stderr.write("failed to create directory [%s]" % workspace)
- sys.exit(-1)
- elif not os.path.isdir(workspace):
- sys.stderr.write("[%s] is not a directory" % workspace)
- sys.exit(-1)
-
- log = clog.get_log(resultsdir=workspace)
- log.cl_info("started LPCC cleanup, please check [%s] for "
- "more log" % (workspace))
- ret = hsm_check.hsm_process(log, lustre_mnt, pcc_root, pcc_fid_detach)
- sys.exit(ret)
+++ /dev/null
-# Copyright (c) 2017 DataDirect Networks, Inc.
-# All Rights Reserved.
-# Author: lixi@ddn.com
-
-"""
-Library for testing LPCC(Lustre Persistent Client Cache)
-"""
-# pylint: disable=too-many-lines
-import sys
-import traceback
-import signal
-import os
-import shutil
-import filelock
-
-# Local libs
-from pylpcc import lpcc
-from pylustre import lustre
-from pylustre import lustre_test
-from pylustre import utils
-from pylustre import time_util
-from pylustre import daemon
-from pylustre import clog
-
-LPCC_LOG_TEST_DIR = "/var/log/lpcc_test"
-MANAGER = None
-LPCC_TESTS = []
-
-
-def usage():
- """
- Print usage string
- """
- utils.oprint("Usage: %s <config_file>" % sys.argv[0])
-
-
-def check_file_size(log, host, fpath, expected_size):
- """
- Check the file size
- """
- command = ("stat --printf=%%s %s" % (fpath))
- retval = host.sh_run(log, command)
- if retval.cr_exit_status:
- log.cl_error("failed to run command [%s] on host [%s], "
- "ret = [%d], stdout = [%s], stderr = [%s]",
- command,
- host.sh_hostname,
- retval.cr_exit_status,
- retval.cr_stdout,
- retval.cr_stderr)
- return -1
- size = int(retval.cr_stdout)
-
- if size != expected_size:
- log.cl_error("wrong size of file [%s], expected [%s], got [%s]",
- fpath, expected_size, size)
- return -1
- return 0
-
-
-def check_lpcc_sizes(log, lpcc_host, lpcc_fpath, lustre_fpath, expected_size):
- """
- Check the LPCC file sizes
- """
- ret = check_file_size(log, lpcc_host, lpcc_fpath, expected_size)
- if ret:
- log.cl_error("wrong size of LPCC file")
- return ret
- ret = check_file_size(log, lpcc_host, lustre_fpath, expected_size)
- if ret:
- log.cl_error("wrong size of Lustre file")
- return ret
- return 0
-
-
-def check_file_data(log, host, fpath, expected_data):
- """
- Check the file data
- """
- # Read data before checking size since this might trigger HSM restore
- command = ("cat %s" % (fpath))
- retval = host.sh_run(log, command)
- if retval.cr_exit_status:
- log.cl_error("failed to run command [%s] on host [%s], "
- "ret = [%d], stdout = [%s], stderr = [%s]",
- command,
- host.sh_hostname,
- retval.cr_exit_status,
- retval.cr_stdout,
- retval.cr_stderr)
- return -1
-
- if retval.cr_stdout != expected_data:
- log.cl_error("wrong data of file [%s], expected [%s], "
- "got [%s]", fpath, expected_data,
- retval.cr_stdout)
- return -1
-
- expected_size = len(expected_data)
- ret = check_file_size(log, host, fpath, expected_size)
- if ret:
- log.cl_error("wrong size of file [%s]", fpath)
- return -1
- return 0
-
-
-def check_lpcc_data(log, lpcc_host, lpcc_fpath, lustre_fpath, expected_data):
- """
- Check the LPCC file data
- """
- ret = check_file_data(log, lpcc_host, lpcc_fpath, expected_data)
- if ret:
- log.cl_error("wrong data of LPCC file")
- return ret
-
- ret = check_file_data(log, lpcc_host, lustre_fpath, expected_data)
- if ret:
- log.cl_error("wrong data of Lustre file")
- return ret
-
- return 0
-
-
-def check_multiop_exists(log, lpcc_rw_datasets):
- """
- Check that all hosts has multiop command
- """
- for lpcc_rw_dataset in lpcc_rw_datasets:
- host = lpcc_rw_dataset.lpccd_host
- ret = host.sh_file_executable(log, lustre_test.MULTIOP)
- if ret:
- log.cl_error("command [%s] is doesn't exist on host [%s]",
- lustre_test.MULTIOP, host.sh_hostname)
- return ret
- return 0
-
-
-def lpcc_cleanup_test_file(log, lpcc_host, lustre_dir, lustre_fpath):
- """
- Cleanup the test directory
- """
- command = ("rm %s -f" % (lustre_fpath))
- retval = lpcc_host.sh_run(log, command)
- if retval.cr_exit_status:
- log.cl_error("failed to run command [%s] on host [%s], "
- "ret = [%d], stdout = [%s], stderr = [%s]",
- command,
- lpcc_host.sh_hostname,
- retval.cr_exit_status,
- retval.cr_stdout,
- retval.cr_stderr)
- return -1
-
- command = ("test -e %s" % (lustre_dir))
- retval = lpcc_host.sh_run(log, command)
- if retval.cr_exit_status == 0:
- command = ("rmdir %s" % (lustre_dir))
- retval = lpcc_host.sh_run(log, command)
- if retval.cr_exit_status:
- log.cl_error("failed to run command [%s] on host [%s], "
- "ret = [%d], stdout = [%s], stderr = [%s]",
- command,
- lpcc_host.sh_hostname,
- retval.cr_exit_status,
- retval.cr_stdout,
- retval.cr_stderr)
- return -1
-
- command = ("mkdir %s" % (lustre_dir))
- retval = lpcc_host.sh_run(log, command)
- if retval.cr_exit_status:
- log.cl_error("failed to run command [%s] on host [%s], "
- "ret = [%d], stdout = [%s], stderr = [%s]",
- command,
- lpcc_host.sh_hostname,
- retval.cr_exit_status,
- retval.cr_stdout,
- retval.cr_stderr)
- return -1
- return 0
-
-
-def lpcc_rw_test(log, restore=False, project=True):
- """
- Run LPCC readwrite tests
- """
- # pylint: disable=too-many-return-statements,too-many-locals
- # pylint: disable=too-many-statements,too-many-branches
- # pylint: disable=no-self-use
- lpcc_nclient = len(MANAGER.lm_rw_datasets)
- if lpcc_nclient < 1:
- log.cl_debug("not enough LPCC client")
- return 1
- lpcc_rw_dataset = MANAGER.lm_rw_datasets[0]
- lpcc_root = lpcc_rw_dataset.lpccd_root
- lpcc_archive_id = lpcc_rw_dataset.lpccd_id
- lpcc_host = lpcc_rw_dataset.lpccd_host
- lustre_mnt = lpcc_rw_dataset.lpccd_lustre_mnt
-
- lustre_dirname = "dir"
- lustre_fname = "file"
- lustre_dir = ("%s/%s" % (lustre_mnt, lustre_dirname))
- lustre_fpath = ("%s/%s" % (lustre_dir, lustre_fname))
- ret = lpcc_cleanup_test_file(log, lpcc_host, lustre_dir, lustre_fpath)
- if ret:
- log.cl_error("failed to cleanup test file")
- return -1
-
- if project:
- project_supported = lpcc_host.sh_chattr_has_projid_support(log)
- if not project_supported:
- log.cl_error("project is not supported by chattr, please upgrade "
- "E2fsprogs to latest Lustre version")
- return -1
-
- command = ("chattr -p %d %s" % (lpcc_rw_dataset.lpccd_projid, lustre_dir))
- retval = lpcc_host.sh_run(log, command)
- if retval.cr_exit_status:
- log.cl_error("failed to run command [%s] on host [%s], "
- "ret = [%d], stdout = [%s], stderr = [%s]",
- command,
- lpcc_host.sh_hostname,
- retval.cr_exit_status,
- retval.cr_stdout,
- retval.cr_stderr)
- log.cl_error("project support might not be enabled, you might need "
- "to run [tune2fs -O project $DEV] on all Lustre devices")
- return -1
-
- file_data = "fetch_origin"
- command = ("echo -n %s > %s" % (file_data, lustre_fpath))
- retval = lpcc_host.sh_run(log, command)
- if retval.cr_exit_status:
- log.cl_error("failed to run command [%s] on host [%s], "
- "ret = [%d], stdout = [%s], stderr = [%s]",
- command,
- lpcc_host.sh_hostname,
- retval.cr_exit_status,
- retval.cr_stdout,
- retval.cr_stderr)
- return -1
-
- if not project:
- pcc_state = lpcc.lfs_pcc_state(log, lustre_fpath, host=lpcc_host)
- if pcc_state.ps_type != lpcc.LPCC_TYPE_NONE:
- log.cl_error("wrong PCC type, expected [%s], got [%s]",
- lpcc.LPCC_TYPE_NONE, pcc_state.ps_type)
-
- command = ("lfs pcc_fetch -a %s %s" %
- (lpcc_archive_id, lustre_fpath))
- retval = lpcc_host.sh_run(log, command)
- if retval.cr_exit_status:
- log.cl_error("failed to run command [%s] on host [%s], "
- "ret = [%d], stdout = [%s], stderr = [%s]",
- command,
- lpcc_host.sh_hostname,
- retval.cr_exit_status,
- retval.cr_stdout,
- retval.cr_stderr)
- return -1
-
- pcc_state = lpcc.lfs_pcc_state(log, lustre_fpath, host=lpcc_host)
- if pcc_state.ps_type != lpcc.LPCC_TYPE_READWRITE:
- log.cl_error("wrong PCC type, expected [%s], got [%s]",
- lpcc.LPCC_TYPE_READWRITE, pcc_state.ps_type)
- return -1
-
- hsm_states = (lustre.HSMState.HS_EXISTS | lustre.HSMState.HS_ARCHIVED |
- lustre.HSMState.HS_RELEASED)
- ret = lustre.check_hsm_state(log, lustre_fpath, hsm_states,
- archive_id=lpcc_archive_id, host=lpcc_host)
- if ret:
- log.cl_error("failed to check HSM status after creating LPCC file [%s]",
- lustre_fpath)
- return ret
-
- fid_string = lustre.lfs_path2fid(log, lpcc_host, lustre_fpath)
- if fid_string is None:
- log.cl_error("failed to get fid from path [%s]", lustre_fpath)
- return -1
-
- lustre_fid = lustre.LustreFID(log, fid_string)
- lpcc_fpath = lustre_fid.lf_posix_archive_path(lpcc_root)
- command = ("ls -l %s" % (lpcc_fpath))
- retval = lpcc_host.sh_run(log, command)
- if retval.cr_exit_status:
- log.cl_error("failed to run command [%s] on host [%s], "
- "ret = [%d], stdout = [%s], stderr = [%s]",
- command,
- lpcc_host.sh_hostname,
- retval.cr_exit_status,
- retval.cr_stdout,
- retval.cr_stderr)
- return -1
-
- ret = check_lpcc_data(log, lpcc_host, lpcc_fpath, lustre_fpath, file_data)
- if ret:
- log.cl_error("wrong file data after creation")
- return ret
-
- size = 7654321
- command = ("dd if=/dev/zero of=%s bs=%s count=1" %
- (lustre_fpath, size))
- retval = lpcc_host.sh_run(log, command)
- if retval.cr_exit_status:
- log.cl_error("failed to run command [%s] on host [%s], "
- "ret = [%d], stdout = [%s], stderr = [%s]",
- command,
- lpcc_host.sh_hostname,
- retval.cr_exit_status,
- retval.cr_stdout,
- retval.cr_stderr)
- return -1
-
- ret = check_lpcc_sizes(log, lpcc_host, lpcc_fpath, lustre_fpath, size)
- if ret:
- log.cl_error("wrong file size after wrote file")
- return ret
-
- size = 1234567
- command = ("truncate -s %s %s" % (size, lustre_fpath))
- retval = lpcc_host.sh_run(log, command)
- if retval.cr_exit_status:
- log.cl_error("failed to run command [%s] on host [%s], "
- "ret = [%d], stdout = [%s], stderr = [%s]",
- command,
- lpcc_host.sh_hostname,
- retval.cr_exit_status,
- retval.cr_stdout,
- retval.cr_stderr)
- return -1
-
- ret = check_lpcc_sizes(log, lpcc_host, lpcc_fpath, lustre_fpath, size)
- if ret:
- log.cl_error("wrong file size after truncated file")
- return ret
-
- file_data = "file_data"
- command = ("echo -n %s > %s" % (file_data, lustre_fpath))
- retval = lpcc_host.sh_run(log, command)
- if retval.cr_exit_status:
- log.cl_error("failed to run command [%s] on host [%s], "
- "ret = [%d], stdout = [%s], stderr = [%s]",
- command,
- lpcc_host.sh_hostname,
- retval.cr_exit_status,
- retval.cr_stdout,
- retval.cr_stderr)
- return -1
-
- ret = check_lpcc_data(log, lpcc_host, lpcc_fpath, lustre_fpath, file_data)
- if ret:
- log.cl_error("wrong file data after written")
- return ret
-
- pcc_state = lpcc.lfs_pcc_state(log, lustre_fpath, host=lpcc_host)
- if pcc_state.ps_type != lpcc.LPCC_TYPE_READWRITE:
- log.cl_error("wrong PCC type, expected [%s], got [%s]",
- lpcc.LPCC_TYPE_READWRITE, pcc_state.ps_type)
- return -1
-
- ret = lustre.check_hsm_state(log, lustre_fpath, hsm_states,
- archive_id=lpcc_archive_id, host=lpcc_host)
- if ret:
- log.cl_error("failed to check HSM status after written LPCC file [%s]",
- lustre_fpath)
- return ret
-
- hsm_states = lustre.HSMState.HS_EXISTS | lustre.HSMState.HS_ARCHIVED
- if lpcc_nclient < 2 or restore:
- log.cl_debug("restoring the PCC file using command")
- ret = lustre.lfs_hsm_restore(log, lustre_fpath, host=lpcc_host)
- if ret:
- log.cl_error("failed to restore file [%s]", lustre_fpath)
- return ret
-
- ret = lustre.wait_hsm_state(log, lustre_fpath, hsm_states,
- archive_id=lpcc_archive_id, host=lpcc_host)
- if ret:
- log.cl_error("failed to wait status after restoring file [%s]",
- lustre_fpath)
- return ret
- else:
- log.cl_debug("accessing the data to trigger restoring of the PCC file")
- for remote_client in MANAGER.lm_rw_datasets[1:]:
- remote_host = remote_client.lpccd_host
- remote_mnt = remote_client.lpccd_lustre_mnt
- remote_dir = ("%s/%s" % (remote_mnt, lustre_dirname))
- remote_fpath = ("%s/%s" % (remote_dir, lustre_fname))
- ret = check_file_data(log, remote_host, remote_fpath, file_data)
- if ret:
- log.cl_error("wrong file data on the remote client [%s]",
- remote_host.sh_hostname)
- return ret
-
- pcc_state = lpcc.lfs_pcc_state(log, lustre_fpath, host=lpcc_host)
- if pcc_state.ps_type != lpcc.LPCC_TYPE_NONE:
- log.cl_error("wrong PCC type, expected [%s], got [%s]",
- lpcc.LPCC_TYPE_NONE, pcc_state.ps_type)
- return -1
-
- ret = lustre.check_hsm_state(log, lustre_fpath, hsm_states,
- archive_id=lpcc_archive_id, host=lpcc_host)
- if ret:
- log.cl_error("failed to check HSM status after restoring file [%s]",
- lustre_fpath)
- return ret
-
- # The file has been restored, thus no LPCC cache now
- file_data = "new_data"
- command = ("echo -n %s > %s" % (file_data, lustre_fpath))
- retval = lpcc_host.sh_run(log, command)
- if retval.cr_exit_status:
- log.cl_error("failed to run command [%s] on host [%s], "
- "ret = [%d], stdout = [%s], stderr = [%s]",
- command,
- lpcc_host.sh_hostname,
- retval.cr_exit_status,
- retval.cr_stdout,
- retval.cr_stderr)
- return -1
-
- pcc_state = lpcc.lfs_pcc_state(log, lustre_fpath, host=lpcc_host)
- if pcc_state.ps_type != lpcc.LPCC_TYPE_NONE:
- log.cl_error("wrong PCC type, expected [%s], got [%s]",
- lpcc.LPCC_TYPE_NONE, pcc_state.ps_type)
- return -1
-
- hsm_states = (lustre.HSMState.HS_EXISTS | lustre.HSMState.HS_ARCHIVED |
- lustre.HSMState.HS_DIRTY)
- ret = lustre.check_hsm_state(log, lustre_fpath, hsm_states,
- archive_id=lpcc_archive_id, host=lpcc_host)
- if ret:
- log.cl_error("failed to check HSM status after writing to restored file [%s]",
- lustre_fpath)
- return ret
-
- for dataset in MANAGER.lm_rw_datasets:
- host = dataset.lpccd_host
- ret = check_file_data(log, host, lustre_fpath, file_data)
- if ret:
- log.cl_error("wrong file data on the client [%s]",
- host.sh_hostname)
- return ret
- return 0
-
-
-def test_lfs_pcc_fetch_restore(log):
- """
- Test lfs pcc_fetch with HSM restore
- """
- return lpcc_rw_test(log, restore=True, project=False)
-
-
-LPCC_TESTS.append(test_lfs_pcc_fetch_restore)
-
-
-def test_lfs_pcc_fetch_access(log):
- """
- Test lfs pcc_fetch with remote access
- """
- return lpcc_rw_test(log, restore=False, project=False)
-
-
-LPCC_TESTS.append(test_lfs_pcc_fetch_access)
-
-
-def test_project_restore(log):
- """
- Test project ID with HSM restore
- """
- return lpcc_rw_test(log, restore=True, project=True)
-
-
-LPCC_TESTS.append(test_project_restore)
-
-
-def test_project_access(log):
- """
- Test project ID with remote access
- """
- return lpcc_rw_test(log, restore=False, project=True)
-
-
-LPCC_TESTS.append(test_project_access)
-
-
-def test_multi_open_when_creating(log):
- # pylint: disable=no-self-use,too-many-locals,too-many-return-statements
- # pylint: disable=too-many-statements
- """
- When a process created a LPCC file and holding the open, another
- process on the same client should be able to open the file.
- """
- lpcc_nclient = len(MANAGER.lm_rw_datasets)
- if lpcc_nclient < 1:
- log.cl_debug("not enough LPCC client")
- return 1
-
- if check_multiop_exists(log, MANAGER.lm_rw_datasets):
- log.cl_debug("multiop command doesn't exist")
- return 1
-
- lpcc_rw_dataset = MANAGER.lm_rw_datasets[0]
- lpcc_root = lpcc_rw_dataset.lpccd_root
- lpcc_archive_id = lpcc_rw_dataset.lpccd_id
- lpcc_host = lpcc_rw_dataset.lpccd_host
-
- lustre_mnt = lpcc_rw_dataset.lpccd_lustre_mnt
- lustre_dirname = "dir"
- lustre_fname = "file"
- lustre_dir = ("%s/%s" % (lustre_mnt, lustre_dirname))
- lustre_fpath = ("%s/%s" % (lustre_dir, lustre_fname))
- ret = lpcc_cleanup_test_file(log, lpcc_host, lustre_dir, lustre_fpath)
- if ret != 0:
- log.cl_error("failed to cleanup test file")
- return -1
-
- command = ("chattr -p %d %s" % (lpcc_rw_dataset.lpccd_projid, lustre_dir))
- retval = lpcc_host.sh_run(log, command)
- if retval.cr_exit_status != 0:
- log.cl_error("failed to run command [%s]", command)
- return -1
-
- multiop = lustre_test.Multiop(lpcc_host, lustre_fpath, "vO_c",
- "/tmp/multiop.stdout",
- "/tmp/multiop.stderr")
- multiop.mop_start(log)
- ret = multiop.mop_wait_pausing(log)
- if ret:
- return ret
-
- hsm_states = (lustre.HSMState.HS_EXISTS | lustre.HSMState.HS_ARCHIVED |
- lustre.HSMState.HS_RELEASED)
- ret = lustre.check_hsm_state(log, lustre_fpath, hsm_states,
- archive_id=lpcc_archive_id, host=lpcc_host)
- if ret:
- return ret
-
- file_data = "multiopen_data"
- command = ("echo -n %s > %s" % (file_data, lustre_fpath))
- retval = lpcc_host.sh_run(log, command)
- if retval.cr_exit_status != 0:
- log.cl_error("failed to run command [%s]", command)
- return -1
-
- ret = lustre.check_hsm_state(log, lustre_fpath, hsm_states,
- archive_id=lpcc_archive_id, host=lpcc_host)
- if ret:
- return ret
-
- fid_string = lustre.lfs_path2fid(log, lpcc_host, lustre_fpath)
- if fid_string is None:
- log.cl_error("failed to get fid from path [%s]", lustre_fpath)
- return -1
-
- lustre_fid = lustre.LustreFID(log, fid_string)
- lpcc_fpath = lustre_fid.lf_posix_archive_path(lpcc_root)
- command = ("ls -l %s" % (lpcc_fpath))
- retval = lpcc_host.sh_run(log, command)
- if retval.cr_exit_status != 0:
- log.cl_error("failed to run command [%s]", command)
- return -1
-
- ret = check_lpcc_data(log, lpcc_host, lpcc_fpath, lustre_fpath, file_data)
- if ret != 0:
- log.cl_error("failed to check lpcc data")
- return -1
-
- ret = lustre.check_hsm_state(log, lustre_fpath, hsm_states,
- archive_id=lpcc_archive_id, host=lpcc_host)
- if ret != 0:
- log.cl_error("failed to check hsm state")
- return -1
-
- multiop.mop_pkill(log)
- return 0
-
-
-LPCC_TESTS.append(test_multi_open_when_creating)
-
-
-def test_remote_local_open(log):
- # pylint: disable=no-self-use,too-many-locals,too-many-return-statements
- # pylint: disable=too-many-statements
- """
- When a process created a LPCC file and holding the open, another
- process on the different client should not be able to open the file.
- """
- lpcc_nclient = len(MANAGER.lm_rw_datasets)
- if lpcc_nclient < 2:
- log.cl_debug("not enough LPCC client")
- return 1
-
- if check_multiop_exists(log, MANAGER.lm_rw_datasets):
- log.cl_debug("multiop command doesn't exist")
- return 1
-
- lpcc_rw_dataset = MANAGER.lm_rw_datasets[0]
- lpcc_archive_id = lpcc_rw_dataset.lpccd_id
- lpcc_host = lpcc_rw_dataset.lpccd_host
-
- lustre_mnt = lpcc_rw_dataset.lpccd_lustre_mnt
- lustre_dirname = "dir"
- lustre_fname = "file"
- lustre_dir = ("%s/%s" % (lustre_mnt, lustre_dirname))
- lustre_fpath = ("%s/%s" % (lustre_dir, lustre_fname))
- ret = lpcc_cleanup_test_file(log, lpcc_host, lustre_dir, lustre_fpath)
- if ret != 0:
- log.cl_error("failed to cleanup test file")
- return -1
-
- command = ("chattr -p %d %s" % (lpcc_rw_dataset.lpccd_projid, lustre_dir))
- retval = lpcc_host.sh_run(log, command)
- if ret != 0:
- log.cl_error("failed to run command [%s]", command)
- return -1
-
- multiop = lustre_test.Multiop(lpcc_host, lustre_fpath, "vO_c",
- "/tmp/multiop.stdout",
- "/tmp/multiop.stderr")
- multiop.mop_start(log)
- ret = multiop.mop_wait_pausing(log)
- if ret != 0:
- log.cl_error("failed to wait multiop")
- return -1
-
- hsm_states = (lustre.HSMState.HS_EXISTS | lustre.HSMState.HS_ARCHIVED |
- lustre.HSMState.HS_RELEASED)
- ret = lustre.check_hsm_state(log, lustre_fpath, hsm_states,
- archive_id=lpcc_archive_id, host=lpcc_host)
- if ret != 0:
- log.cl_error("failed to check HSM state")
- return -1
-
- remote_client = MANAGER.lm_rw_datasets[1]
- remote_host = remote_client.lpccd_host
-
- command = ("cat %s" % (lustre_fpath))
- retval = remote_host.sh_run(log, command)
- if retval.cr_exit_status == 0:
- log.cl_error("command [%s] succeeded unexpectedly", command)
- return -1
-
- file_data = "multiopen_data"
- command = ("echo -n %s > %s" % (file_data, lustre_fpath))
- retval = remote_host.sh_run(log, command)
- if retval.cr_exit_status == 0:
- log.cl_error("command [%s] succeeded unexpectedly", command)
- return -1
-
- multiop.mop_signal(log)
- multiop.mop_wait_exit(log)
-
- command = ("cat %s" % (lustre_fpath))
- retval = remote_host.sh_run(log, command)
- if retval.cr_exit_status != 0:
- log.cl_error("command [%s] failed unexpectedly", command)
- return -1
-
- file_data = "multiopen_data"
- command = ("echo -n %s > %s" % (file_data, lustre_fpath))
- retval = remote_host.sh_run(log, command)
- if retval.cr_exit_status != 0:
- log.cl_error("command [%s] failed unexpectedly", command)
- return -1
- return 0
-
-
-LPCC_TESTS.append(test_remote_local_open)
-
-
-def lpcc_ro_test(log):
- # pylint: disable=too-many-locals,no-self-use
- # pylint: disable=too-many-statements,too-many-branches
- # pylint: disable=too-many-return-statements,unused-variable
- """
- Run LPCC readonly tests
- """
- dataset_number = len(MANAGER.lm_ro_datasets)
- if dataset_number < 1:
- log.cl_info("not enough LPCC readonly dataset")
- return 1
- lpcc_ro_dataset = MANAGER.lm_ro_datasets[0]
- lpcc_client = lpcc_ro_dataset.lpccd_lustre_client
- lpcc_root = lpcc_ro_dataset.lpccd_root
- lpcc_group_id = lpcc_ro_dataset.lpccd_id
- group_datasets = MANAGER.lm_ro_dataset_groups[lpcc_group_id]
- group_clients = []
- for group_dataset in group_datasets:
- lustre_client = group_dataset.lpccd_lustre_mnt
- if lustre_client not in group_clients:
- group_clients.append(lustre_client)
-
- none_group_clients = []
- for client_id, client in MANAGER.lm_lustre_clients.iteritems():
- if client not in group_clients:
- none_group_clients.append(lustre_client)
-
- lpcc_host = lpcc_ro_dataset.lpccd_host
- lustre_mnt = lpcc_ro_dataset.lpccd_lustre_mnt
-
- lustre_dirname = "dir_ro"
- lustre_fname = "file"
- lustre_dir = ("%s/%s" % (lustre_mnt, lustre_dirname))
- lustre_fpath = ("%s/%s" % (lustre_dir, lustre_fname))
- ret = lpcc_cleanup_test_file(log, lpcc_host, lustre_dir, lustre_fpath)
- if ret:
- log.cl_error("failed to cleanup test file")
- return -1
-
- file_data = "fetch_origin"
- command = ("echo -n %s > %s" % (file_data, lustre_fpath))
- retval = lpcc_host.sh_run(log, command)
- if retval.cr_exit_status:
- log.cl_error("failed to run command [%s] on host [%s], "
- "ret = [%d], stdout = [%s], stderr = [%s]",
- command,
- lpcc_host.sh_hostname,
- retval.cr_exit_status,
- retval.cr_stdout,
- retval.cr_stderr)
- return -1
-
- pcc_state = lpcc.lfs_pcc_state(log, lustre_fpath, host=lpcc_host)
- if pcc_state.ps_type != lpcc.LPCC_TYPE_NONE:
- log.cl_error("wrong PCC type, expected [%s], got [%s]",
- lpcc.LPCC_TYPE_NONE, pcc_state.ps_type)
- return -1
-
- command = ("lfs pcc_fetch -r -a %s %s" % (lpcc_group_id, lustre_fpath))
- retval = lpcc_host.sh_run(log, command)
- if retval.cr_exit_status:
- log.cl_error("failed to run command [%s] on host [%s], "
- "ret = [%d], stdout = [%s], stderr = [%s]",
- command,
- lpcc_host.sh_hostname,
- retval.cr_exit_status,
- retval.cr_stdout,
- retval.cr_stderr)
- return -1
-
- pcc_state = lpcc.lfs_pcc_state(log, lustre_fpath, host=lpcc_host)
- if pcc_state.ps_type != lpcc.LPCC_TYPE_READONLY:
- log.cl_error("wrong PCC type, expected [%s], got [%s]",
- lpcc.LPCC_TYPE_READONLY, pcc_state.ps_type)
-
- fid_string = lustre.lfs_path2fid(log, lpcc_host, lustre_fpath)
- if fid_string is None:
- return -1
-
- lustre_fid = lustre.LustreFID(log, fid_string)
- lpcc_fpath = lustre_fid.lf_posix_archive_path(lpcc_root)
- command = ("ls -l %s" % (lpcc_fpath))
- retval = lpcc_host.sh_run(log, command)
- if retval.cr_exit_status:
- log.cl_error("failed to run command [%s] on host [%s], "
- "ret = [%d], stdout = [%s], stderr = [%s]",
- command,
- lpcc_host.sh_hostname,
- retval.cr_exit_status,
- retval.cr_stdout,
- retval.cr_stderr)
- return -1
-
- ret = check_lpcc_data(log, lpcc_host, lpcc_fpath, lustre_fpath, file_data)
- if ret:
- log.cl_error("wrong file data after creation")
- return ret
-
- for client_id, client in MANAGER.lm_lustre_clients.iteritems():
- if client == lpcc_client:
- continue
- host = client.lc_host
- remote_lustre_fpath = ("%s/%s/%s" %
- (client.lc_mnt, lustre_dirname,
- lustre_fname))
-
- # File read without grouplock should be blocked
- multiop = lustre_test.Multiop(host, remote_lustre_fpath, "vor10",
- "/tmp/multiop.stdout",
- "/tmp/multiop.stderr")
- multiop.mop_start(log)
- ret = multiop.mop_wait_exit(log, timeout=3, quiet=True)
- if ret == 0:
- log.cl_error("file read on host [%s] is not blocked by group "
- "lock on host [%s]", host.sh_hostname,
- lpcc_host.sh_hostname)
- return -1
-
- multiop.mop_pkill(log)
- ret = multiop.mop_wait_exit(log)
- if ret:
- log.cl_error("file read on host [%s] is not canceled",
- host.sh_hostname)
- return -1
-
- # Not able to truncate file
- command = ("truncate -s 0 %s" % (lustre_fpath))
- retval = lpcc_host.sh_run(log, command)
- if retval.cr_exit_status == 0:
- log.cl_error("command [%s] succeeded on host [%s], which is unexpected",
- command, lpcc_host.sh_hostname)
- return -1
-
- command = ("truncate -s 1048576 %s" % (lustre_fpath))
- retval = lpcc_host.sh_run(log, command)
- if retval.cr_exit_status == 0:
- log.cl_error("command [%s] succeeded on host [%s], which is unexpected",
- command, lpcc_host.sh_hostname)
- return -1
-
- # Not able to write data to readonly cache
- command = ("echo -n not_written > %s" % (lustre_fpath))
- retval = lpcc_host.sh_run(log, command)
- if retval.cr_exit_status == 0:
- log.cl_error("command [%s] succeeded on host [%s], which is unexpected",
- command, lpcc_host.sh_hostname)
- return -1
-
- # Not able to append to readonly cache
- command = ("echo -n not_written >> %s" % (lustre_fpath))
- retval = lpcc_host.sh_run(log, command)
- if retval.cr_exit_status == 0:
- log.cl_error("command [%s] succeeded on host [%s], which is unexpected",
- command, lpcc_host.sh_hostname)
- return -1
-
- # Check data again in case any data is changed
- ret = check_lpcc_data(log, lpcc_host, lpcc_fpath, lustre_fpath, file_data)
- if ret:
- log.cl_error("wrong file data after truncate and write failures")
- return ret
-
- # Detch and re-attach
- command = ("lfs pcc_detach %s" % (lustre_fpath))
- retval = lpcc_host.sh_run(log, command)
- if retval.cr_exit_status:
- log.cl_error("failed to run command [%s] on host [%s], "
- "ret = [%d], stdout = [%s], stderr = [%s]",
- command,
- lpcc_host.sh_hostname,
- retval.cr_exit_status,
- retval.cr_stdout,
- retval.cr_stderr)
- return -1
-
- pcc_state = lpcc.lfs_pcc_state(log, lustre_fpath, host=lpcc_host)
- if pcc_state.ps_type != lpcc.LPCC_TYPE_NONE:
- log.cl_error("wrong PCC type after detach, expected [%s], got [%s]",
- lpcc.LPCC_TYPE_NONE, pcc_state.ps_type)
- return -1
-
- command = ("lfs pcc_fetch -r -a %s %s" % (lpcc_group_id, lustre_fpath))
- retval = lpcc_host.sh_run(log, command)
- if retval.cr_exit_status:
- log.cl_error("failed to run command [%s] on host [%s], "
- "ret = [%d], stdout = [%s], stderr = [%s]",
- command,
- lpcc_host.sh_hostname,
- retval.cr_exit_status,
- retval.cr_stdout,
- retval.cr_stderr)
- return -1
-
- pcc_state = lpcc.lfs_pcc_state(log, lustre_fpath, host=lpcc_host)
- if pcc_state.ps_type != lpcc.LPCC_TYPE_READONLY:
- log.cl_error("wrong PCC type after re-prefetch, expected [%s], got [%s]",
- lpcc.LPCC_TYPE_READONLY, pcc_state.ps_type)
- return -1
- return 0
-
-
-def test_readonly(log):
- """
- Test readonly PCC
- """
- return lpcc_ro_test(log)
-
-
-LPCC_TESTS.append(test_readonly)
-
-
-def test_lpcc_locked(log, workspace, config_fpath):
- """
- Start to run LPCC tests holding the confiure lock
- """
- # pylint: disable=global-statement,too-many-branches
- lpcc_workspace = workspace + "/lpcc"
-
- ret = utils.mkdir(lpcc_workspace)
- if ret:
- log.cl_error("failed to creat directory [%s]", lpcc_workspace)
- return ret
- global MANAGER
- MANAGER = lpcc.LPCCManager(lpcc_workspace, config_fpath)
- ret = MANAGER.lm_parse(log)
- if ret:
- return ret
-
- ret = MANAGER.lm_start(log)
- if ret:
- return ret
-
- quit_on_error = True
- only_test = None
- passed_tests = []
- failed_tests = []
- skipped_tests = []
- for lpcc_test in LPCC_TESTS:
- if only_test is not None and only_test != lpcc_test.__name__:
- continue
- log.cl_info("test [%s] started", lpcc_test.__name__)
- ret = lpcc_test(log)
- if ret < 0:
- log.cl_error("test [%s] failed", lpcc_test.__name__)
- failed_tests.append(lpcc_test)
- if quit_on_error:
- return -1
- elif ret == 1:
- log.cl_warning("test [%s] skipped", lpcc_test.__name__)
- skipped_tests.append(lpcc_test)
- else:
- log.cl_info("test [%s] passed", lpcc_test.__name__)
- passed_tests.append(lpcc_test)
-
- if len(skipped_tests) != 0:
- for skipped_test in skipped_tests:
- log.cl_warning("test [%s] skipped", skipped_test.__name__)
-
- if len(failed_tests) != 0:
- for failed_test in failed_tests:
- log.cl_error("test [%s] failed", failed_test.__name__)
-
- if len(passed_tests) != 0:
- for passed_test in passed_tests:
- log.cl_info("test [%s] passed", passed_test.__name__)
-
- daemon.SHUTTING_DOWN = True
- ret = MANAGER.lm_stop(log)
- if ret:
- log.cl_error("failed to stop lpcc manager")
-
- if len(failed_tests) != 0 or ret:
- return -1
- return 0
-
-
-def test_lpcc(log, workspace, config_fpath):
- """
- Start to run LPCC tests
- """
- # pylint: disable=bare-except
- lock_file = config_fpath + ".lock"
- lock = filelock.FileLock(lock_file)
- try:
- with lock.acquire(timeout=0):
- try:
- ret = test_lpcc_locked(log, workspace, config_fpath)
- except:
- ret = -1
- log.cl_error("exception: %s", traceback.format_exc())
- lock.release()
- except filelock.Timeout:
- ret = -1
- log.cl_error("someone else is holding lock of file [%s], aborting "
- "to prevent conflicts", lock_file)
- return ret
-
-
-def main():
- """
- Run LPCC tests
- """
- # pylint: disable=unused-variable,not-callable
- if sys.version[0] == '2':
- reload(sys)
- if hasattr(sys, "setdefaultencoding"):
- set_encoding = getattr(sys, "setdefaultencoding", None)
- set_encoding('UTF-8')
- else:
- os.environ["PYTHONIOENCODING"] = 'UTF-8'
-
- config_fpath = lpcc.LPCC_CONFIG
-
- if len(sys.argv) == 2:
- config_fpath = sys.argv[1]
- elif len(sys.argv) > 2:
- usage()
- sys.exit(-1)
-
- identity = time_util.local_strftime(time_util.utcnow(), "%Y-%m-%d-%H_%M_%S")
- workspace = LPCC_LOG_TEST_DIR + "/" + identity
-
- if not os.path.exists(LPCC_LOG_TEST_DIR):
- ret = utils.mkdir(LPCC_LOG_TEST_DIR)
- if ret:
- utils.eprint("failed to create directory [%s] on local host" % LPCC_LOG_TEST_DIR)
- sys.exit(-1)
- elif not os.path.isdir(LPCC_LOG_TEST_DIR):
- utils.eprint("[%s] is not a directory" % LPCC_LOG_TEST_DIR)
- sys.exit(-1)
-
- if not os.path.exists(workspace):
- ret = utils.mkdir(workspace)
- if ret:
- utils.eprint("failed to create directory [%s] on local host" % workspace)
- sys.exit(-1)
- elif not os.path.isdir(workspace):
- utils.eprint("[%s] is not a directory" % workspace)
- sys.exit(-1)
-
- signal.signal(signal.SIGINT, daemon.signal_handler)
- signal.signal(signal.SIGTERM, daemon.signal_handler)
- log = clog.get_log(resultsdir=workspace)
- log.cl_info("started LPCC test using config [%s], please check [%s] for "
- "more log" % (config_fpath, workspace))
-
- save_fpath = workspace + "/" + lpcc.LPCC_CONFIG_FNAME
- log.cl_debug("copying config file from [%s] to [%s]", config_fpath,
- save_fpath)
- shutil.copyfile(config_fpath, save_fpath)
- ret = test_lpcc(log, workspace, config_fpath)
- if ret:
- log.cl_error("test failed, please check [%s] for more log", workspace)
- sys.exit(ret)
- log.cl_info("all tests passed, please check [%s] for more log", workspace)
- sys.exit(0)
#define OPT_CLEAR_HASHDIR 3
#define OPT_LOG_LEVEL 4
#define OPT_MAX_SCAN_SECS 5
+#define OPT_PIDFILE 6
struct lpcc_purge_options {
char *o_cache;
int o_max_scan_secs;
char *o_dumpfile;
+ char *o_pidfile;
bool o_dry_run;
bool o_clear_hashdir;
};
"\t-t, --scan-threads=NUM scanning threads (default: %u)\n"
"\t --candidate-num=NUM, candidate number of approximate LRU (default: %d, min: %d, max: %d)\n"
"\t --max-scan-secs, max seconds to scan continously before purging (default: %d, min: %d, max: %d)\n"
- "\t-w, --dump=FILE, dump stats to FILE when signal USR1 is recieved (default: /var/run/lpcc_purge-PID.stats)\n"
+ "\t-w, --dump=FILE, dump stats to FILE when signal USR1 is recieved (default: /var/run/lpcc_purge-RWID.stats)\n"
+ "\t --pidfile=FILE, the pidfile name (default: /var/run/lpcc_purge-RWID.pid)\n"
"\t --clear-hashdir, clear empty hash dir after detaching file\n"
"\t --dry-run, scan once but do not detach file really\n"
"\t-h, --help, print this help message\n",
{ "dry-run", no_argument, NULL, OPT_DRY_RUN},
{ "candidate-num", required_argument, NULL, OPT_CANDIDATE_NUM},
{ "dump", required_argument, NULL, 'w'},
+ { "pidfile", required_argument, NULL, OPT_PIDFILE},
{ "clear-hashdir", no_argument, NULL, OPT_CLEAR_HASHDIR},
{ "max-scan-secs", required_argument, NULL, OPT_MAX_SCAN_SECS},
{ "help", no_argument, NULL, 'h' },
case 'w':
opt.o_dumpfile = strdup(optarg);
break;
+ case OPT_PIDFILE:
+ opt.o_pidfile = strdup(optarg);
+ break;
case OPT_DRY_RUN:
opt.o_dry_run = true;
break;
snprintf(buf, sizeof(buf), "/var/run/lpcc_purge-%d.stats", opt.o_rwid);
opt.o_dumpfile = strdup(buf);
}
+ if (opt.o_pidfile == NULL) {
+ snprintf(buf, sizeof(buf), "/var/run/lpcc_purge-%d.pid", opt.o_rwid);
+ opt.o_pidfile = strdup(buf);
+ }
/* check freehi > freelo */
if (opt.o_high_usage <= opt.o_low_usage) {
static void lpcc_purge_lock_pidfile(void)
{
- char buf[PATH_MAX];
int fd;
- snprintf(buf, sizeof(buf), "/var/run/lpcc_purge-%d.pid", opt.o_rwid);
- fd = create_pid_file(buf);
+ fd = create_pid_file(opt.o_pidfile);
if (fd < 0) {
llapi_error(LLAPI_MSG_FATAL, errno,
- "cannot create pidfile '%s'", buf);
+ "cannot create pidfile '%s'", opt.o_pidfile);
exit(1);
}
/* we keep the fd open to hold the flock,
llapi_printf(LLAPI_MSG_DEBUG, "detach fid: "DFID"\n", PFID(&candidate->c_fid));
- pthread_mutex_lock(&stats.s_lock);
- stats.s_purged_objs++;
- stats.s_total_purged_objs++;
- pthread_mutex_unlock(&stats.s_lock);
-
/* double confirm the atime. If it's changed, discard this entry */
rc = stat(candidate->c_path, &statbuf);
if (rc) {
[Unit]
-Description=Lustre Persistent Client Cache
+Description=Lustre Persistent Client Cache Management
+
+# Pull in network-online.target with Wants= (the dependency type recommended
+# by systemd.special(7)) and order this unit after it.
+Wants=network-online.target
+After=network-online.target
+
+ConditionPathExists=/etc/lpcc.conf
[Service]
Type=simple
-ExecStart=/usr/bin/lpcc
-User=root
+# Use absolute paths: older systemd releases (e.g. on RHEL 7) reject
+# relative executable names in ExecStart=/ExecStop=.
+ExecStart=/usr/bin/lpcc monitor
+ExecStop=/bin/kill $MAINPID
[Install]
WantedBy=multi-user.target
+
lustre_rsync.8 \
nids.5 \
plot-llstat.8 \
- routerstat.8
-
+ routerstat.8 \
+ umount.lustre.8
LIBMAN = \
lustreapi.7 \
--- /dev/null
+.\"@(#)umount.lustre.8"
+.TH UMOUNT.LUSTRE 8 "7 Jul 2021"
+.SH NAME
+umount.lustre \- unmount a Lustre File System
+.SH SYNOPSIS
+.BI "umount.lustre" " dir" " [\-fvnrldh]"
+.SH DESCRIPTION
+.BR umount.lustre
+is a part of
+.BR lustre (7)
+utilities package, which provides Lustre client functionality.
+
+.BR umount.lustre
+stops any Lustre Persistent Client Cache (LPCC) running on the Lustre file
+system to be unmounted, then performs the actual unmount.
+
+.BR umount.lustre
+is meant to be used by the
+.BR umount (8)
+command for unmounting a Lustre file system. This subcommand, however, can also
+be used as a standalone command with limited functionality.
+
+.I dir
+is the directory on which the file system is mounted.
+
+.SH OPTIONS
+.TP
+.BI "\-f"
+Force the unmount in case the Lustre file system is unreachable.
+.TP
+.BI "\-v"
+Be verbose.
+.TP
+.BI "\-n"
+Do not update
+.IR /etc/mtab .
+By default, the entry in
+.I /etc/mtab
+is removed when a file system is unmounted. Use this option to skip removing
+the entry.
+.TP
+.BI "\-r"
+In case unmounting fails, try to remount read-only.
+.TP
+.BI "\-l"
+Lazy unmount. Detach the file system from the file system hierarchy now, and
+cleanup all references to the file system as soon as it is not busy anymore.
+.TP
+.BI "\-d"
+When the unmounted device was a loop device, also free this loop device.
+.TP
+.BI "\-h"
+Print help message.
+
+.SH NOTE
+For further information please refer to the
+.BR lustre (7)
+and
+.BR umount (8)
+manual pages.
+
+.SH FILES
+.TP 18n
+.I /etc/fstab
+file system table
+.TP
+.I /etc/mtab
+table of mounted file systems
+
+.PD
+.SH "SEE ALSO"
+.BR lustre (7),
+.BR umount (8)
+
+.SH "AUTHOR"
+Lei Feng <flei@ddn.com>
SUBDIRS = systemd
-sbin_SCRIPTS = lustre_rmmod ko2iblnd-probe
+sbin_SCRIPTS = lustre_rmmod ko2iblnd-probe umount.lustre
if RHEL
initdir = $(sysconfdir)/init.d
$(addsuffix .in,$(genscripts)) lfs_migrate lustre_req_history \
lustre lsvcgss lc_common haconfig Lustre.ha_v2 dkms.mkconf \
zfsobj2fid ko2iblnd-probe statechange-lustre.sh \
- bash-completion/lustre bash-completion/lctl bash-completion/lfs
+ bash-completion/lustre bash-completion/lctl bash-completion/lfs \
+ umount.lustre
CLEANFILES = $(genscripts)
--- /dev/null
+#!/bin/bash
+#
+# umount.lustre - umount(8) helper for Lustre file systems.
+#
+# Stops LPCC for the mount point (only when /usr/bin/lpcc is executable and
+# the /var/run/lpcc.sock socket exists), then forwards the original arguments
+# unchanged to "umount --internal-only", which performs the actual unmount.
+
+set -eu
+
+
+# Print the command line synopsis on stdout.
+usage()
+{
+	echo "usage: umount.lustre dir [-fvnrldh]"
+	echo "options:"
+	echo -e "\t-f\tforce umount"
+	echo -e "\t-v\tverbose"
+	echo -e "\t-n\tDo not update /etc/mtab"
+	echo -e "\t-r\tremount"
+	echo -e "\t-l\tlazy umount"
+	echo -e "\t-d\tfree loop device"
+	echo -e "\t-h\tprint this help"
+}
+
+# Keep the untouched argument list; it is forwarded verbatim to umount(8).
+declare -a args=("$@")
+
+# Run getopt as the condition of "if": with a plain assignment, "set -e"
+# would abort the script before the usage message could be printed.
+if ! temp=$(getopt -n umount.lustre -o fvnrldh -- "$@"); then
+	usage >&2
+	exit 1
+fi
+
+eval set -- "$temp"
+
+mount_point=""
+while true; do
+	case "$1" in
+	-h)
+		usage
+		exit 0
+		;;
+	-f|-v|-n|-r|-l|-d)
+		# Only validated here; umount(8) receives them through args.
+		shift
+		;;
+	--)
+		shift
+		# "${1:-}" avoids an "unbound variable" abort under "set -u"
+		# when no directory operand was given.
+		mount_point="${1:-}"
+		break
+		;;
+	*)
+		usage >&2
+		exit 1
+		;;
+	esac
+done
+
+# A mount point is mandatory: both lpcc and umount need it.
+if [[ -z "$mount_point" ]]; then
+	usage >&2
+	exit 1
+fi
+
+# Because of "set -e", a failing "lpcc stop" aborts before umount is tried.
+if [[ -x /usr/bin/lpcc ]] && [[ -S /var/run/lpcc.sock ]]; then
+	/usr/bin/lpcc stop "$mount_point" --keep-enabled > /dev/null
+fi
+
+# --internal-only keeps umount(8) from calling this helper again.
+umount --internal-only "${args[@]}"
+
"R" 20 ||
error "$i: GC-thread not found in R-state"
# check umounts of each MDT on MDS have reached kthread_stop()
- [[ $(do_node $i pgrep umount | wc -l) -eq $nb ]] ||
+ [[ $(do_node $i pgrep umount.lustre | wc -l) -eq $nb ]] ||
error "$i: expected $nb umount"
wait_update $i \
- "ps -C umount -o state --no-headers | uniq" "D" 20 ||
+ "ps -C umount -o state --no-headers | grep D | wc -l" "$nb" 20 ||
error "$i: umount not found in D-state"
done