From: Lei Feng Date: Tue, 27 Jul 2021 07:37:11 +0000 (+0800) Subject: EX-3209 lipe: add lpcc util and service X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=6d62073950ac82f3e5e1361d2eb8a8ac6f19ed02;p=fs%2Flustre-release.git EX-3209 lipe: add lpcc util and service Create lpcc daemon/cli and systemd serivce to manage all PCC devices and services. Create umount.lustre to hook the umounting and stop PCC in advance. Remove unused lpcc_test and lpcc_cleanup. Fix stats mistake for purge_objs. Add --pidfile for lpcc_purge. Change-Id: I941d07b61906e4d5ebee13dab2a8015e43ecf676 Signed-off-by: Lei Feng Test-Parameters: trivial Reviewed-on: https://review.whamcloud.com/44103 Tested-by: jenkins Tested-by: Maloo Reviewed-by: John L. Hammond Reviewed-by: Li Xi --- diff --git a/lipe/Makefile.am b/lipe/Makefile.am index 2a56277..81f257b 100644 --- a/lipe/Makefile.am +++ b/lipe/Makefile.am @@ -48,8 +48,6 @@ PYTHON_COMMANDS = \ loris_crontab \ loris_test \ lpcc \ - lpcc_cleanup \ - lpcc_test \ pyltest_import_check EXTRA_DIST= \ @@ -77,7 +75,6 @@ EXTRA_DIST= \ pylipe/.pylintrc \ pylipe/*.py \ pyloris/*.py \ - pylpcc/*.py \ pylustre/*.py \ pyltest/*.py \ scripts/*.sh \ @@ -87,7 +84,7 @@ EXTRA_DIST= \ .pylintrc PYLTEST_FILES = $(wildcard pyltest/*.py) -PYTHON_LIB_FILES = $(wildcard pyclownfish/*.py pylustre/*.py pyloris/*.py pylhsm/*.py pylpcc/*.py) +PYTHON_LIB_FILES = $(wildcard pyclownfish/*.py pylustre/*.py pyloris/*.py pylhsm/*.py) PYTHON_LIB_FILES += $(PYLTEST_FILES) PYTHON_FILES = $(PYTHON_LIB_FILES) $(PYTHON_COMMANDS) PYTHON_CHECKS = $(PYTHON_FILES:%=%.python_checked) diff --git a/lipe/lipe.spec.in b/lipe/lipe.spec.in index 9e1114d..5792820 100644 --- a/lipe/lipe.spec.in +++ b/lipe/lipe.spec.in @@ -223,10 +223,9 @@ python2 -m py_compile pylustre/*.py python2 -m py_compile pylhsm/*.py python2 -m py_compile pylipe/*.py python2 -m py_compile pyloris/*.py -python2 -m py_compile pylpcc/*.py python2 -m py_compile pyltest/*.py -find pyclownfish pylustre pylhsm pylipe pyloris 
pylpcc pyltest -maxdepth 1 -type f -a -name "*.python_checked" -o -name "*.py" | xargs rm -f +find pyclownfish pylustre pylhsm pylipe pyloris pyltest -maxdepth 1 -type f -a -name "*.python_checked" -o -name "*.py" | xargs rm -f %install rm -rf $RPM_BUILD_ROOT @@ -236,6 +235,7 @@ mkdir -p $RPM_BUILD_ROOT%{_libdir} mkdir -p $RPM_BUILD_ROOT%{python2_sitelib} mkdir -p $RPM_BUILD_ROOT%{_mandir}/man1 mkdir -p $RPM_BUILD_ROOT%{_mandir}/man5 +mkdir -p $RPM_BUILD_ROOT%{_mandir}/man8 mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/yum.repos.d cp \ ldsync \ @@ -255,8 +255,7 @@ cp \ loris_crontab \ loris_test \ lpcc \ - lpcc_cleanup \ - lpcc_test \ + src/lpcc_purge \ src/ext4_inode2path \ src/lcreatemany \ src/ldumpstripe \ @@ -283,7 +282,6 @@ cp -a pyclownfish $RPM_BUILD_ROOT%{python2_sitelib} cp -a pylhsm $RPM_BUILD_ROOT%{python2_sitelib} cp -a pylipe $RPM_BUILD_ROOT%{python2_sitelib} cp -a pyloris $RPM_BUILD_ROOT%{python2_sitelib} -cp -a pylpcc $RPM_BUILD_ROOT%{python2_sitelib} cp -a pylustre $RPM_BUILD_ROOT%{python2_sitelib} cp -a pyltest $RPM_BUILD_ROOT%{python2_sitelib} mkdir -p $RPM_BUILD_ROOT%{_sysconfdir} @@ -298,6 +296,7 @@ cp -a \ example_configs/clownfish/seperate_mgs/lipe_virt.conf \ lpcc.conf \ $RPM_BUILD_ROOT%{_sysconfdir} + %if %{with laudit} mkdir -p $RPM_BUILD_ROOT%{_sysconfdir}/laudit cp -a laudit.conf.example $RPM_BUILD_ROOT%{_sysconfdir}/laudit @@ -332,6 +331,11 @@ cp -a example_configs/hotpool/* $RPM_BUILD_ROOT%{_sysconfdir}/ install -m 0644 man/lipe_scan.1 $RPM_BUILD_ROOT%{_mandir}/man1/ install -m 0644 man/lipe_find.1 $RPM_BUILD_ROOT%{_mandir}/man1/ install -m 0644 man/lfill.1 $RPM_BUILD_ROOT%{_mandir}/man1/ +install -m 0644 man/lpcc.8 $RPM_BUILD_ROOT%{_mandir}/man8/ +install -m 0644 man/lpcc-start.8 $RPM_BUILD_ROOT%{_mandir}/man8/ +install -m 0644 man/lpcc-stop.8 $RPM_BUILD_ROOT%{_mandir}/man8/ +install -m 0644 man/lpcc-status.8 $RPM_BUILD_ROOT%{_mandir}/man8/ +install -m 0644 man/lpcc.conf.5 $RPM_BUILD_ROOT%{_mandir}/man5/ %if %{with laudit} install -m 0644 
man/laudit.1 $RPM_BUILD_ROOT%{_mandir}/man1/ install -m 0644 man/laudit-report.1 $RPM_BUILD_ROOT%{_mandir}/man1/ @@ -374,16 +378,21 @@ rm -rf $RPM_BUILD_ROOT %files lpcc %defattr(-,root,root) -%{python2_sitelib}/pylpcc %{_bindir}/lpcc -%{_bindir}/lpcc_cleanup -%{_bindir}/lpcc_test +%{_bindir}/lpcc_purge %config(noreplace) %{_sysconfdir}/lpcc.conf %if %{with systemd} %{_unitdir}/lpcc.service %else %{_sysconfdir}/rc.d/init.d/lpcc %endif +%{_mandir}/man8/lpcc.8* +%{_mandir}/man8/lpcc-start.8* +%{_mandir}/man8/lpcc-stop.8* +%{_mandir}/man8/lpcc-status.8* +%{_mandir}/man5/lpcc.conf.5* + + %files hsm %defattr(-,root,root) diff --git a/lipe/lpcc b/lipe/lpcc index 73922cc..d573a0b 100755 --- a/lipe/lpcc +++ b/lipe/lpcc @@ -1,11 +1,591 @@ -#!/usr/bin/python2 -u -# Copyright (c) 2017 DataDirect Networks, Inc. +#!/usr/bin/env python3 +# Copyright (c) 2021 DataDirect Networks, Inc. # All Rights Reserved. -# Author: lixi@ddn.com +# Author: flei@ddn.com + """ -LPCC(Lustre Persistent Client Cache) +Manage all PCC devices and services """ -from pylpcc import lpcc + +import argparse +import errno +import json +import os +import select +import signal +import socket +import subprocess +import sys +import time +import yaml + + +LISTEN_SOCK_FN = "/var/run/lpcc.sock" +LISTEN_SOCK = None + +def eprint(*args, **kwargs): + """print something to stderr""" + print(*args, file=sys.stderr, **kwargs) + +class LpccService: + """ + Class to manage single instance of lpcc + """ + copytool_prog = 'lhsmtool_posix' + lpcc_purge_prog = 'lpcc_purge' + + lpcc_config = None + + lpcc_mount = None + lpcc_cache = None + lpcc_roid = None + lpcc_autocache = None + + lpcc_purge_high_usage = 90 + lpcc_purge_low_usage = 75 + lpcc_purge_interval = 5 + lpcc_purge_scan_threads = 1 + + def __init__(self, lpcc_config): + self.lpcc_config = lpcc_config + self.lpcc_mount = lpcc_config['mount'] + self.lpcc_cache = lpcc_config['cache'] + self.lpcc_autocache = lpcc_config['autocache'] + self.lpcc_roid = 
lpcc_config['roid'] + + lpcc_purge_obj = lpcc_config.get('purge') + if lpcc_purge_obj is not None: + self.lpcc_purge_high_usage = lpcc_purge_obj.get('high_usage', 90) + self.lpcc_purge_low_usage = lpcc_purge_obj.get('low_usage', 75) + self.lpcc_purge_interval = lpcc_purge_obj.get('interval', 30) + self.lpcc_purge_scan_threads = lpcc_purge_obj.get('scan_threads', 1) + + @staticmethod + def _check_process_by_pidfile(procname, pidfile): + """ + Check existence of process with the pid in pidfile. + If pidfile does not exist or is not valid, return False. + """ + cmdline = ['pkill', '--signal', '0', '--pidfile', pidfile, '--', procname] + eprint(cmdline) + cproc = subprocess.run(cmdline, check=False, stderr=subprocess.DEVNULL) + if cproc.returncode != 0: + return False + + return True + + @staticmethod + def _wait_process_by_pidfile(procname, pidfile, secs=5): + """ + Wait for at most secs seconds for the existence of pid in pidfile + """ + for i in range(secs): + if LpccService._check_process_by_pidfile(procname, pidfile): + return True + else: + time.sleep(1) + + return False + + @staticmethod + def _kill_process_by_pidfile(procname, pidfile): + """ + Kill a process with given pid in pidfile + """ + cmdline = ['pkill', '--pidfile', pidfile, '--', procname] + eprint(cmdline) + cproc = subprocess.run(cmdline, check=False) + return cproc.returncode + + def _add_pcc(self): + eprint("Adding PCC...") + + param = '%s roid=%d ropcc=1' % (self.lpcc_autocache, self.lpcc_roid) + cmdline = ['lctl', 'pcc', 'add', self.lpcc_mount, self.lpcc_cache, \ + '--param', param] + eprint(cmdline) + cproc = subprocess.run(cmdline, check=False) + return cproc.returncode + + def _del_pcc(self): + eprint("Deleting PCC...") + cmdline = ['lctl', 'pcc', 'del', self.lpcc_mount, self.lpcc_cache] + eprint(cmdline) + cproc = subprocess.run(cmdline, check=False) + return cproc.returncode + + def _start_lpcc_purge(self): + eprint("Starting lpcc_purge...") + + pidfile = '/var/run/lpcc_purge-%d.pid' % 
self.lpcc_roid + cmdline = [self.lpcc_purge_prog, \ + '--mount', self.lpcc_mount, \ + '--cache', self.lpcc_cache, \ + '--roid', str(self.lpcc_roid), \ + '--high-usage', str(self.lpcc_purge_high_usage), \ + '--low-usage', str(self.lpcc_purge_low_usage), \ + '--interval', str(self.lpcc_purge_interval), \ + '--scan-threads', str(self.lpcc_purge_scan_threads), \ + '--pidfile', pidfile] + + eprint(cmdline) + subprocess.Popen(cmdline) + + succ = LpccService._wait_process_by_pidfile(self.lpcc_purge_prog, pidfile) + if not succ: + eprint("lpcc_purge did not start successfully!") + return 1 + + return 0 + + def _stop_lpcc_purge(self): + eprint("Stopping lpcc_purge...") + pidfile = '/var/run/lpcc_purge-%d.pid' % self.lpcc_roid + self._kill_process_by_pidfile(self.lpcc_purge_prog, pidfile) + + def _dump_config(self): + eprint("========== Config ==========") + yaml.safe_dump(self.lpcc_config, sys.stdout, default_flow_style=False) + eprint("============================") + + def start(self): + """ + Start a PCC device and related services + """ + eprint("Start PCC...") + self._dump_config() + + retcode = self._add_pcc() + if retcode != 0: + return retcode + + retcode = self._start_lpcc_purge() + if retcode != 0: + self._del_pcc() + return retcode + + eprint("Done") + eprint() + return 0 + + def stop(self): + """ + Start a PCC device and related services + """ + eprint("Stop PCC...") + self._dump_config() + + self._stop_lpcc_purge() + self._del_pcc() + + eprint("Done") + eprint() + return 0 + + def status(self): + """ + Get the status of PCC and service + """ + result = {} + result['mount'] = self.lpcc_mount + result['cache'] = self.lpcc_cache + + cmdline = ['lctl', 'pcc', 'list', self.lpcc_mount] + try: + output = subprocess.check_output(cmdline) + except subprocess.CalledProcessError as err: + result['status'] = "error" + result['error_msg'] = os.strerror(err.returncode) + return result + + result['status'] = "stopped" + pcclist = yaml.load(output) + if pcclist is not None and 
'pcc' in pcclist: + for pcc in pcclist['pcc']: + if pcc['pccpath'] == self.lpcc_cache: + result['status'] = "running" + result['roid'] = pcc['roid'] + result['autocache'] = pcc['autocache'] + break + + if result['status'] != "running": + return result + + # Now check lpcc_purge process + pidfile = '/var/run/lpcc_purge-%d.pid' % self.lpcc_roid + succ = LpccService._check_process_by_pidfile(self.lpcc_purge_prog, pidfile) + if succ: + result['purge'] = "running" + else: + result['purge'] = "stopped" + result['error_msg'] = "lpcc_purge is not running!" + + return result + + def is_running(self): + """ + Check the status of PCC, return True if PCC is started, or False + """ + pcc_status = self.status() + if pcc_status.get('status') == "running": + return True + + return False + + def is_stopped(self): + """ + Check the status of PCC, return True if PCC is started, or False + """ + pcc_status = self.status() + if pcc_status.get('status') == "stopped": + return True + + return False + + +class LpccMonitor: + """ + Class to monitor mounted fs and start pcc if it's configurated + """ + + config_obj = None + + def __init__(self, config_file): + try: + with open(config_file, "r") as file_handle: + self.config_obj = yaml.safe_load(file_handle) + # if config_obj is None, it means the config file is empty but still valid + if self.config_obj is None: + eprint("Config file '%s' is empty, the service won't do any real work!" % \ + config_file) + self.config_obj = [] + except FileNotFoundError: + # if config file does not exist, it's the same as an empty config file + eprint("Config file '%s' does not exist, the service won't do any real work!" 
% \ + config_file) + self.config_obj = [] + else: + if not self._check_config(): + # None means invalid config file or information + self.config_obj = None + + def _check_config(self): + if not isinstance(self.config_obj, list): + eprint("Config information is not valid!") + return False + return True + + def _scan_start_pcc(self): + for lpcc_config in self.config_obj: + lpcc_service = LpccService(lpcc_config) + + if bool(lpcc_config.get('disabled')): + continue + if not os.path.ismount(lpcc_config['mount']): + continue + if lpcc_service.is_stopped(): + lpcc_service.start() + + return 0 + + def _start_pcc(self, request): + count = 0 + response = {} + mount = request.get('mount') + cache = request.get('cache') + + for lpcc_config in self.config_obj: + if mount is not None and mount != lpcc_config['mount']: + continue + if cache is not None and cache != lpcc_config['cache']: + continue + + count = count + 1 + lpcc_service = LpccService(lpcc_config) + if 'disabled' in lpcc_config: + del lpcc_config['disabled'] + if lpcc_service.is_stopped(): + lpcc_service.start() + + if count == 0 and mount is not None: + response['retcode'] = errno.ENOENT + response['error_msg'] = "No matched configuration for mount='%s' cache='%s'" \ + % (mount, cache) + else: + response['retcode'] = 0 + + response['count'] = count + return response + + def _stop_pcc(self, request): + count = 0 + response = {} + mount = request.get('mount') + cache = request.get('cache') + + for lpcc_config in self.config_obj: + if mount is not None and mount != lpcc_config['mount']: + continue + if cache is not None and cache != lpcc_config['cache']: + continue + + count = count + 1 + lpcc_service = LpccService(lpcc_config) + if not lpcc_service.is_stopped(): + lpcc_service.stop() + if not bool(request.get('keep-enabled')): + lpcc_config['disabled'] = True + + if count == 0 and mount is not None: + response['retcode'] = errno.ENOENT + response['error_msg'] = "No matched configuration for mount='%s' cache='%s'" \ + 
% (mount, cache) + else: + response['retcode'] = 0 + + response['count'] = count + return response + + def _stop_all_pcc(self): + request = {} + request['action'] = 'stop-all' + return self._stop_pcc(request) + + def _status_pcc(self, request): + response = {} + mount = request.get('mount') + cache = request.get('cache') + + status_list = [] + for lpcc_config in self.config_obj: + if mount is not None and mount != lpcc_config['mount']: + continue + if cache is not None and cache != lpcc_config['cache']: + continue + + lpcc_service = LpccService(lpcc_config) + lpcc_status = lpcc_service.status() + if bool(lpcc_config.get('disabled')): + lpcc_status['disabled'] = True + status_list.append(lpcc_status) + + response['retcode'] = 0 + response['status_list'] = status_list + return response + + def _process_cmd(self, request): + response = {} + + if request['action'] == "start" or request['action'] == "start-all": + response = self._start_pcc(request) + elif request['action'] == "stop" or request['action'] == "stop-all": + response = self._stop_pcc(request) + elif request['action'] == 'status' or request['action'] == 'status-all': + response = self._status_pcc(request) + else: + response['retcode'] = -1 + + response['request'] = request + return response + + def _serve_cmd(self): + try: + conn, _ = LISTEN_SOCK.accept() + request_str = conn.makefile().readline() + request = json.loads(request_str) + except Exception as ex: + eprint(ex) + + eprint("Request:", request) + response = self._process_cmd(request) + eprint("Response:", response) + + try: + conn.send(bytes(json.dumps(response), encoding='utf-8')) + conn.close() + except Exception as ex: + eprint(ex) + + def run(self): + """ + Start monitor daemon, scan and start PCC in config file, + monitor /proc/self/mounts and listen on command socket + """ + mounts_fh = open("/proc/self/mounts", "r") + + if os.path.exists(LISTEN_SOCK_FN): + os.unlink(LISTEN_SOCK_FN) + + global LISTEN_SOCK + LISTEN_SOCK = 
socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + LISTEN_SOCK.bind(LISTEN_SOCK_FN) + LISTEN_SOCK.listen(1) + + self._scan_start_pcc() + + while True: + try: + rset, _, eset = select.select([LISTEN_SOCK], [], [mounts_fh]) + except OSError: + break + except ValueError: + break + + if LISTEN_SOCK in rset: + self._serve_cmd() + if mounts_fh in eset: + self._scan_start_pcc() + + eprint("Do cleaning...") + self._stop_all_pcc() + mounts_fh.close() + LISTEN_SOCK.close() + os.unlink(LISTEN_SOCK_FN) + + return 0 + + +class LpccCli: + """ + Class to get command from cli, communicate with monitor, + and show result + """ + + def __init__(self): + pass + + def run_cmd(self, cmd): + """ + Communicate with server and run a sub command + """ + try: + sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + sock.connect(LISTEN_SOCK_FN) + except FileNotFoundError: + eprint("Socket file '%s' does not exist, " % LISTEN_SOCK_FN + \ + "please check whether the monitor service started!") + sys.exit(1) + + sock.sendall(bytes(json.dumps(cmd), encoding='utf-8')) + sock.shutdown(socket.SHUT_WR) + response = sock.makefile().readline() + sock.close() + + return json.loads(response) + + +def sigint_handler(signum, frame): + """ + SIGINT handler + """ + #pylint: disable=unused-argument + # close the listen socket to notify the monitor service to exit + eprint("Received signal %s" % signal.Signals(signum).name) + LISTEN_SOCK.close() + +def main(): + """ + main function + """ + parser = argparse.ArgumentParser() + parser.add_argument('--config-file', default='/etc/lpcc.conf', + help='specify the config file') + subparsers = parser.add_subparsers(dest='action') + subparsers.add_parser('monitor', help='start the monitor process') + + start_parser = subparsers.add_parser('start', help=\ + 'start one LPCC of specfied lustre file system and cache dir,' +\ + 'or all LPCCs based on specified lustre file system') + start_parser.add_argument('mount', nargs=1, help=\ + 'the mount point of lustre file 
system') + start_parser.add_argument('cache', nargs='?', help=\ + 'the cache dir of LPCC') + + stop_parser = subparsers.add_parser('stop', help=\ + 'stop one LPCC of specfied lustre file system and cache dir, ' +\ + 'or all LPCCs based on specified lustre file system') + stop_parser.add_argument('mount', nargs=1, help=\ + 'the mount point of lustre file system') + stop_parser.add_argument('cache', nargs='?', help=\ + 'the cache dir of LPCC') + stop_parser.add_argument('--keep-enabled', action='store_true', help=\ + 'keep the LPCC enabled, which means when the lustre file system is ' +\ + 'mounted again, start all LPCCs based on it') + + status_parser = subparsers.add_parser('status', help=\ + 'get the status of one LPCC of specfied lustre file system and ' +\ + 'cache dir, or all LPCCs based on specified lustre file system') + status_parser.add_argument('mount', nargs=1, help=\ + 'the mount point of lustre file system') + status_parser.add_argument('cache', nargs='?', help=\ + 'the cache dir of LPCC') + + subparsers.add_parser('start-all', help='start all LPCCs') + subparsers.add_parser('stop-all', help='stop all LPCCs') + subparsers.add_parser('status-all', help='get the status of all LPCCs') + + args = parser.parse_args() + + if args.action == 'monitor': + signal.signal(signal.SIGINT, sigint_handler) + signal.signal(signal.SIGTERM, sigint_handler) + + monitor = LpccMonitor(args.config_file) + if monitor.config_obj is None: + return 1 + + try: + retcode = monitor.run() + finally: + if os.path.exists(LISTEN_SOCK_FN): + os.unlink(LISTEN_SOCK_FN) + return retcode + + if args.action == 'start' or args.action == 'stop': + request = {} + request['action'] = args.action + request['mount'] = args.mount[0] + request['cache'] = args.cache + if getattr(args, 'keep_enabled', False): + request['keep-enabled'] = True + + response = LpccCli().run_cmd(request) + + print(response) + return response['retcode'] + + if args.action == 'start-all' or args.action == 'stop-all': + 
request = {} + request['action'] = args.action + + response = LpccCli().run_cmd(request) + + print(response) + return response['retcode'] + + if args.action == 'status': + request = {} + request['action'] = args.action + request['mount'] = args.mount[0] + request['cache'] = args.cache + + response = LpccCli().run_cmd(request) + + print(json.dumps(response['status_list'], indent=4)) + return response['retcode'] + + if args.action == 'status-all': + request = {} + request['action'] = args.action + response = LpccCli().run_cmd(request) + + print(json.dumps(response['status_list'], indent=4)) + return response['retcode'] + + eprint("Type 'lpcc -h' for more information.") + return 1 + if __name__ == "__main__": - lpcc.main() + ret = main() + sys.exit(ret) diff --git a/lipe/lpcc.conf b/lipe/lpcc.conf index 44e90e5..3a19f93 100644 --- a/lipe/lpcc.conf +++ b/lipe/lpcc.conf @@ -1,75 +1,14 @@ -# Configuration file of Lustre Persistent Client Cache Management -# -# Configuration Guide: -# -# $fsname: -# File system name of Lustre -# -# $ssh_hosts: -# $ssh_hosts includes the informations of logining to the server hosts using -# SSH connections. $host_id is the unique ID of the host. Two hosts shouldn't -# share a same $host_id. $hostname is the host name to use when connecting to -# the host using SSH. $host_id and $hostname could be different, because there -# could multiple ways to connect to the same host. $ssh_identity_file is the -# SSH key file used when connecting to the host. $ssh_identity_file could be -# omitted if the default SSH identity file works. -# -# $mds_hosts: -# $mds_hosts includes all the hosts that could be running MDT of this file -# system. Multiple hosts can be configured to support failover. -# "lctl set_param" commands will be run on the MDT to configure the system -# properly for HSM. -# -# $lpcc_readwrite_datasets: -# $lpcc_readwrite_datasets includes all the clients that needs to enable -# readwrite LPCC. $host_id is the host with this client. 
$archive_id is the -# HSM archive ID reserved for this client. $lpcc_root is the path of the LPCC -# root directory, usually a mounted local file system on SSD. -# $lustre_mount_point is the Lustre client mount point. $client_id is a unique -# ID of the LPCC client. Two LPCC clients shouldn't share a same $client_id. -# -fsname: 969362ae # File system name of Lustre -ssh_hosts: # Array of hosts - - host_id: server17-el7-vm1 # ID of this SSH host - hostname: server17-el7-vm1 # The host name - ssh_identity_file: /root/.ssh/id_dsa # The SSH key to connect to the host - - host_id: server17-el7-vm2 - hostname: server17-el7-vm2 - ssh_identity_file: /root/.ssh/id_dsa - - host_id: server17-el7-vm3 - hostname: server17-el7-vm3 - ssh_identity_file: /root/.ssh/id_dsa -mds_hosts: # Array of hosts that could have MDTs - - host_id: server17-el7-vm1 # ID of the host running MDS -lustre_clients: - - host_id: server17-el7-vm2 # ID of the host running client - lustre_mount_point: /mnt/lustre # Lustre mount point - client_id: server17-el7-vm2 # ID of this Lustre client - - host_id: server17-el7-vm3 - lustre_mount_point: /mnt/lustre - client_id: server17-el7-vm3 -lpcc_readwrite_datasets: # Array of client with readwrite LPCC - - client_id: server17-el7-vm2 # ID of the Lustre client - archive_id: 1 # Archive number - lpcc_root: /mnt/lpcc # LPCC root path - project_id: 100 # Project ID of new files for automatic caching - dataset_id: server17-el7-vm2_lpcc_rw # ID of this LPCC dataset - - client_id: server17-el7-vm3 - archive_id: 2 - lpcc_root: /mnt/lpcc - project_id: 101 - dataset_id: server17-el7-vm3_lpcc_rw -lpcc_readonly_dataset_groups: # Array of client with readonly LPCC - - group_id: 3 # Group ID - lpcc_root: /mnt/lpcc_ro_g3 # LPCC root path - project_id: 10 # Project ID of new files for automatic caching - lustre_clients: # Array of Lustre clients to enable this dataset - - client_id: server17-el7-vm2 # ID of Lustre client - - client_id: server17-el7-vm3 - - group_id: 4 - 
lpcc_root: /mnt/lpcc_ro_g4 - lustre_mount_point: /mnt/lustre - project_id: 11 - lustre_clients: - - client_id: server17-el7-vm2 - - client_id: server17-el7-vm3 +#- mount: /mnt/lfs +# cache: /mnt/pcc +# roid: 2 +# autocache: projid={500 1000}&fname={*.h5},uid={1001} +# purge: +# high_usage: 90 +# low_usage: 75 +# scan_threads: 1 +# interval: 30 +# +#- mount: /mnt/lfs +# cache: /mnt/pcc2 +# roid: 3 +# autocache: projid={500} diff --git a/lipe/lpcc_cleanup b/lipe/lpcc_cleanup deleted file mode 100755 index c283a7a..0000000 --- a/lipe/lpcc_cleanup +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/python2 -u -# Copyright (c) 2017 DataDirect Networks, Inc. -# All Rights Reserved. -# Author: lixi@ddn.com -""" -LPCC(Lustre Persistent Client Cache) Cleanup -""" -from pylpcc import lpcc_cleanup - -if __name__ == "__main__": - lpcc_cleanup.main() diff --git a/lipe/lpcc_test b/lipe/lpcc_test deleted file mode 100755 index 34e34a9..0000000 --- a/lipe/lpcc_test +++ /dev/null @@ -1,11 +0,0 @@ -#!/usr/bin/python2 -u -# Copyright (c) 2017 DataDirect Networks, Inc. -# All Rights Reserved. -# Author: lixi@ddn.com -""" -Tests for LPCC(Lustre Persistent Cache Management) -""" -from pylpcc import lpcc_test - -if __name__ == "__main__": - lpcc_test.main() diff --git a/lipe/man/lpcc-start.8 b/lipe/man/lpcc-start.8 new file mode 100644 index 0000000..661b3e3 --- /dev/null +++ b/lipe/man/lpcc-start.8 @@ -0,0 +1,26 @@ +.\" -*- nroff -*- +.\" Copyright (c) 2021, DDN and/or its affiliates. All rights reserved. +.\" This file may be copied under the terms of the GNU Public License, v2. +.\" +.TH lpcc-start 8 "2021 Jul 7" Lustre "configuration utilities" + +.SH NAME +lpcc-start - lpcc start sub command + +.SH SYNOPSIS +.BI "lpcc start MOUNT_POINT [CACHE_DIR]" +.PP +.BI "lpcc start-all" +.PP + +.SH DESCRIPTION +Start a specific LPCC if both \fBMOUNT_POINT\fR and \fBCACHE_DIR\fR +are specified. +Start all LPCCs based on a specific Lustre file system if only +\fBMOUNT_POINT\fR is specified.
+\fBstart-all\fR sub command starts all LPCCs in the config file. +.PP +.SH "SEE ALSO" +.BR lpcc(8) +.BR lpcc-stop(8) +.BR lpcc-status(8) diff --git a/lipe/man/lpcc-status.8 b/lipe/man/lpcc-status.8 new file mode 100644 index 0000000..ac212dc --- /dev/null +++ b/lipe/man/lpcc-status.8 @@ -0,0 +1,26 @@ +.\" -*- nroff -*- +.\" Copyright (c) 2021, DDN and/or its affiliates. All rights reserved. +.\" This file may be copied under the terms of the GNU Public License, v2. +.\" +.TH lpcc-status 8 "2021 Jul 7" Lustre "configuration utilities" + +.SH NAME +lpcc-status - lpcc status sub command + +.SH SYNOPSIS +.BI "lpcc status MOUNT_POINT [CACHE_DIR]" +.PP +.BI "lpcc status-all" +.PP + +.SH DESCRIPTION +Get the status of a specific LPCC if both \fBMOUNT_POINT\fR and \fBCACHE_DIR\fR +are specified. +Get the status of all LPCCs based on a specific Lustre file system if only +\fBMOUNT_POINT\fR is specified. +\fBstatus-all\fR sub command gets the status of all LPCCs in the config file. +.PP +.SH "SEE ALSO" +.BR lpcc(8) +.BR lpcc-start(8) +.BR lpcc-stop(8) diff --git a/lipe/man/lpcc-stop.8 b/lipe/man/lpcc-stop.8 new file mode 100644 index 0000000..cb26524 --- /dev/null +++ b/lipe/man/lpcc-stop.8 @@ -0,0 +1,34 @@ +.\" -*- nroff -*- +.\" Copyright (c) 2021, DDN and/or its affiliates. All rights reserved. +.\" This file may be copied under the terms of the GNU Public License, v2. +.\" +.TH lpcc-stop 8 "2021 Jul 7" Lustre "configuration utilities" + +.SH NAME +lpcc-stop - lpcc stop sub command + +.SH SYNOPSIS +.BI "lpcc stop MOUNT_POINT [CACHE_DIR] [OPTIONS]" +.PP +.BI "lpcc stop-all" +.PP + +.SH DESCRIPTION +Stop a specific LPCC if both \fBMOUNT_POINT\fR and \fBCACHE_DIR\fR +are specified. +Stop all LPCCs based on a specific Lustre file system if only +\fBMOUNT_POINT\fR is specified. +\fBstop-all\fR sub command stops all LPCCs in the config file.
+.PP +.SH OPTIONS +.TP +.BR --keep-enabled +used by +.BR umount.lustre (8) +to notify the monitor daemon that the LPCC should be started again if the lustre +file system is mounted again. + +.SH "SEE ALSO" +.BR lpcc(8) +.BR lpcc-start(8) +.BR lpcc-status(8) diff --git a/lipe/man/lpcc.8 b/lipe/man/lpcc.8 new file mode 100644 index 0000000..c32f609 --- /dev/null +++ b/lipe/man/lpcc.8 @@ -0,0 +1,61 @@ +.\" -*- nroff -*- +.\" Copyright (c) 2021, DDN and/or its affiliates. All rights reserved. +.\" This file may be copied under the terms of the GNU Public License, v2. +.\" +.TH lpcc 8 "2021 Jul 7" Lustre "configuration utilities" + +.SH NAME +lpcc - Management tool for Lustre Persistent Client Cache (LPCC) + +.SH SYNOPSIS +.BI "lpcc -h|--help" +.PP +.BI "lpcc SUBCMD ARGS" +.PP + +.SH DESCRIPTION +To start/stop Lustre Persistent Client Cache (LPCC), there is a series of +commands to be run correctly with consistent parameters. If there are multiple +LPCCs on a client, it is even more complex. +.PP +.TP +The \fBlpcc\fR tool helps to: +.br +\(bu configure all the LPCCs in a single file +.br +\(bu start/stop LPCCs automatically when the system boots up/shuts down +.br +\(bu monitor the mounting/umounting of lustre file system and start/stop +LPCCs based on the file system +.br +\(bu start/stop a specific LPCC manually + +.PP +To use the \fBlpcc\fR tool, first prepare a configuration file. +The file is \fB/etc/lpcc.conf\fR by default. +.PP +Then start the monitor daemon. Usually this work is done by a wrapper +systemd service \fBlpcc.service\fR. +It is highly recommended to enable the service so that it is started +automatically when the system boots up. +.PP +When the monitor daemon starts, it checks all the LPCCs. If the base lustre +file system of any LPCC has been mounted, the LPCC will be started +automatically. +.PP +If a lustre file system is mounted later, the monitor daemon checks any LPCC +based on that file system and starts it.
If a lustre file system is unmounted, +\fBumount.lustre (8)\fR checks any LPCC based on that file system, and stops +it before doing the real umounting. +.PP +While the monitor daemon is running, the user can manually start/stop one specific +LPCC by \fImount_point\fR and \fIcache_dir\fR, or all LPCCs based on a +specific \fImount_point\fR. +.PP +All these LPCCs will be stopped when the monitor daemon stops. +.SH "SEE ALSO" +.BR lpcc.conf(5) +.BR lpcc-start(8) +.BR lpcc-stop(8) +.BR lpcc-status(8) +.BR lctl-pcc(8) diff --git a/lipe/man/lpcc.conf.5 b/lipe/man/lpcc.conf.5 new file mode 100644 index 0000000..8df803f --- /dev/null +++ b/lipe/man/lpcc.conf.5 @@ -0,0 +1,137 @@ +.\" -*- nroff -*- +.\" Copyright (c) 2021, DDN and/or its affiliates. All rights reserved. +.\" This file may be copied under the terms of the GNU Public License, v2. +.\" +.TH lpcc.conf 5 "2021 Jul 7" Lustre "File Formats Manual" + +.SH NAME +lpcc.conf - configuration file for lpcc systemd service + +.SH DESCRIPTION +The file \fB/etc/lpcc.conf\fR contains a list of Lustre Persistent Client Cache +(LPCC). The whole file is an array in YAML. Each element of the array +is the configuration of a LPCC. +For each LPCC, the configuration is a dictionary with these items: +.PP +.TP +.BR mount +The mount point of lustre file system to be cached +.TP +.BR cache +The dir for cached files +.TP +.BR roid +The id of LPCC. It is a positive integer and must be unique on a single client. +.TP +.BR autocache +The condition to cache files automatically. +.TP +.BR purge +More configuration for lpcc_purge daemon. Since all the sub items under it have +default values, this item is not necessary if it has no explicit sub item. +.TP +.BR purge.high_usage +If the disk usage of cache device is higher than \fBpurge.high_usage\fR, start +purging. It is 90 (means 90% disk/inode usage) by default. +.TP +.BR purge.low_usage +If the disk usage of cache device is lower than \fBpurge.low_usage\fR, stop +purging.
It is 75 (means 75% disk/inode usage) by default. +.TP +.BR purge.interval +The interval for lpcc_purge to check cache device usage, in seconds. It is 5 +seconds by default. +.TP +.BR purge.scan_threads +How many threads are used to scan cache device in parallel. It is 1 thread by +default. + +.SH AUTOCACHE CONDITION +When a file in lustre file system is opened, the autocache condition will be +checked against the file. If the condition is true, the file will be cached in +the cache device automatically. +.PP +The rule is either a single compare expression, or several compare expressions +connected with '&' or ','. Here '&' is logical operator AND, ',' is logical +operator OR. '&' has a higher priority than ','. +.PP +Each compare expression has 3 parts: attribute, operator and target. +Attribute is the attribute of file to be checked, e.g., projid, fname. +Operator is '=', '<' or '>'. Target is either a single value or a value group +(several single values separated by blank space ' '), enclosed in a pair of '{}'. +.PP +For example: +.PP +.TP +.BR projid={500} +projid is exactly 500. +.TP +.BR "projid={500 1000}" +projid is either 500 or 1000. +.TP +.BR "projid={500 1000}&fname={*.h5},uid={1001}" +Either case of: +(a) projid is 500 or 1000 and fname matches *.h5; +(b) process uid is exactly 1001. +.PP +These attributes are supported in compare expression: +.PP +.TP +.BR projid +The project id of file. It supports '=', '<' and '>' operators. '=' operator +supports single value or value group. '<' and '>' support only single value. +Each single value should be a number. +.TP +.BR fname +The base name of file. It supports only '=' operator. +Both single value and value group are supported. +Each single value should be a precise file name string, or a pattern including +wildcard '*'. +.TP +.BR uid +The uid of the process to access the file. It supports '=', '<' and '>' +operators. '=' operator supports single value or value group.
'<' and '>' +support only single value. +Each single value should be a number. +.TP +.BR gid +The gid of the process to access the file. Similar to uid. +.TP +.BR size +The size of file. It supports '=', '<' and '>' operators. '=' operator supports +single value or value group. '<' and '>' support only single value. +Each single value should be a number, or a number with unit. +The unit could be: K, M, G, T, P, E. +.TP +.BR mtime +The mtime of file. Actually it means the age, that is to say, the seconds of +(current - mtime). It supports '=', '<' and '>' operators. '=' operator supports +single value or value group. '<' and '>' support only single value. +Each single value should be a number in seconds, or a number with unit. +The unit could be: m(minute), h(hour), d(day), w(week), y(year). Here 1 year is +exactly 52 weeks. +For example, "mtime>{10m}" means the file was modified more than 10 minutes ago; +"mtime<{1h30}" means the file was modified less than 1 hour and 30 seconds ago. + +.SH EXAMPLES +.EX +# sample /etc/lpcc.conf +- mount: /mnt/lfs + cache: /mnt/pcc + roid: 2 + autocache: projid={500 1000}&fname={*.h5},uid={1001} + purge: + high_usage: 85 + low_usage: 70 + scan_threads: 3 + interval: 10 +- mount: /mnt/lfs2 + cache: /mnt/pcc2 + roid: 3 + autocache: projid={500} +.EE +.SH "SEE ALSO" +.BR lpcc(8) +.BR lpcc-start(8) +.BR lpcc-stop(8) +.BR lpcc-status(8) \ No newline at end of file diff --git a/lipe/pylpcc/__init__.py b/lipe/pylpcc/__init__.py deleted file mode 100644 index 9900853..0000000 --- a/lipe/pylpcc/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -""" -Python library for LPCC -""" -__all__ = ["lpcc", - "lpcc_cleanup", - "lpcc_test"] diff --git a/lipe/pylpcc/lpcc.py b/lipe/pylpcc/lpcc.py deleted file mode 100644 index 2f146ba..0000000 --- a/lipe/pylpcc/lpcc.py +++ /dev/null @@ -1,653 +0,0 @@ -# Copyright (c) 2017 DataDirect Networks, Inc. -# All Rights Reserved. 
-# Author: lixi@ddn.com - -""" -Library for managing LPCC(Lustre Persistent Client Cache) -""" - -import sys -import traceback -import os -import shutil -import signal -import time -import re -import filelock -import yaml - -# Local libs -from pylustre import lustre -from pylustre import time_util -from pylustre import utils -from pylustre import daemon -from pylustre import hsm -from pylustre import clog - -LPCC_CONFIG_FNAME = "lpcc.conf" -LPCC_CONFIG = "/etc/" + LPCC_CONFIG_FNAME -LPCC_LOG_DIR = "/var/log/lpcc" -LPCC_STATE_PATTERN = (r"^type: (?P\S+), " - r"PCC file: (?P\S+), " - r"user number: (?P\S+), " - r"attr cached: (?P\S+)$") -LPCC_STATE_REGULAR = re.compile(LPCC_STATE_PATTERN) -LPCC_TYPE_NONE = "none" -LPCC_TYPE_READONLY = "readonly" -LPCC_TYPE_READWRITE = "readwrite" - -STR_SSH_HOSTS = "ssh_hosts" -STR_HOSTNAME = "hostname" -STR_HOST_ID = "host_id" -STR_SSH_IDENTITY_FILE = "ssh_identity_file" -STR_MDS_HOSTS = "mds_hosts" -STR_LUSTRE_CLIENTS = "lustre_clients" -STR_LUSTRE_MOUNT_POINT = "lustre_mount_point" -STR_CLIENT_ID = "client_id" -STR_LPCC_READWRITE_DATASETS = "lpcc_readwrite_datasets" -STR_DATASET_ID = "dataset_id" -STR_LPCC_ROOT = "lpcc_root" -STR_GROUP_ID = "group_id" -STR_ROOT = "root" -STR_ARCHIVE_ID = "archive_id" -STR_PROJECT_ID = "project_id" -STR_LPCC_READONLY_DATASET_GROUPS = "lpcc_readonly_dataset_groups" - - -def usage(): - """ - Print usage string - """ - utils.oprint("Usage: %s " % sys.argv[0]) - - -class LPCCDataset(object): - """ - Each SSH host has an object of this type - """ - # pylint: disable=too-few-public-methods,too-many-arguments,too-many-instance-attributes - def __init__(self, client_name, pcc_type, root, set_id, projid, - lustre_client): - self.lpccd_lustre_client = lustre_client - self.lpccd_host = lustre_client.lc_host - self.lpccd_id = set_id - self.lpccd_projid = projid - self.lpccd_root = root - self.lpccd_client_name = client_name - self.lpccd_pcc_type = pcc_type - self.lpccd_lustre_mnt = lustre_client.lc_mnt - - def 
lpccd_stop(self, log): - """ - Delete this dataset from LPCC - """ - command = ("echo -n 'del %s' > /proc/fs/lustre/llite/%s/pcc" % - (self.lpccd_root, self.lpccd_client_name)) - retval = self.lpccd_host.sh_run(log, command) - if retval.cr_exit_status: - log.cl_error("failed to run command [%s] on host [%s], " - "ret = [%d], stdout = [%s], stderr = [%s]", - command, - self.lpccd_host.sh_hostname, - retval.cr_exit_status, - retval.cr_stdout, - retval.cr_stderr) - return -1 - return 0 - - def lpccd_start(self, log): - """ - Add this dataset to LPCC - """ - command = ("echo -n 'add %s %s %s' > /proc/fs/lustre/llite/%s/pcc" % - (self.lpccd_root, self.lpccd_id, self.lpccd_projid, - self.lpccd_client_name)) - retval = self.lpccd_host.sh_run(log, command) - if retval.cr_exit_status: - log.cl_error("failed to run command [%s] on host [%s], " - "ret = [%d], stdout = [%s], stderr = [%s]", - command, - self.lpccd_host.sh_hostname, - retval.cr_exit_status, - retval.cr_stdout, - retval.cr_stderr) - return -1 - return 0 - - -class LPCCRwDataset(LPCCDataset): - """ - Each readwrite PCC client has an object of this type - """ - # pylint: disable=too-few-public-methods,too-many-instance-attributes - # pylint: disable=too-many-arguments - def __init__(self, log, dataset_id, archive_id, project_id, lrwd_root, - parent_directory, fsname, have_raolu, mdts, client_name, - lustre_client): - super(LPCCRwDataset, self).__init__(client_name, - LPCC_TYPE_READWRITE, - lrwd_root, archive_id, - project_id, - lustre_client) - self.lrwd_workspace = parent_directory + "/" + dataset_id - self.lrwd_copytool = hsm.HSMCopytool("copytool", lustre_client.lc_host, archive_id, - lrwd_root, lustre_client.lc_mnt, - self.lrwd_workspace) - self.lrwd_parent_directory = parent_directory - self.lrwd_have_raolu = have_raolu - self.lrwd_removers = [] - if not have_raolu: - for mdt in mdts: - remover_id = "remover_" + mdt.ls_index_string - remover = hsm.HSMRemover(log, remover_id, lustre_client.lc_host, fsname, 
mdt, - lrwd_root, self.lrwd_workspace) - self.lrwd_removers.append(remover) - - def lrwd_killall(self, log): - """ - Kill all the process of this LPCC client - """ - self.lrwd_copytool.hc_killall(log) - for remover in self.lrwd_removers: - remover.hr_killall(log) - return 0 - - def lpccd_stop(self, log): - """ - Stop all the process of this LPCC client - """ - self.lrwd_killall(log) - self.lrwd_copytool.hc_thread.join() - for remover in self.lrwd_removers: - remover.hr_thread.join() - remover.hr_fini(log) - return super(LPCCRwDataset, self).lpccd_stop(log) - - def lpccd_start(self, log): - """ - Start all the process of this LPCC client - """ - ret = utils.mkdir(self.lrwd_workspace) - if ret: - log.cl_error("failed to create directory [%s] on local host", - self.lrwd_workspace) - return -1 - - self.lrwd_killall(log) - ret = super(LPCCRwDataset, self).lpccd_start(log) - if ret: - log.cl_error("failed to add dataset [%s] of mnt [%s] on host " - "[%s]", self.lpccd_root, self.lpccd_host.sh_hostname, - self.lpccd_lustre_mnt) - return -1 - - for remover in self.lrwd_removers: - ret = remover.hr_thread_start(log) - if ret: - log.cl_error("failed to start remover thread") - return -1 - ret = self.lrwd_copytool.hc_thread_start(log) - if ret: - log.cl_error("failed to start copytool thread for dataset [%s] of " - "mnt [%s] on host [%s]", self.lpccd_root, - self.lpccd_host.sh_hostname, self.lpccd_lustre_mnt) - return -1 - - return 0 - - -def find_lpcc_dataset_from_id(datasets, set_id): - """ - Find LPCC from archive ID - """ - # pylint: disable=unused-variable - for dataset in datasets: - if dataset.lpccd_id == set_id: - return dataset - return None - - -class LPCCManager(object): - """ - Each SSH host has an object of this type - """ - # pylint: disable=too-few-public-methods,too-many-arguments,too-many-instance-attributes - def __init__(self, workspace, config_fpath): - self.lm_rw_dataset_dict = {} - self.lm_rw_datasets = [] - self.lm_ro_dataset_groups = {} - 
self.lm_ro_datasets = [] - self.lm_lustre_clients = {} - self.lm_workspace = workspace - self.lm_config_fpath = config_fpath - self.lm_fsname = None - self.lm_hosts = {} - self.lm_mdt_hosts = [] - - def lm_parse(self, log): - """ - Parse the configuration - """ - # pylint: disable=bare-except,too-many-locals,too-many-return-statements - # pylint: disable=too-many-branches,too-many-statements,unused-variable - config_fd = open(self.lm_config_fpath) - ret = 0 - try: - config = yaml.load(config_fd) - except: - log.cl_error("not able to load [%s] as yaml file: %s", - self.lm_config_fpath, traceback.format_exc()) - ret = -1 - config_fd.close() - if ret: - return -1 - - fsname = config["fsname"] - host_configs = config[STR_SSH_HOSTS] - for host_config in host_configs: - hostname = host_config[STR_HOSTNAME] - host_id = host_config[STR_HOST_ID] - if STR_SSH_IDENTITY_FILE in host_config: - ssh_identity_file = host_config[STR_SSH_IDENTITY_FILE] - else: - ssh_identity_file = None - if host_id in self.lm_hosts: - log.cl_error("multiple hosts with the same ID [%s]", host_id) - return -1 - host = lustre.LustreServerHost(hostname, - identity_file=ssh_identity_file, - host_id=host_id) - self.lm_hosts[host_id] = host - - mds_configs = config[STR_MDS_HOSTS] - for mds_config in mds_configs: - mds_host_id = mds_config[STR_HOST_ID] - if mds_host_id not in self.lm_hosts: - log.cl_error("no host with ID [%s] is configured", host_id) - return -1 - host = self.lm_hosts[mds_host_id] - self.lm_mdt_hosts.append(host) - - have_raolu = True - mdts = [] - for host in self.lm_mdt_hosts: - tmp_clients = {} - tmp_osts = {} - tmp_mdts = {} - ret = host.lsh_lustre_detect_services(tmp_clients, tmp_osts, tmp_mdts) - if ret: - log.cl_error("failed to detect services on host [%s]", - host.sh_hostname) - return -1 - for mdt_index, mdt in tmp_mdts.iteritems(): - if mdt.lsi_service.ls_lustre_fs.lf_fsname != fsname: - continue - mdts.append(mdt) - ret = mdt.mdti_enable_hsm_control(log) - if ret: - return -1 
- - ret = mdt.mdti_enable_raolu(log) - if ret < 0: - return -1 - elif ret == 1: - have_raolu = False - - lustre_client_configs = config[STR_LUSTRE_CLIENTS] - for lustre_client_config in lustre_client_configs: - host_id = lustre_client_config[STR_HOST_ID] - if host_id not in self.lm_hosts: - log.cl_error("no host with ID [%s] is configured", host_id) - return -1 - host = self.lm_hosts[host_id] - - lustre_mount_point = lustre_client_config[STR_LUSTRE_MOUNT_POINT] - client_id = lustre_client_config[STR_CLIENT_ID] - if client_id in self.lm_lustre_clients: - log.cl_error("multiple Lustre client with the same ID [%s]", - client_id) - return -1 - lustre_fs = lustre.LustreFilesystem(fsname) - lustre_client = lustre.LustreClient(log, lustre_fs, host, lustre_mount_point) - self.lm_lustre_clients[client_id] = lustre_client - - if STR_LPCC_READWRITE_DATASETS in config: - lpcc_rw_dataset_configs = config[STR_LPCC_READWRITE_DATASETS] - else: - lpcc_rw_dataset_configs = [] - - for lpcc_rw_dataset_config in lpcc_rw_dataset_configs: - client_id = lpcc_rw_dataset_config[STR_CLIENT_ID] - if client_id not in self.lm_lustre_clients: - log.cl_error("no Lustre client with ID [%s] is configured", client_id) - return -1 - lustre_client = self.lm_lustre_clients[client_id] - lustre_mount_point = lustre_client.lc_mnt - - host = lustre_client.lc_host - dataset_id = lpcc_rw_dataset_config[STR_DATASET_ID] - if dataset_id in self.lm_rw_dataset_dict: - log.cl_error("multiple LPCC client with the same ID [%s]", - dataset_id) - return -1 - lpcc_root = lpcc_rw_dataset_config[STR_LPCC_ROOT] - archive_id = lpcc_rw_dataset_config[STR_ARCHIVE_ID] - lpcc_rw_dataset = find_lpcc_dataset_from_id(self.lm_rw_datasets, archive_id) - if lpcc_rw_dataset is not None: - log.cl_error("multiple LPCC client with the same archive ID [%s]", - archive_id) - return -1 - - project_id = lpcc_rw_dataset_config[STR_PROJECT_ID] - - client_name = host.lsh_getname(log, lustre_mount_point) - if client_name is None: - 
log.cl_error("failed to get client name of path [%s] on host " - "[%s]", lustre_mount_point, host.sh_hostname) - return -1 - if not client_name.startswith(fsname + "-"): - log.cl_error("client name [%s] of path [%s] on host [%s] " - "doesn't have expected fsname [%s] ", client_name, - lustre_mount_point, host.sh_hostname, fsname) - return -1 - lpcc_rw_dataset = LPCCRwDataset(log, dataset_id, archive_id, project_id, - lpcc_root, self.lm_workspace, - fsname, have_raolu, mdts, client_name, - lustre_client) - self.lm_rw_dataset_dict[dataset_id] = lpcc_rw_dataset - self.lm_rw_datasets.append(lpcc_rw_dataset) - - if STR_LPCC_READONLY_DATASET_GROUPS in config: - group_configs = config[STR_LPCC_READONLY_DATASET_GROUPS] - else: - group_configs = [] - for group_config in group_configs: - lpcc_root = group_config[STR_LPCC_ROOT] - group_id = group_config[STR_GROUP_ID] - if group_id in self.lm_ro_dataset_groups: - log.cl_error("multiple LPCC readonly group with the same group ID [%s]", - group_id) - return -1 - - project_id = group_config[STR_PROJECT_ID] - client_configs = group_config[STR_LUSTRE_CLIENTS] - group_datasets = [] - for client_config in client_configs: - client_id = client_config[STR_CLIENT_ID] - if client_id not in self.lm_lustre_clients: - log.cl_error("no Lustre client with ID [%s] is configured", client_id) - return -1 - lustre_client = self.lm_lustre_clients[client_id] - - fsname = lustre_client.lc_lustre_fs.lf_fsname - host = lustre_client.lc_host - lustre_mount_point = lustre_client.lc_mnt - client_name = host.lsh_getname(log, lustre_mount_point) - if client_name is None: - log.cl_error("failed to get client name of path [%s] on host " - "[%s]", lustre_mount_point, host.sh_hostname) - return -1 - if not client_name.startswith(fsname + "-"): - log.cl_error("client name [%s] of path [%s] on host [%s] " - "doesn't have expected fsname [%s] ", client_name, - lustre_mount_point, host.sh_hostname, fsname) - return -1 - - dataset = LPCCDataset(client_name, 
LPCC_TYPE_READONLY, - lpcc_root, group_id, project_id, - lustre_client) - self.lm_ro_datasets.append(dataset) - group_datasets.append(dataset) - self.lm_ro_dataset_groups[group_id] = group_datasets - - def lm_start(self, log): - """ - Start LPCC manager - """ - # pylint: disable=unused-variable - for host_id, host in self.lm_hosts.iteritems(): - clients = lustre.detect_lustre_clients(log, host) - for client in clients: - ret = lpcc_dataset_stop(log, host, client.lc_mnt) - if ret: - return -1 - - for dataset in self.lm_rw_datasets: - ret = dataset.lpccd_start(log) - if ret: - log.cl_error("failed to start readwrite dataset") - return -1 - - for dataset in self.lm_ro_datasets: - ret = dataset.lpccd_start(log) - if ret: - log.cl_error("failed to start readonly dataset") - return -1 - - return 0 - - def lm_stop(self, log): - """ - Stop LPCC manager - """ - for lpcc_rw_dataset in self.lm_rw_datasets: - lpcc_rw_dataset.lpccd_stop(log) - - for lpcc_rw_dataset in self.lm_ro_datasets: - lpcc_rw_dataset.lpccd_stop(log) - - -def manage_lpcc_locked(log, workspace, config_fpath): - """ - Manage LPCC clients holding the lock - """ - manager = LPCCManager(workspace, config_fpath) - ret = manager.lm_parse(log) - if ret: - return ret - - ret = manager.lm_start(log) - if ret: - return ret - - while not daemon.SHUTTING_DOWN: - time.sleep(1) - - ret = manager.lm_stop(log) - - return 0 - - -def manage_lpcc(log, workspace, config_fpath): - """ - Manage LPCC clients - """ - # pylint: disable=bare-except - lock_file = config_fpath + ".lock" - lock = filelock.FileLock(lock_file) - try: - with lock.acquire(timeout=0): - try: - ret = manage_lpcc_locked(log, workspace, config_fpath) - except: - ret = -1 - log.cl_error("exception: %s", traceback.format_exc()) - lock.release() - except filelock.Timeout: - ret = -1 - log.cl_error("someone else is holding lock of file [%s], aborting " - "to prevent conflicts", lock_file) - return ret - - -def lpcc_dataset_list(log, host, lustre_mount_point): - 
""" - List the datasets on a Lustre mount point - """ - # pylint: disable=too-many-locals - client_name = host.lsh_getname(log, lustre_mount_point) - if client_name is None: - return None - - fsname = client_name.split('-')[0] - lustre_fs = lustre.LustreFilesystem(fsname) - lustre_client = lustre.LustreClient(log, lustre_fs, host, lustre_mount_point) - - command = ("cat /proc/fs/lustre/llite/%s/pcc" % client_name) - retval = host.sh_run(log, command) - if retval.cr_exit_status: - log.cl_error("failed to run command [%s] on host [%s], " - "ret = [%d], stdout = [%s], stderr = [%s]", - command, - host.sh_hostname, - retval.cr_exit_status, - retval.cr_stdout, - retval.cr_stderr) - return None - - dataset_pattern = (r"^(?P\S+) (?P\S+) (?P\S+)$") - dataset_regular = re.compile(dataset_pattern) - datasets = [] - for line in retval.cr_stdout.splitlines(): - log.cl_debug("parsing line [%s] to get dataset", line) - match = dataset_regular.match(line) - if match: - root = match.group("root") - archive_id = match.group(STR_ARCHIVE_ID) - projid = match.group("projid") - dataset = LPCCDataset(client_name, LPCC_TYPE_NONE, root, - archive_id, projid, lustre_client) - datasets.append(dataset) - log.cl_debug("LPCC dataset [%s] configured on dir [%s] of host " - "[%s]", root, lustre_mount_point, host.sh_hostname) - else: - reason = ("failed to parse line [%s] to get dataset" % line) - log.cl_error(reason) - raise Exception(reason) - return datasets - - -def lpcc_dataset_stop(log, host, lustre_mount_point): - """ - Stop the datasets on a Lustre mount point - """ - datasets = lpcc_dataset_list(log, host, lustre_mount_point) - for dataset in datasets: - ret = dataset.lpccd_stop(log) - if ret: - return ret - return 0 - - -class PCCState(object): - """ - The HSM state - """ - # pylint: disable=too-few-public-methods - def __init__(self, pcc_type, pcc_file=None, user_number=None, attr_cached=None): - self.ps_type = pcc_type - self.ps_pcc_file = pcc_file - self.ps_user_number = user_number 
- self.ps_attr_cached = attr_cached - - -def lfs_pcc_state(log, fpath, host=None): - """ - PCC state - """ - command = ("lfs pcc_state %s" % (fpath)) - extra_string = "" - if host is None: - retval = utils.run(command) - else: - retval = host.sh_run(log, command) - extra_string = ("on host [%s]" % host.sh_hostname) - if retval.cr_exit_status != 0: - log.cl_error("failed to run command [%s]%s, " - "ret = [%d], stdout = [%s], stderr = [%s]", - command, extra_string, - retval.cr_exit_status, retval.cr_stdout, - retval.cr_stderr) - return None - - file_part = "file: %s, " % fpath - - output = retval.cr_stdout.strip() - if not output.startswith(file_part): - log.cl_error("unexpected output [%s]", output) - return None - - fpath_len = len(file_part) - output = output[fpath_len:] - - type_none = "type: none" - if output == type_none: - return PCCState("none") - - match = LPCC_STATE_REGULAR.match(output) - if not match: - log.cl_error("output [%s] doesn't mather pattern [%s]", - output, LPCC_STATE_PATTERN) - return None - - pcc_type = match.group("pcc_type") - pcc_file = match.group("pcc_file") - user_number = match.group("user_number") - attr_cached = match.group("attr_cached") - return PCCState(pcc_type, pcc_file=pcc_file, user_number=user_number, - attr_cached=attr_cached) - - -def main(): - """ - Run LPCC manager - """ - # pylint: disable=unused-variable,not-callable - if sys.version[0] == '2': - reload(sys) - if hasattr(sys, "setdefaultencoding"): - set_encoding = getattr(sys, "setdefaultencoding", None) - set_encoding('UTF-8') - else: - os.environ["PYTHONIOENCODING"] = 'UTF-8' - config_fpath = LPCC_CONFIG - - if len(sys.argv) == 2: - config_fpath = sys.argv[1] - elif len(sys.argv) > 2: - usage() - sys.exit(-1) - - identity = time_util.local_strftime(time_util.utcnow(), "%Y-%m-%d-%H_%M_%S") - workspace = LPCC_LOG_DIR + "/" + identity - - if not os.path.exists(LPCC_LOG_DIR): - ret = utils.mkdir(LPCC_LOG_DIR) - if ret: - utils.eprint("failed to create directory [%s]" % 
LPCC_LOG_DIR) - sys.exit(-1) - elif not os.path.isdir(LPCC_LOG_DIR): - utils.eprint("[%s] is not a directory" % LPCC_LOG_DIR) - sys.exit(-1) - - if not os.path.exists(workspace): - ret = utils.mkdir(workspace) - if ret: - utils.eprint("failed to create directory [%s]" % workspace) - sys.exit(-1) - elif not os.path.isdir(workspace): - utils.eprint("[%s] is not a directory" % workspace) - sys.exit(-1) - - log = clog.get_log(resultsdir=workspace) - log.cl_info("started LPCC manager using config [%s], please check [%s] for " - "more log" % (config_fpath, workspace)) - signal.signal(signal.SIGINT, daemon.signal_handler) - signal.signal(signal.SIGTERM, daemon.signal_handler) - - save_fpath = workspace + "/" + LPCC_CONFIG_FNAME - log.cl_debug("copying config file from [%s] to [%s]", config_fpath, - save_fpath) - shutil.copyfile(config_fpath, save_fpath) - ret = manage_lpcc(log, workspace, config_fpath) - sys.exit(ret) diff --git a/lipe/pylpcc/lpcc_cleanup.py b/lipe/pylpcc/lpcc_cleanup.py deleted file mode 100644 index 42c30ef..0000000 --- a/lipe/pylpcc/lpcc_cleanup.py +++ /dev/null @@ -1,117 +0,0 @@ -# Copyright (c) 2017 DataDirect Networks, Inc. -# All Rights Reserved. 
-# Author: lixi@ddn.com - -""" -Library for cleanup LPCC(Lustre Persistent Client Cache) storage -""" - -import sys -import os -import getopt - -# Local libs -from pylustre import utils -from pylustre import time_util -from pylustre import clog -from pylustre import hsm_check - -LPCC_CLEANUP_LOG_DIR = "/var/log/lpcc_cleanup" - - -def usage(): - """ - Print usage string - """ - utils.oprint("Usage: %s <--pcc_root pcc_root> <--lustre_mnt lustre_mnt> " % - sys.argv[0]) - - -def lfs_pcc_detach_fid(log, lustre_mnt, fid): - """ - Transfer FID to fpath - """ - command = ("lfs pcc_detach_fid %s %s" % (lustre_mnt, fid)) - retval = utils.run(command) - if retval.cr_exit_status != 0: - log.cl_error("failed to run command [%s], " - "ret = [%d], stdout = [%s], stderr = [%s]", - command, - retval.cr_exit_status, retval.cr_stdout, - retval.cr_stderr) - return None - - return retval.cr_exit_status - - -def pcc_fid_detach(log, lustre_mnt, fid_name): - """ - Detach FID exists from PCC - """ - ret = lfs_pcc_detach_fid(log, lustre_mnt, fid_name) - if ret: - log.cl_error("failed to detach FID [%s] on Lustre file system [%s] from PCC", - fid_name, lustre_mnt) - return -1 - return 0 - - -def main(): - """ - Cleanup LPCC - """ - # pylint: disable=unused-variable,not-callable - if sys.version[0] == '2': - reload(sys) - if hasattr(sys, "setdefaultencoding"): - set_encoding = getattr(sys, "setdefaultencoding", None) - set_encoding('UTF-8') - else: - os.environ["PYTHONIOENCODING"] = 'UTF-8' - - options, remainder = getopt.getopt(sys.argv[1:], - "h", - ["help", - "pcc_root=", - "lustre_mnt="]) - - pcc_root = None - lustre_mnt = None - for opt, arg in options: - if opt == "--pcc_root": - pcc_root = arg.rstrip('/') - elif opt == "--lustre_mnt": - lustre_mnt = arg - elif opt == '-h' or opt == "--help": - usage() - sys.exit(0) - if pcc_root is None or lustre_mnt is None: - usage() - sys.exit(-1) - - identity = time_util.local_strftime(time_util.utcnow(), "%Y-%m-%d-%H_%M_%S") - workspace = 
LPCC_CLEANUP_LOG_DIR + "/" + identity - - if not os.path.exists(LPCC_CLEANUP_LOG_DIR): - ret = utils.mkdir(LPCC_CLEANUP_LOG_DIR) - if ret: - sys.stderr.write("failed to create directory [%s]" % LPCC_CLEANUP_LOG_DIR) - sys.exit(-1) - elif not os.path.isdir(LPCC_CLEANUP_LOG_DIR): - sys.stderr.write("[%s] is not a directory" % LPCC_CLEANUP_LOG_DIR) - sys.exit(-1) - - if not os.path.exists(workspace): - ret = utils.mkdir(workspace) - if ret: - sys.stderr.write("failed to create directory [%s]" % workspace) - sys.exit(-1) - elif not os.path.isdir(workspace): - sys.stderr.write("[%s] is not a directory" % workspace) - sys.exit(-1) - - log = clog.get_log(resultsdir=workspace) - log.cl_info("started LPCC cleanup, please check [%s] for " - "more log" % (workspace)) - ret = hsm_check.hsm_process(log, lustre_mnt, pcc_root, pcc_fid_detach) - sys.exit(ret) diff --git a/lipe/pylpcc/lpcc_test.py b/lipe/pylpcc/lpcc_test.py deleted file mode 100644 index 754ccda..0000000 --- a/lipe/pylpcc/lpcc_test.py +++ /dev/null @@ -1,1028 +0,0 @@ -# Copyright (c) 2017 DataDirect Networks, Inc. -# All Rights Reserved. 
-# Author: lixi@ddn.com - -""" -Library for testing LPCC(Lustre Persistent Client Cache) -""" -# pylint: disable=too-many-lines -import sys -import traceback -import signal -import os -import shutil -import filelock - -# Local libs -from pylpcc import lpcc -from pylustre import lustre -from pylustre import lustre_test -from pylustre import utils -from pylustre import time_util -from pylustre import daemon -from pylustre import clog - -LPCC_LOG_TEST_DIR = "/var/log/lpcc_test" -MANAGER = None -LPCC_TESTS = [] - - -def usage(): - """ - Print usage string - """ - utils.oprint("Usage: %s " % sys.argv[0]) - - -def check_file_size(log, host, fpath, expected_size): - """ - Check the file size - """ - command = ("stat --printf=%%s %s" % (fpath)) - retval = host.sh_run(log, command) - if retval.cr_exit_status: - log.cl_error("failed to run command [%s] on host [%s], " - "ret = [%d], stdout = [%s], stderr = [%s]", - command, - host.sh_hostname, - retval.cr_exit_status, - retval.cr_stdout, - retval.cr_stderr) - return -1 - size = int(retval.cr_stdout) - - if size != expected_size: - log.cl_error("wrong size of file [%s], expected [%s], got [%s]", - fpath, expected_size, size) - return -1 - return 0 - - -def check_lpcc_sizes(log, lpcc_host, lpcc_fpath, lustre_fpath, expected_size): - """ - Check the LPCC file sizes - """ - ret = check_file_size(log, lpcc_host, lpcc_fpath, expected_size) - if ret: - log.cl_error("wrong size of LPCC file") - return ret - ret = check_file_size(log, lpcc_host, lustre_fpath, expected_size) - if ret: - log.cl_error("wrong size of Lustre file") - return ret - return 0 - - -def check_file_data(log, host, fpath, expected_data): - """ - Check the file data - """ - # Read data before checking size since this might trigger HSM restore - command = ("cat %s" % (fpath)) - retval = host.sh_run(log, command) - if retval.cr_exit_status: - log.cl_error("failed to run command [%s] on host [%s], " - "ret = [%d], stdout = [%s], stderr = [%s]", - command, - 
host.sh_hostname, - retval.cr_exit_status, - retval.cr_stdout, - retval.cr_stderr) - return -1 - - if retval.cr_stdout != expected_data: - log.cl_error("wrong data of file [%s], expected [%s], " - "got [%s]", fpath, expected_data, - retval.cr_stdout) - return -1 - - expected_size = len(expected_data) - ret = check_file_size(log, host, fpath, expected_size) - if ret: - log.cl_error("wrong size of file [%s]", fpath) - return -1 - return 0 - - -def check_lpcc_data(log, lpcc_host, lpcc_fpath, lustre_fpath, expected_data): - """ - Check the LPCC file data - """ - ret = check_file_data(log, lpcc_host, lpcc_fpath, expected_data) - if ret: - log.cl_error("wrong data of LPCC file") - return ret - - ret = check_file_data(log, lpcc_host, lustre_fpath, expected_data) - if ret: - log.cl_error("wrong data of Lustre file") - return ret - - return 0 - - -def check_multiop_exists(log, lpcc_rw_datasets): - """ - Check that all hosts has multiop command - """ - for lpcc_rw_dataset in lpcc_rw_datasets: - host = lpcc_rw_dataset.lpccd_host - ret = host.sh_file_executable(log, lustre_test.MULTIOP) - if ret: - log.cl_error("command [%s] is doesn't exist on host [%s]", - lustre_test.MULTIOP, host.sh_hostname) - return ret - return 0 - - -def lpcc_cleanup_test_file(log, lpcc_host, lustre_dir, lustre_fpath): - """ - Cleanup the test directory - """ - command = ("rm %s -f" % (lustre_fpath)) - retval = lpcc_host.sh_run(log, command) - if retval.cr_exit_status: - log.cl_error("failed to run command [%s] on host [%s], " - "ret = [%d], stdout = [%s], stderr = [%s]", - command, - lpcc_host.sh_hostname, - retval.cr_exit_status, - retval.cr_stdout, - retval.cr_stderr) - return -1 - - command = ("test -e %s" % (lustre_dir)) - retval = lpcc_host.sh_run(log, command) - if retval.cr_exit_status == 0: - command = ("rmdir %s" % (lustre_dir)) - retval = lpcc_host.sh_run(log, command) - if retval.cr_exit_status: - log.cl_error("failed to run command [%s] on host [%s], " - "ret = [%d], stdout = [%s], stderr 
= [%s]", - command, - lpcc_host.sh_hostname, - retval.cr_exit_status, - retval.cr_stdout, - retval.cr_stderr) - return -1 - - command = ("mkdir %s" % (lustre_dir)) - retval = lpcc_host.sh_run(log, command) - if retval.cr_exit_status: - log.cl_error("failed to run command [%s] on host [%s], " - "ret = [%d], stdout = [%s], stderr = [%s]", - command, - lpcc_host.sh_hostname, - retval.cr_exit_status, - retval.cr_stdout, - retval.cr_stderr) - return -1 - return 0 - - -def lpcc_rw_test(log, restore=False, project=True): - """ - Run LPCC readwrite tests - """ - # pylint: disable=too-many-return-statements,too-many-locals - # pylint: disable=too-many-statements,too-many-branches - # pylint: disable=no-self-use - lpcc_nclient = len(MANAGER.lm_rw_datasets) - if lpcc_nclient < 1: - log.cl_debug("not enough LPCC client") - return 1 - lpcc_rw_dataset = MANAGER.lm_rw_datasets[0] - lpcc_root = lpcc_rw_dataset.lpccd_root - lpcc_archive_id = lpcc_rw_dataset.lpccd_id - lpcc_host = lpcc_rw_dataset.lpccd_host - lustre_mnt = lpcc_rw_dataset.lpccd_lustre_mnt - - lustre_dirname = "dir" - lustre_fname = "file" - lustre_dir = ("%s/%s" % (lustre_mnt, lustre_dirname)) - lustre_fpath = ("%s/%s" % (lustre_dir, lustre_fname)) - ret = lpcc_cleanup_test_file(log, lpcc_host, lustre_dir, lustre_fpath) - if ret: - log.cl_error("failed to cleanup test file") - return -1 - - if project: - project_supported = lpcc_host.sh_chattr_has_projid_support(log) - if not project_supported: - log.cl_error("project is not supported by chattr, please upgrade " - "E2fsprogs to latest Lustre version") - return -1 - - command = ("chattr -p %d %s" % (lpcc_rw_dataset.lpccd_projid, lustre_dir)) - retval = lpcc_host.sh_run(log, command) - if retval.cr_exit_status: - log.cl_error("failed to run command [%s] on host [%s], " - "ret = [%d], stdout = [%s], stderr = [%s]", - command, - lpcc_host.sh_hostname, - retval.cr_exit_status, - retval.cr_stdout, - retval.cr_stderr) - log.cl_error("project support might not be enabled, 
you might need " - "to run [tune2fs -O project $DEV] on all Lustre devices") - return -1 - - file_data = "fetch_origin" - command = ("echo -n %s > %s" % (file_data, lustre_fpath)) - retval = lpcc_host.sh_run(log, command) - if retval.cr_exit_status: - log.cl_error("failed to run command [%s] on host [%s], " - "ret = [%d], stdout = [%s], stderr = [%s]", - command, - lpcc_host.sh_hostname, - retval.cr_exit_status, - retval.cr_stdout, - retval.cr_stderr) - return -1 - - if not project: - pcc_state = lpcc.lfs_pcc_state(log, lustre_fpath, host=lpcc_host) - if pcc_state.ps_type != lpcc.LPCC_TYPE_NONE: - log.cl_error("wrong PCC type, expected [%s], got [%s]", - lpcc.LPCC_TYPE_NONE, pcc_state.ps_type) - - command = ("lfs pcc_fetch -a %s %s" % - (lpcc_archive_id, lustre_fpath)) - retval = lpcc_host.sh_run(log, command) - if retval.cr_exit_status: - log.cl_error("failed to run command [%s] on host [%s], " - "ret = [%d], stdout = [%s], stderr = [%s]", - command, - lpcc_host.sh_hostname, - retval.cr_exit_status, - retval.cr_stdout, - retval.cr_stderr) - return -1 - - pcc_state = lpcc.lfs_pcc_state(log, lustre_fpath, host=lpcc_host) - if pcc_state.ps_type != lpcc.LPCC_TYPE_READWRITE: - log.cl_error("wrong PCC type, expected [%s], got [%s]", - lpcc.LPCC_TYPE_READWRITE, pcc_state.ps_type) - return -1 - - hsm_states = (lustre.HSMState.HS_EXISTS | lustre.HSMState.HS_ARCHIVED | - lustre.HSMState.HS_RELEASED) - ret = lustre.check_hsm_state(log, lustre_fpath, hsm_states, - archive_id=lpcc_archive_id, host=lpcc_host) - if ret: - log.cl_error("failed to check HSM status after creating LPCC file [%s]", - lustre_fpath) - return ret - - fid_string = lustre.lfs_path2fid(log, lpcc_host, lustre_fpath) - if fid_string is None: - log.cl_error("failed to get fid from path [%s]", lustre_fpath) - return -1 - - lustre_fid = lustre.LustreFID(log, fid_string) - lpcc_fpath = lustre_fid.lf_posix_archive_path(lpcc_root) - command = ("ls -l %s" % (lpcc_fpath)) - retval = lpcc_host.sh_run(log, command) - 
if retval.cr_exit_status: - log.cl_error("failed to run command [%s] on host [%s], " - "ret = [%d], stdout = [%s], stderr = [%s]", - command, - lpcc_host.sh_hostname, - retval.cr_exit_status, - retval.cr_stdout, - retval.cr_stderr) - return -1 - - ret = check_lpcc_data(log, lpcc_host, lpcc_fpath, lustre_fpath, file_data) - if ret: - log.cl_error("wrong file data after creation") - return ret - - size = 7654321 - command = ("dd if=/dev/zero of=%s bs=%s count=1" % - (lustre_fpath, size)) - retval = lpcc_host.sh_run(log, command) - if retval.cr_exit_status: - log.cl_error("failed to run command [%s] on host [%s], " - "ret = [%d], stdout = [%s], stderr = [%s]", - command, - lpcc_host.sh_hostname, - retval.cr_exit_status, - retval.cr_stdout, - retval.cr_stderr) - return -1 - - ret = check_lpcc_sizes(log, lpcc_host, lpcc_fpath, lustre_fpath, size) - if ret: - log.cl_error("wrong file size after wrote file") - return ret - - size = 1234567 - command = ("truncate -s %s %s" % (size, lustre_fpath)) - retval = lpcc_host.sh_run(log, command) - if retval.cr_exit_status: - log.cl_error("failed to run command [%s] on host [%s], " - "ret = [%d], stdout = [%s], stderr = [%s]", - command, - lpcc_host.sh_hostname, - retval.cr_exit_status, - retval.cr_stdout, - retval.cr_stderr) - return -1 - - ret = check_lpcc_sizes(log, lpcc_host, lpcc_fpath, lustre_fpath, size) - if ret: - log.cl_error("wrong file size after truncated file") - return ret - - file_data = "file_data" - command = ("echo -n %s > %s" % (file_data, lustre_fpath)) - retval = lpcc_host.sh_run(log, command) - if retval.cr_exit_status: - log.cl_error("failed to run command [%s] on host [%s], " - "ret = [%d], stdout = [%s], stderr = [%s]", - command, - lpcc_host.sh_hostname, - retval.cr_exit_status, - retval.cr_stdout, - retval.cr_stderr) - return -1 - - ret = check_lpcc_data(log, lpcc_host, lpcc_fpath, lustre_fpath, file_data) - if ret: - log.cl_error("wrong file data after written") - return ret - - pcc_state = 
lpcc.lfs_pcc_state(log, lustre_fpath, host=lpcc_host) - if pcc_state.ps_type != lpcc.LPCC_TYPE_READWRITE: - log.cl_error("wrong PCC type, expected [%s], got [%s]", - lpcc.LPCC_TYPE_READWRITE, pcc_state.ps_type) - return -1 - - ret = lustre.check_hsm_state(log, lustre_fpath, hsm_states, - archive_id=lpcc_archive_id, host=lpcc_host) - if ret: - log.cl_error("failed to check HSM status after written LPCC file [%s]", - lustre_fpath) - return ret - - hsm_states = lustre.HSMState.HS_EXISTS | lustre.HSMState.HS_ARCHIVED - if lpcc_nclient < 2 or restore: - log.cl_debug("restoring the PCC file using command") - ret = lustre.lfs_hsm_restore(log, lustre_fpath, host=lpcc_host) - if ret: - log.cl_error("failed to restore file [%s]", lustre_fpath) - return ret - - ret = lustre.wait_hsm_state(log, lustre_fpath, hsm_states, - archive_id=lpcc_archive_id, host=lpcc_host) - if ret: - log.cl_error("failed to wait status after restoring file [%s]", - lustre_fpath) - return ret - else: - log.cl_debug("accessing the data to trigger restoring of the PCC file") - for remote_client in MANAGER.lm_rw_datasets[1:]: - remote_host = remote_client.lpccd_host - remote_mnt = remote_client.lpccd_lustre_mnt - remote_dir = ("%s/%s" % (remote_mnt, lustre_dirname)) - remote_fpath = ("%s/%s" % (remote_dir, lustre_fname)) - ret = check_file_data(log, remote_host, remote_fpath, file_data) - if ret: - log.cl_error("wrong file data on the remote client [%s]", - remote_host.sh_hostname) - return ret - - pcc_state = lpcc.lfs_pcc_state(log, lustre_fpath, host=lpcc_host) - if pcc_state.ps_type != lpcc.LPCC_TYPE_NONE: - log.cl_error("wrong PCC type, expected [%s], got [%s]", - lpcc.LPCC_TYPE_NONE, pcc_state.ps_type) - return -1 - - ret = lustre.check_hsm_state(log, lustre_fpath, hsm_states, - archive_id=lpcc_archive_id, host=lpcc_host) - if ret: - log.cl_error("failed to check HSM status after restoring file [%s]", - lustre_fpath) - return ret - - # The file has been restored, thus no LPCC cache now - file_data = 
"new_data" - command = ("echo -n %s > %s" % (file_data, lustre_fpath)) - retval = lpcc_host.sh_run(log, command) - if retval.cr_exit_status: - log.cl_error("failed to run command [%s] on host [%s], " - "ret = [%d], stdout = [%s], stderr = [%s]", - command, - lpcc_host.sh_hostname, - retval.cr_exit_status, - retval.cr_stdout, - retval.cr_stderr) - return -1 - - pcc_state = lpcc.lfs_pcc_state(log, lustre_fpath, host=lpcc_host) - if pcc_state.ps_type != lpcc.LPCC_TYPE_NONE: - log.cl_error("wrong PCC type, expected [%s], got [%s]", - lpcc.LPCC_TYPE_NONE, pcc_state.ps_type) - return -1 - - hsm_states = (lustre.HSMState.HS_EXISTS | lustre.HSMState.HS_ARCHIVED | - lustre.HSMState.HS_DIRTY) - ret = lustre.check_hsm_state(log, lustre_fpath, hsm_states, - archive_id=lpcc_archive_id, host=lpcc_host) - if ret: - log.cl_error("failed to check HSM status after writing to restored file [%s]", - lustre_fpath) - return ret - - for dataset in MANAGER.lm_rw_datasets: - host = dataset.lpccd_host - ret = check_file_data(log, host, lustre_fpath, file_data) - if ret: - log.cl_error("wrong file data on the client [%s]", - host.sh_hostname) - return ret - return 0 - - -def test_lfs_pcc_fetch_restore(log): - """ - Test lfs pcc_fetch with HSM restore - """ - return lpcc_rw_test(log, restore=True, project=False) - - -LPCC_TESTS.append(test_lfs_pcc_fetch_restore) - - -def test_lfs_pcc_fetch_access(log): - """ - Test lfs pcc_fetch with remote access - """ - return lpcc_rw_test(log, restore=False, project=False) - - -LPCC_TESTS.append(test_lfs_pcc_fetch_access) - - -def test_project_restore(log): - """ - Test project ID with HSM restore - """ - return lpcc_rw_test(log, restore=True, project=True) - - -LPCC_TESTS.append(test_project_restore) - - -def test_project_access(log): - """ - Test project ID with remote access - """ - return lpcc_rw_test(log, restore=False, project=True) - - -LPCC_TESTS.append(test_project_access) - - -def test_multi_open_when_creating(log): - # pylint: 
disable=no-self-use,too-many-locals,too-many-return-statements - # pylint: disable=too-many-statements - """ - When a process created a LPCC file and holding the open, another - process on the same client should be able to open the file. - """ - lpcc_nclient = len(MANAGER.lm_rw_datasets) - if lpcc_nclient < 1: - log.cl_debug("not enough LPCC client") - return 1 - - if check_multiop_exists(log, MANAGER.lm_rw_datasets): - log.cl_debug("multiop command doesn't exist") - return 1 - - lpcc_rw_dataset = MANAGER.lm_rw_datasets[0] - lpcc_root = lpcc_rw_dataset.lpccd_root - lpcc_archive_id = lpcc_rw_dataset.lpccd_id - lpcc_host = lpcc_rw_dataset.lpccd_host - - lustre_mnt = lpcc_rw_dataset.lpccd_lustre_mnt - lustre_dirname = "dir" - lustre_fname = "file" - lustre_dir = ("%s/%s" % (lustre_mnt, lustre_dirname)) - lustre_fpath = ("%s/%s" % (lustre_dir, lustre_fname)) - ret = lpcc_cleanup_test_file(log, lpcc_host, lustre_dir, lustre_fpath) - if ret != 0: - log.cl_error("failed to cleanup test file") - return -1 - - command = ("chattr -p %d %s" % (lpcc_rw_dataset.lpccd_projid, lustre_dir)) - retval = lpcc_host.sh_run(log, command) - if retval.cr_exit_status != 0: - log.cl_error("failed to run command [%s]", command) - return -1 - - multiop = lustre_test.Multiop(lpcc_host, lustre_fpath, "vO_c", - "/tmp/multiop.stdout", - "/tmp/multiop.stderr") - multiop.mop_start(log) - ret = multiop.mop_wait_pausing(log) - if ret: - return ret - - hsm_states = (lustre.HSMState.HS_EXISTS | lustre.HSMState.HS_ARCHIVED | - lustre.HSMState.HS_RELEASED) - ret = lustre.check_hsm_state(log, lustre_fpath, hsm_states, - archive_id=lpcc_archive_id, host=lpcc_host) - if ret: - return ret - - file_data = "multiopen_data" - command = ("echo -n %s > %s" % (file_data, lustre_fpath)) - retval = lpcc_host.sh_run(log, command) - if retval.cr_exit_status != 0: - log.cl_error("failed to run command [%s]", command) - return -1 - - ret = lustre.check_hsm_state(log, lustre_fpath, hsm_states, - 
archive_id=lpcc_archive_id, host=lpcc_host) - if ret: - return ret - - fid_string = lustre.lfs_path2fid(log, lpcc_host, lustre_fpath) - if fid_string is None: - log.cl_error("failed to get fid from path [%s]", lustre_fpath) - return -1 - - lustre_fid = lustre.LustreFID(log, fid_string) - lpcc_fpath = lustre_fid.lf_posix_archive_path(lpcc_root) - command = ("ls -l %s" % (lpcc_fpath)) - retval = lpcc_host.sh_run(log, command) - if retval.cr_exit_status != 0: - log.cl_error("failed to run command [%s]", command) - return -1 - - ret = check_lpcc_data(log, lpcc_host, lpcc_fpath, lustre_fpath, file_data) - if ret != 0: - log.cl_error("failed to check lpcc data") - return -1 - - ret = lustre.check_hsm_state(log, lustre_fpath, hsm_states, - archive_id=lpcc_archive_id, host=lpcc_host) - if ret != 0: - log.cl_error("failed to check hsm state") - return -1 - - multiop.mop_pkill(log) - return 0 - - -LPCC_TESTS.append(test_multi_open_when_creating) - - -def test_remote_local_open(log): - # pylint: disable=no-self-use,too-many-locals,too-many-return-statements - # pylint: disable=too-many-statements - """ - When a process created a LPCC file and holding the open, another - process on the different client should not be able to open the file. 
- """ - lpcc_nclient = len(MANAGER.lm_rw_datasets) - if lpcc_nclient < 2: - log.cl_debug("not enough LPCC client") - return 1 - - if check_multiop_exists(log, MANAGER.lm_rw_datasets): - log.cl_debug("multiop command doesn't exist") - return 1 - - lpcc_rw_dataset = MANAGER.lm_rw_datasets[0] - lpcc_archive_id = lpcc_rw_dataset.lpccd_id - lpcc_host = lpcc_rw_dataset.lpccd_host - - lustre_mnt = lpcc_rw_dataset.lpccd_lustre_mnt - lustre_dirname = "dir" - lustre_fname = "file" - lustre_dir = ("%s/%s" % (lustre_mnt, lustre_dirname)) - lustre_fpath = ("%s/%s" % (lustre_dir, lustre_fname)) - ret = lpcc_cleanup_test_file(log, lpcc_host, lustre_dir, lustre_fpath) - if ret != 0: - log.cl_error("failed to cleanup test file") - return -1 - - command = ("chattr -p %d %s" % (lpcc_rw_dataset.lpccd_projid, lustre_dir)) - retval = lpcc_host.sh_run(log, command) - if ret != 0: - log.cl_error("failed to run command [%s]", command) - return -1 - - multiop = lustre_test.Multiop(lpcc_host, lustre_fpath, "vO_c", - "/tmp/multiop.stdout", - "/tmp/multiop.stderr") - multiop.mop_start(log) - ret = multiop.mop_wait_pausing(log) - if ret != 0: - log.cl_error("failed to wait multiop") - return -1 - - hsm_states = (lustre.HSMState.HS_EXISTS | lustre.HSMState.HS_ARCHIVED | - lustre.HSMState.HS_RELEASED) - ret = lustre.check_hsm_state(log, lustre_fpath, hsm_states, - archive_id=lpcc_archive_id, host=lpcc_host) - if ret != 0: - log.cl_error("failed to check HSM state") - return -1 - - remote_client = MANAGER.lm_rw_datasets[1] - remote_host = remote_client.lpccd_host - - command = ("cat %s" % (lustre_fpath)) - retval = remote_host.sh_run(log, command) - if retval.cr_exit_status == 0: - log.cl_error("command [%s] succeeded unexpectedly", command) - return -1 - - file_data = "multiopen_data" - command = ("echo -n %s > %s" % (file_data, lustre_fpath)) - retval = remote_host.sh_run(log, command) - if retval.cr_exit_status == 0: - log.cl_error("command [%s] succeeded unexpectedly", command) - return -1 - - 
multiop.mop_signal(log) - multiop.mop_wait_exit(log) - - command = ("cat %s" % (lustre_fpath)) - retval = remote_host.sh_run(log, command) - if retval.cr_exit_status != 0: - log.cl_error("command [%s] failed unexpectedly", command) - return -1 - - file_data = "multiopen_data" - command = ("echo -n %s > %s" % (file_data, lustre_fpath)) - retval = remote_host.sh_run(log, command) - if retval.cr_exit_status != 0: - log.cl_error("command [%s] failed unexpectedly", command) - return -1 - return 0 - - -LPCC_TESTS.append(test_remote_local_open) - - -def lpcc_ro_test(log): - # pylint: disable=too-many-locals,no-self-use - # pylint: disable=too-many-statements,too-many-branches - # pylint: disable=too-many-return-statements,unused-variable - """ - Run LPCC readonly tests - """ - dataset_number = len(MANAGER.lm_ro_datasets) - if dataset_number < 1: - log.cl_info("not enough LPCC readonly dataset") - return 1 - lpcc_ro_dataset = MANAGER.lm_ro_datasets[0] - lpcc_client = lpcc_ro_dataset.lpccd_lustre_client - lpcc_root = lpcc_ro_dataset.lpccd_root - lpcc_group_id = lpcc_ro_dataset.lpccd_id - group_datasets = MANAGER.lm_ro_dataset_groups[lpcc_group_id] - group_clients = [] - for group_dataset in group_datasets: - lustre_client = group_dataset.lpccd_lustre_mnt - if lustre_client not in group_clients: - group_clients.append(lustre_client) - - none_group_clients = [] - for client_id, client in MANAGER.lm_lustre_clients.iteritems(): - if client not in group_clients: - none_group_clients.append(lustre_client) - - lpcc_host = lpcc_ro_dataset.lpccd_host - lustre_mnt = lpcc_ro_dataset.lpccd_lustre_mnt - - lustre_dirname = "dir_ro" - lustre_fname = "file" - lustre_dir = ("%s/%s" % (lustre_mnt, lustre_dirname)) - lustre_fpath = ("%s/%s" % (lustre_dir, lustre_fname)) - ret = lpcc_cleanup_test_file(log, lpcc_host, lustre_dir, lustre_fpath) - if ret: - log.cl_error("failed to cleanup test file") - return -1 - - file_data = "fetch_origin" - command = ("echo -n %s > %s" % (file_data, 
lustre_fpath)) - retval = lpcc_host.sh_run(log, command) - if retval.cr_exit_status: - log.cl_error("failed to run command [%s] on host [%s], " - "ret = [%d], stdout = [%s], stderr = [%s]", - command, - lpcc_host.sh_hostname, - retval.cr_exit_status, - retval.cr_stdout, - retval.cr_stderr) - return -1 - - pcc_state = lpcc.lfs_pcc_state(log, lustre_fpath, host=lpcc_host) - if pcc_state.ps_type != lpcc.LPCC_TYPE_NONE: - log.cl_error("wrong PCC type, expected [%s], got [%s]", - lpcc.LPCC_TYPE_NONE, pcc_state.ps_type) - return -1 - - command = ("lfs pcc_fetch -r -a %s %s" % (lpcc_group_id, lustre_fpath)) - retval = lpcc_host.sh_run(log, command) - if retval.cr_exit_status: - log.cl_error("failed to run command [%s] on host [%s], " - "ret = [%d], stdout = [%s], stderr = [%s]", - command, - lpcc_host.sh_hostname, - retval.cr_exit_status, - retval.cr_stdout, - retval.cr_stderr) - return -1 - - pcc_state = lpcc.lfs_pcc_state(log, lustre_fpath, host=lpcc_host) - if pcc_state.ps_type != lpcc.LPCC_TYPE_READONLY: - log.cl_error("wrong PCC type, expected [%s], got [%s]", - lpcc.LPCC_TYPE_READONLY, pcc_state.ps_type) - - fid_string = lustre.lfs_path2fid(log, lpcc_host, lustre_fpath) - if fid_string is None: - return -1 - - lustre_fid = lustre.LustreFID(log, fid_string) - lpcc_fpath = lustre_fid.lf_posix_archive_path(lpcc_root) - command = ("ls -l %s" % (lpcc_fpath)) - retval = lpcc_host.sh_run(log, command) - if retval.cr_exit_status: - log.cl_error("failed to run command [%s] on host [%s], " - "ret = [%d], stdout = [%s], stderr = [%s]", - command, - lpcc_host.sh_hostname, - retval.cr_exit_status, - retval.cr_stdout, - retval.cr_stderr) - return -1 - - ret = check_lpcc_data(log, lpcc_host, lpcc_fpath, lustre_fpath, file_data) - if ret: - log.cl_error("wrong file data after creation") - return ret - - for client_id, client in MANAGER.lm_lustre_clients.iteritems(): - if client == lpcc_client: - continue - host = client.lc_host - remote_lustre_fpath = ("%s/%s/%s" % - 
(client.lc_mnt, lustre_dirname, - lustre_fname)) - - # File read without grouplock should be blocked - multiop = lustre_test.Multiop(host, remote_lustre_fpath, "vor10", - "/tmp/multiop.stdout", - "/tmp/multiop.stderr") - multiop.mop_start(log) - ret = multiop.mop_wait_exit(log, timeout=3, quiet=True) - if ret == 0: - log.cl_error("file read on host [%s] is not blocked by group " - "lock on host [%s]", host.sh_hostname, - lpcc_host.sh_hostname) - return -1 - - multiop.mop_pkill(log) - ret = multiop.mop_wait_exit(log) - if ret: - log.cl_error("file read on host [%s] is not canceled", - host.sh_hostname) - return -1 - - # Not able to truncate file - command = ("truncate -s 0 %s" % (lustre_fpath)) - retval = lpcc_host.sh_run(log, command) - if retval.cr_exit_status == 0: - log.cl_error("command [%s] succeeded on host [%s], which is unexpected", - command, lpcc_host.sh_hostname) - return -1 - - command = ("truncate -s 1048576 %s" % (lustre_fpath)) - retval = lpcc_host.sh_run(log, command) - if retval.cr_exit_status == 0: - log.cl_error("command [%s] succeeded on host [%s], which is unexpected", - command, lpcc_host.sh_hostname) - return -1 - - # Not able to write data to readonly cache - command = ("echo -n not_written > %s" % (lustre_fpath)) - retval = lpcc_host.sh_run(log, command) - if retval.cr_exit_status == 0: - log.cl_error("command [%s] succeeded on host [%s], which is unexpected", - command, lpcc_host.sh_hostname) - return -1 - - # Not able to append to readonly cache - command = ("echo -n not_written >> %s" % (lustre_fpath)) - retval = lpcc_host.sh_run(log, command) - if retval.cr_exit_status == 0: - log.cl_error("command [%s] succeeded on host [%s], which is unexpected", - command, lpcc_host.sh_hostname) - return -1 - - # Check data again in case any data is changed - ret = check_lpcc_data(log, lpcc_host, lpcc_fpath, lustre_fpath, file_data) - if ret: - log.cl_error("wrong file data after truncate and write failures") - return ret - - # Detch and re-attach - 
command = ("lfs pcc_detach %s" % (lustre_fpath)) - retval = lpcc_host.sh_run(log, command) - if retval.cr_exit_status: - log.cl_error("failed to run command [%s] on host [%s], " - "ret = [%d], stdout = [%s], stderr = [%s]", - command, - lpcc_host.sh_hostname, - retval.cr_exit_status, - retval.cr_stdout, - retval.cr_stderr) - return -1 - - pcc_state = lpcc.lfs_pcc_state(log, lustre_fpath, host=lpcc_host) - if pcc_state.ps_type != lpcc.LPCC_TYPE_NONE: - log.cl_error("wrong PCC type after detach, expected [%s], got [%s]", - lpcc.LPCC_TYPE_NONE, pcc_state.ps_type) - return -1 - - command = ("lfs pcc_fetch -r -a %s %s" % (lpcc_group_id, lustre_fpath)) - retval = lpcc_host.sh_run(log, command) - if retval.cr_exit_status: - log.cl_error("failed to run command [%s] on host [%s], " - "ret = [%d], stdout = [%s], stderr = [%s]", - command, - lpcc_host.sh_hostname, - retval.cr_exit_status, - retval.cr_stdout, - retval.cr_stderr) - return -1 - - pcc_state = lpcc.lfs_pcc_state(log, lustre_fpath, host=lpcc_host) - if pcc_state.ps_type != lpcc.LPCC_TYPE_READONLY: - log.cl_error("wrong PCC type after re-prefetch, expected [%s], got [%s]", - lpcc.LPCC_TYPE_READONLY, pcc_state.ps_type) - return -1 - return 0 - - -def test_readonly(log): - """ - Test readonly PCC - """ - return lpcc_ro_test(log) - - -LPCC_TESTS.append(test_readonly) - - -def test_lpcc_locked(log, workspace, config_fpath): - """ - Start to run LPCC tests holding the confiure lock - """ - # pylint: disable=global-statement,too-many-branches - lpcc_workspace = workspace + "/lpcc" - - ret = utils.mkdir(lpcc_workspace) - if ret: - log.cl_error("failed to creat directory [%s]", lpcc_workspace) - return ret - global MANAGER - MANAGER = lpcc.LPCCManager(lpcc_workspace, config_fpath) - ret = MANAGER.lm_parse(log) - if ret: - return ret - - ret = MANAGER.lm_start(log) - if ret: - return ret - - quit_on_error = True - only_test = None - passed_tests = [] - failed_tests = [] - skipped_tests = [] - for lpcc_test in LPCC_TESTS: - 
if only_test is not None and only_test != lpcc_test.__name__: - continue - log.cl_info("test [%s] started", lpcc_test.__name__) - ret = lpcc_test(log) - if ret < 0: - log.cl_error("test [%s] failed", lpcc_test.__name__) - failed_tests.append(lpcc_test) - if quit_on_error: - return -1 - elif ret == 1: - log.cl_warning("test [%s] skipped", lpcc_test.__name__) - skipped_tests.append(lpcc_test) - else: - log.cl_info("test [%s] passed", lpcc_test.__name__) - passed_tests.append(lpcc_test) - - if len(skipped_tests) != 0: - for skipped_test in skipped_tests: - log.cl_warning("test [%s] skipped", skipped_test.__name__) - - if len(failed_tests) != 0: - for failed_test in failed_tests: - log.cl_error("test [%s] failed", failed_test.__name__) - - if len(passed_tests) != 0: - for passed_test in passed_tests: - log.cl_info("test [%s] passed", passed_test.__name__) - - daemon.SHUTTING_DOWN = True - ret = MANAGER.lm_stop(log) - if ret: - log.cl_error("failed to stop lpcc manager") - - if len(failed_tests) != 0 or ret: - return -1 - return 0 - - -def test_lpcc(log, workspace, config_fpath): - """ - Start to run LPCC tests - """ - # pylint: disable=bare-except - lock_file = config_fpath + ".lock" - lock = filelock.FileLock(lock_file) - try: - with lock.acquire(timeout=0): - try: - ret = test_lpcc_locked(log, workspace, config_fpath) - except: - ret = -1 - log.cl_error("exception: %s", traceback.format_exc()) - lock.release() - except filelock.Timeout: - ret = -1 - log.cl_error("someone else is holding lock of file [%s], aborting " - "to prevent conflicts", lock_file) - return ret - - -def main(): - """ - Run LPCC tests - """ - # pylint: disable=unused-variable,not-callable - if sys.version[0] == '2': - reload(sys) - if hasattr(sys, "setdefaultencoding"): - set_encoding = getattr(sys, "setdefaultencoding", None) - set_encoding('UTF-8') - else: - os.environ["PYTHONIOENCODING"] = 'UTF-8' - - config_fpath = lpcc.LPCC_CONFIG - - if len(sys.argv) == 2: - config_fpath = sys.argv[1] - elif 
len(sys.argv) > 2: - usage() - sys.exit(-1) - - identity = time_util.local_strftime(time_util.utcnow(), "%Y-%m-%d-%H_%M_%S") - workspace = LPCC_LOG_TEST_DIR + "/" + identity - - if not os.path.exists(LPCC_LOG_TEST_DIR): - ret = utils.mkdir(LPCC_LOG_TEST_DIR) - if ret: - utils.eprint("failed to create directory [%s] on local host" % LPCC_LOG_TEST_DIR) - sys.exit(-1) - elif not os.path.isdir(LPCC_LOG_TEST_DIR): - utils.eprint("[%s] is not a directory" % LPCC_LOG_TEST_DIR) - sys.exit(-1) - - if not os.path.exists(workspace): - ret = utils.mkdir(workspace) - if ret: - utils.eprint("failed to create directory [%s] on local host" % workspace) - sys.exit(-1) - elif not os.path.isdir(workspace): - utils.eprint("[%s] is not a directory" % workspace) - sys.exit(-1) - - signal.signal(signal.SIGINT, daemon.signal_handler) - signal.signal(signal.SIGTERM, daemon.signal_handler) - log = clog.get_log(resultsdir=workspace) - log.cl_info("started LPCC test using config [%s], please check [%s] for " - "more log" % (config_fpath, workspace)) - - save_fpath = workspace + "/" + lpcc.LPCC_CONFIG_FNAME - log.cl_debug("copying config file from [%s] to [%s]", config_fpath, - save_fpath) - shutil.copyfile(config_fpath, save_fpath) - ret = test_lpcc(log, workspace, config_fpath) - if ret: - log.cl_error("test failed, please check [%s] for more log", workspace) - sys.exit(ret) - log.cl_info("all tests passed, please check [%s] for more log", workspace) - sys.exit(0) diff --git a/lipe/src/lpcc_purge.c b/lipe/src/lpcc_purge.c index b8f4e24..f07d594 100644 --- a/lipe/src/lpcc_purge.c +++ b/lipe/src/lpcc_purge.c @@ -42,6 +42,7 @@ #define OPT_CLEAR_HASHDIR 3 #define OPT_LOG_LEVEL 4 #define OPT_MAX_SCAN_SECS 5 +#define OPT_PIDFILE 6 struct lpcc_purge_options { char *o_cache; @@ -57,6 +58,7 @@ struct lpcc_purge_options { int o_max_scan_secs; char *o_dumpfile; + char *o_pidfile; bool o_dry_run; bool o_clear_hashdir; }; @@ -299,7 +301,8 @@ static void usage(void) "\t-t, --scan-threads=NUM scanning 
threads (default: %u)\n" "\t --candidate-num=NUM, candidate number of approximate LRU (default: %d, min: %d, max: %d)\n" "\t --max-scan-secs, max seconds to scan continously before purging (default: %d, min: %d, max: %d)\n" - "\t-w, --dump=FILE, dump stats to FILE when signal USR1 is recieved (default: /var/run/lpcc_purge-PID.stats)\n" + "\t-w, --dump=FILE, dump stats to FILE when signal USR1 is recieved (default: /var/run/lpcc_purge-RWID.stats)\n" + "\t --pidfile=FILE, the pidfile name (default: /var/run/lpcc_purge-RWID.pid)\n" "\t --clear-hashdir, clear empty hash dir after detaching file\n" "\t --dry-run, scan once but do not detach file really\n" "\t-h, --help, print this help message\n", @@ -329,6 +332,7 @@ static struct option long_options[] = { { "dry-run", no_argument, NULL, OPT_DRY_RUN}, { "candidate-num", required_argument, NULL, OPT_CANDIDATE_NUM}, { "dump", required_argument, NULL, 'w'}, + { "pidfile", required_argument, NULL, OPT_PIDFILE}, { "clear-hashdir", no_argument, NULL, OPT_CLEAR_HASHDIR}, { "max-scan-secs", required_argument, NULL, OPT_MAX_SCAN_SECS}, { "help", no_argument, NULL, 'h' }, @@ -570,6 +574,9 @@ static void lpcc_purge_process_opt(int c, char *optarg) case 'w': opt.o_dumpfile = strdup(optarg); break; + case OPT_PIDFILE: + opt.o_pidfile = strdup(optarg); + break; case OPT_DRY_RUN: opt.o_dry_run = true; break; @@ -655,6 +662,10 @@ void lpcc_purge_verify_opts(void) snprintf(buf, sizeof(buf), "/var/run/lpcc_purge-%d.stats", opt.o_rwid); opt.o_dumpfile = strdup(buf); } + if (opt.o_pidfile == NULL) { + snprintf(buf, sizeof(buf), "/var/run/lpcc_purge-%d.pid", opt.o_rwid); + opt.o_pidfile = strdup(buf); + } /* check freehi > freelo */ if (opt.o_high_usage <= opt.o_low_usage) { @@ -719,14 +730,12 @@ out: static void lpcc_purge_lock_pidfile(void) { - char buf[PATH_MAX]; int fd; - snprintf(buf, sizeof(buf), "/var/run/lpcc_purge-%d.pid", opt.o_rwid); - fd = create_pid_file(buf); + fd = create_pid_file(opt.o_pidfile); if (fd < 0) { 
llapi_error(LLAPI_MSG_FATAL, errno, - "cannot create pidfile '%s'", buf); + "cannot create pidfile '%s'", opt.o_pidfile); exit(1); } /* we keep the fd open to hold the flock, @@ -856,11 +865,6 @@ static int lpcc_purge_detach_candidate(const char *mnt, llapi_printf(LLAPI_MSG_DEBUG, "detach fid: "DFID"\n", PFID(&candidate->c_fid)); - pthread_mutex_lock(&stats.s_lock); - stats.s_purged_objs++; - stats.s_total_purged_objs++; - pthread_mutex_unlock(&stats.s_lock); - /* double confirm the atime. If it's changed, discard this entry */ rc = stat(candidate->c_path, &statbuf); if (rc) { diff --git a/lipe/systemd/lpcc.service b/lipe/systemd/lpcc.service index d2ca574..1d7a6c8 100644 --- a/lipe/systemd/lpcc.service +++ b/lipe/systemd/lpcc.service @@ -1,10 +1,16 @@ [Unit] -Description=Lustre Persistent Client Cache +Description=Lustre Persistent Client Cache Management + +Requires=network-online.target +After=network-online.target + +ConditionPathExists=/etc/lpcc.conf [Service] Type=simple -ExecStart=/usr/bin/lpcc -User=root +ExecStart=lpcc monitor +ExecStop=kill $MAINPID [Install] WantedBy=multi-user.target + diff --git a/lustre/doc/Makefile.am b/lustre/doc/Makefile.am index 436ec1f..bec88b6 100644 --- a/lustre/doc/Makefile.am +++ b/lustre/doc/Makefile.am @@ -90,8 +90,8 @@ MANFILES = \ lustre_rsync.8 \ nids.5 \ plot-llstat.8 \ - routerstat.8 - + routerstat.8 \ + umount.lustre.8 LIBMAN = \ lustreapi.7 \ diff --git a/lustre/doc/umount.lustre.8 b/lustre/doc/umount.lustre.8 new file mode 100644 index 0000000..053e554 --- /dev/null +++ b/lustre/doc/umount.lustre.8 @@ -0,0 +1,75 @@ +.\"@(#)umount.lustre.8" +.TH UMOUNT.LUSTRE 8 "7 Jul 2021" +.SH NAME +umount.lustre \- unmount a Lustre File System +.SH SYNOPSIS +.BI "umount.lustre" " dir" " [\-fvnrldh ]" +.SH DESCRIPTION +.BR umount.lustre +is a part of +.BR lustre (7) +utilities package, which provides Lustre client functionality. 
+ +.BR umount.lustre +stops any Lustre Persistent Client Cache (LPCC) running on the Lustre file +system to be unmounted, then performs the actual unmount. + +.BR umount.lustre +is meant to be used by the +.BR umount (8) +command for unmounting a Lustre file system. This subcommand, however, can also +be used as a standalone command with limited functionality. + +.I dir +is the directory on which the file system is mounted. + +.SH OPTIONS +.TP +.BI "\-f" +Force the unmount in case the Lustre file system is unreachable. +.TP +.BI "\-v" +Be verbose. +.TP +.BI "\-n" +Do not update +.I /etc/mtab. +By default, an entry is created in +.I /etc/mtab +for every mounted file system. Use this option to skip deleting the entry. +.TP +.BI "\-r" +In case unmounting fails, try to remount read-only. +.TP +.BI "\-l" +Lazy unmount. Detach the file system from the file system hierarchy now, and +clean up all references to the file system as soon as it is not busy anymore. +.TP +.BI "\-d" +When the unmounted device was a loop device, also free this loop device. +.TP +.BI "\-h" +Print a help message. + +.SH NOTE +For further information, please refer to the +.BR lustre (5) +and +.BR umount (8) +manual pages. 
+ +.SH FILES +.TP 18n +.I /etc/fstab +file system table +.TP +.I /etc/mtab +table of mounted file systems + +.PD +.SH "SEE ALSO" +.BR lustre (7), +.BR umount (8), + +.SH "AUTHOR" +Lei Feng diff --git a/lustre/scripts/Makefile.am b/lustre/scripts/Makefile.am index 7a3232b..e85d8f2 100644 --- a/lustre/scripts/Makefile.am +++ b/lustre/scripts/Makefile.am @@ -37,7 +37,7 @@ genscripts = lc_modprobe lc_net lc_hb lc_cluman lc_md lc_lvm lustre_start lnet SUBDIRS = systemd -sbin_SCRIPTS = lustre_rmmod ko2iblnd-probe +sbin_SCRIPTS = lustre_rmmod ko2iblnd-probe umount.lustre if RHEL initdir = $(sysconfdir)/init.d @@ -93,7 +93,8 @@ EXTRA_DIST = license-status lustre_rmmod ldev lc_mon lhbadm \ $(addsuffix .in,$(genscripts)) lfs_migrate lustre_req_history \ lustre lsvcgss lc_common haconfig Lustre.ha_v2 dkms.mkconf \ zfsobj2fid ko2iblnd-probe statechange-lustre.sh \ - bash-completion/lustre bash-completion/lctl bash-completion/lfs + bash-completion/lustre bash-completion/lctl bash-completion/lfs \ + umount.lustre CLEANFILES = $(genscripts) diff --git a/lustre/scripts/umount.lustre b/lustre/scripts/umount.lustre new file mode 100755 index 0000000..3097da0 --- /dev/null +++ b/lustre/scripts/umount.lustre @@ -0,0 +1,54 @@ +#!/bin/bash + +set -eu + + +usage() +{ + echo "usage: umount.lustre dir [-fvnrlh]" + echo "options:" + echo -e "\t-f\tforce umount" + echo -e "\t-v\tverbose" + echo -e "\t-n\tDo not update /etc/mtab" + echo -e "\t-r\tremount" + echo -e "\t-l\tlazy umount" + echo -e "\t-h\tprint this help" +} + +declare -a args=("$@") + +temp=`getopt -o fvnrlh -- "$@"` +if [[ $? 
!= 0 ]]; then + usage + exit 1 +fi + +eval set -- "$temp" + +while true; do + case "$1" in + -h) + usage + exit 0 + ;; + -f|-v|-n|-r|-l) + shift + ;; + --) + shift + mount_point="$1" + break + ;; + *) + usage + exit 1 + ;; + esac +done + +if [[ -x /usr/bin/lpcc ]] && [[ -S /var/run/lpcc.sock ]]; then + /usr/bin/lpcc stop "$mount_point" --keep-enabled > /dev/null +fi + +umount --internal-only "${args[@]}" + diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 6384027..ea73190 100755 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -15183,10 +15183,10 @@ test_160h() { "R" 20 || error "$i: GC-thread not found in R-state" # check umounts of each MDT on MDS have reached kthread_stop() - [[ $(do_node $i pgrep umount | wc -l) -eq $nb ]] || + [[ $(do_node $i pgrep umount.lustre | wc -l) -eq $nb ]] || error "$i: expected $nb umount" wait_update $i \ - "ps -C umount -o state --no-headers | uniq" "D" 20 || + "ps -C umount -o state --no-headers | grep D | wc -l" "$nb" 20 || error "$i: umount not found in D-state" done