LU-17352 utils: lljobstat can read dumped stats files

author Lei Feng <flei@whamcloud.com>

Sun, 10 Dec 2023 08:45:38 +0000 (16:45 +0800)

committer Andreas Dilger <adilger@whamcloud.com>

Sat, 6 Jan 2024 08:21:52 +0000 (08:21 +0000)
author Lei Feng <flei@whamcloud.com>
Sun, 10 Dec 2023 08:45:38 +0000 (16:45 +0800)
committer Andreas Dilger <adilger@whamcloud.com>
Sat, 6 Jan 2024 08:21:52 +0000 (08:21 +0000)
diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh

index fdaf1ef..d721441 100755 (executable)
--- a/lustre/tests/sanity.sh
+++ b/lustre/tests/sanity.sh
@@ -29841,6 +29841,24 @@ test_823() {
  }
  run_test 823 "Setting create_count > OST_MAX_PRECREATE is lowered to maximum"
  
+test_850() {
+       # check that lljobstat runs against the live system and against a
+       # file that holds job_stats dumped by "lctl get_param"
+       local dir=$DIR/$tdir
+       local file=$dir/$tfile
+       local statsfile=$dir/all_job_stats.txt
+
+       # write a small file first
+       # (presumably to generate some activity for job_stats - TODO confirm)
+       test_mkdir -p $dir || error "failed to create dir $dir"
+       echo "abcdefg" > $file || error "failed to create file $file"
+
+       # run lljobstat against the job_stats of the live system
+       lljobstat -n 1 ||
+               error "failed to run lljobstat on living system"
+
+       # dump all job_stats into $statsfile and let lljobstat parse that file
+       # NOTE(review): the exit status of get_param itself is not checked
+       $LCTL get_param *.*.job_stats > $statsfile
+       lljobstat --statsfile=$statsfile ||
+               error "failed to run lljobstat on file $statsfile"
+}
+run_test 850 "lljobstat can parse living and aggregated job_stats"
+
  #
  # tests that do cleanup/setup should be run at the end
  #
@@ -30061,6 +30079,7 @@ test_907() {
  }
  run_test 907 "write rpc error during unlink"
  
+
  complete_test $SECONDS
  [ -f $EXT2_DEV ] && rm $EXT2_DEV || true
  check_and_cleanup_lustre
diff --git a/lustre/utils/lljobstat b/lustre/utils/lljobstat

index fdb9736..6d9dab8 100755 (executable)
--- a/lustre/utils/lljobstat
+++ b/lustre/utils/lljobstat
@@ -7,18 +7,19 @@ job, show top jobs
  
  import argparse
  import errno
+import signal
  import subprocess
  import sys
  import time
  import yaml
-import signal
-
  try:
      from yaml import CLoader as Loader
  except ImportError:
      from yaml import Loader
  
-signal.signal(signal.SIGINT, signal.default_int_handler)
+
+def exit_silently(signum, frame): # pylint: disable=unused-argument
+    '''
+    signal handler: leave the program quietly with exit status 0
+
+    signum and frame are passed in by the signal machinery and are not
+    used. The first parameter is deliberately not named "signal" so that
+    it does not shadow the imported signal module.
+    '''
+    sys.exit(0)
  
  class ArgParser: # pylint: disable=too-few-public-methods
      '''
@@ -33,7 +34,8 @@ class ArgParser: # pylint: disable=too-few-public-methods
          define and parse arguments
          '''
          parser = argparse.ArgumentParser(prog='lljobstat',
-                                         description='List top jobs.')
+                                         description='List top jobs.',
+                                         formatter_class=argparse.RawTextHelpFormatter)
          parser.add_argument('-c', '--count', type=int, default=5,
                              help='the number of top jobs to be listed (default 5).')
          parser.add_argument('-i', '--interval', type=int, default=10,
@@ -53,6 +55,15 @@ class ArgParser: # pylint: disable=too-few-public-methods
          parser.add_argument('--no-fullname', dest='fullname',
                              action='store_false',
                              help='show abbreviated operations name.')
+        # --statsfile may be repeated; action='append' collects every path in a list
+        parser.add_argument('--statsfile', type=str, action='append',
+                            help="Parse specified file instead of job_stats files in system.\n"
+                                 "Usually the file is generated by command:\n"
+                                 "# lctl get_param *.*.job_stats > job_stats.txt\n"
+                                 "Multiple files can be specified by giving --statsfile\n"
+                                 "multiple times. For example:\n"
+                                 "# lljobstat --statsfile=1.txt --statsfile=2.txt\n"
+                                 "The stats data of jobs from multiple statsfiles will be\n"
+                                 "added up and sorted. Then top jobs are listed.\n")
  
          self.args = parser.parse_args()
  
@@ -105,18 +116,26 @@ class JobStatsParser:
              if err.returncode == errno.ENOENT:
                  return []
  
-    def parse_single_job_stats(self, param): # pylint: disable=no-self-use
+    def parse_single_job_stats(self, param=None, string=None): # pylint: disable=no-self-use
          '''
-        read single job_stats file, parse it and return an object
+        read single job_stats file or a string, parse it and return an object
+
+        param:  name of a job_stats parameter; it is read with
+                'lctl get_param -n' when string is not given
+        string: job_stats content that has already been read; when it is
+                not None it is parsed directly and lctl is not run
+
+        returns whatever yaml.load() produces for the content
+        raises yaml.scanner.ScannerError, after printing a short message,
+        when the content cannot be scanned
          '''
-        cmd = ['lctl', 'get_param', '-n', param]
-        out = subprocess.check_output(cmd).decode()
+        if string is not None:
+            out = string
+        else:
+            cmd = ['lctl', 'get_param', '-n', param]
+            out = subprocess.check_output(cmd).decode()
+        # presumably needed because a plain YAML scalar cannot start with '@'
          output = out.replace('job_id:          @', 'job_id:          .')
+
+        # NOTE(review): Loader is PyYAML's full loader, which may construct
+        # arbitrary Python objects from tagged input. With --statsfile the
+        # content comes from a user supplied file; consider CSafeLoader /
+        # SafeLoader if such files can come from an untrusted source.
          try:
              yaml_obj = yaml.load(output, Loader=Loader)  # need several seconds...
          except yaml.scanner.ScannerError:
              # only print the file name here
-            print("failed to parse the content of %s" % param, file=sys.stdout)
+            if param is not None:
+                print("failed to parse the content of %s" % param, file=sys.stdout)
+            if string is not None:
+                print("failed to parse the string", file=sys.stdout)
              raise
  
          return yaml_obj
@@ -171,7 +190,7 @@ class JobStatsParser:
          '''
          print single job
          '''
-        print('- %-16s {' % (job['job_id'] + ':'), end='')
+        print('- %-16s {' % (str(job['job_id']) + ':'), end='')
          first = True
          for key, val in self.op_keys.items():
              if not val in job.keys():
@@ -199,18 +218,58 @@ class JobStatsParser:
              self.print_job(job)
          print('...') # mark the end of YAML doc in stream
  
+    def parse_file(self, file, jobs):
+        '''
+        parse a single file and merge every job found in it into jobs
+
+        file: path of a text file, e.g. the saved output of
+              'lctl get_param *.*.job_stats'; the file may be a
+              combination of several "job_stats:" segments
+        jobs: dict of aggregated jobs, updated through self.merge_job()
+        '''
+        with open(file) as file_handler:
+            for segment in self._job_stats_segments(file_handler):
+                obj = self.parse_single_job_stats(string=segment)
+                if obj is None or obj['job_stats'] is None:
+                    continue
+
+                for job in obj['job_stats']:
+                    self.merge_job(jobs, job)
+
+    @staticmethod
+    def _job_stats_segments(lines):
+        '''
+        yield each "job_stats:" segment found in lines as a single string
+
+        A segment starts at a line beginning with "job_stats:" and goes on
+        while lines are empty or begin with "  " or "- ". Any other line
+        ends the segment; lines outside of segments are ignored.
+        '''
+        segment = []
+        for line in lines:
+            if line.startswith("job_stats:"):
+                # start of a segment, which also ends the previous one
+                if segment:
+                    yield "".join(segment)
+                segment = [line]
+            elif not segment:
+                # still looking for the next "job_stats:" line
+                continue
+            elif line == "\n" or line.startswith(("  ", "- ")):
+                segment.append(line)
+            else:
+                yield "".join(segment)
+                segment = []
+
+        # the last segment usually runs up to the end of the file
+        if segment:
+            yield "".join(segment)
+
      def run_once(self):
          '''
          scan/parse/aggregate/print top jobs in given job_stats pattern/path(s)
          '''
          jobs = {}
-        for param in self.list_param(self.args.param):
-            obj = self.parse_single_job_stats(param)
-            if obj['job_stats'] is None:
-                continue
  
-            for job in obj['job_stats']:
-                self.merge_job(jobs, job)
+        if self.args.statsfile:
+            for file in self.args.statsfile:
+                self.parse_file(file, jobs)
+        else:
+            for param in self.list_param(self.args.param):
+                obj = self.parse_single_job_stats(param=param)
+                if obj is not None and obj['job_stats'] is not None:
+                    for job in obj['job_stats']:
+                        self.merge_job(jobs, job)
  
          top_jobs = self.pick_top_jobs(jobs, self.args.count)
          self.print_top_jobs(top_jobs)
@@ -235,6 +294,13 @@ class JobStatsParser:
          argparser.run()
          self.args = argparser.args
  
+        if self.args.statsfile:
+            self.run_once()
+            return
+
+        # exit silently if Ctrl+C is pressed in the loop below
+        signal.signal(signal.SIGINT, exit_silently)
+
          i = 0
          try:
              while True:
author	Lei Feng <flei@whamcloud.com>
	Sun, 10 Dec 2023 08:45:38 +0000 (16:45 +0800)
committer	Andreas Dilger <adilger@whamcloud.com>
	Sat, 6 Jan 2024 08:21:52 +0000 (08:21 +0000)
lustre/tests/sanity.sh		patch \| blob \| history
lustre/utils/lljobstat		patch \| blob \| history