Whamcloud - gitweb
EX-5419 lipe: retry to get lpcc_purge stats file
author Lei Feng <flei@whamcloud.com>
Fri, 24 Jun 2022 02:34:31 +0000 (10:34 +0800)
committer Andreas Dilger <adilger@whamcloud.com>
Wed, 29 Jun 2022 19:10:18 +0000 (19:10 +0000)
After sending the USR1 signal to lpcc_purge, sleep and retry reading its
stats file until the file appears. Correct the hit statistics calculation.

Signed-off-by: Lei Feng <flei@whamcloud.com>
Test-Parameters: clientdistro=el8.5 testlist=sanity-pcc env=ONLY=210
Change-Id: I79e4b7de871f8ffd290e0c30da4780265fa7e9fa
Reviewed-on: https://review.whamcloud.com/47729
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Reviewed-by: Andreas Dilger <adilger@whamcloud.com>
lipe/lpcc
lustre/tests/sanity-pcc.sh

index 1200793..a706d41 100755 (executable)
--- a/lipe/lpcc
+++ b/lipe/lpcc
@@ -154,24 +154,40 @@ class LpccService:
         self._kill_process_by_pidfile(self.lpcc_purge_prog, pidfile)
 
     def _stats_lpcc_purge(self):
+        statsfile = '/var/run/lpcc_purge-%d.stats' % self.lpcc_roid
+        try:
+            os.unlink(statsfile)
+        except FileNotFoundError:
+            pass
+
         pidfile = '/var/run/lpcc_purge-%d.pid' % self.lpcc_roid
         cmdline = ['pkill', '--signal', 'USR1', '--pidfile', pidfile, '--', 'lpcc_purge']
         subprocess.call(cmdline)
 
-        statsfile = '/var/run/lpcc_purge-%d.stats' % self.lpcc_roid
         stats_obj = None
-        try:
-            with open(statsfile) as file_handler:
-                stats_obj = json.load(file_handler)
-        except Exception as err:
-            eprint(err)
+        retry_times = 5
+        for i in range(0, retry_times + 1):
+            try:
+                with open(statsfile) as file_handler:
+                    stats_obj = json.load(file_handler)
+            except FileNotFoundError:
+                if (i == retry_times):
+                    eprint("cannot get stat file '%s' after trying for %d times" %
+                           (statsfile, retry_times + 1))
+                    break
+                delay = 0.1 * pow(2, i)
+                time.sleep(delay)
+            except Exception as err:
+                eprint(err)
+                break
 
         return stats_obj
 
     def _dump_config(self):
-        eprint("========== Config ==========")
-        yaml.safe_dump(self.lpcc_config, sys.stdout, default_flow_style=False)
-        eprint("============================")
+        eprint("========== Single PCC Config ==========")
+        yaml.safe_dump(self.lpcc_config, stream=sys.stderr, default_flow_style=False)
+        eprint("=======================================")
+        sys.stderr.flush()
 
     def start(self):
         """
@@ -409,7 +425,7 @@ class LpccMonitor:
                 result['read_bytes'] = int(words[6])
 
         # adjust open times and calc hit ratio
-        real_hit = result.get('pcc_hit', 0) - result.get('pcc_attach', 0);
+        real_hit = result.get('pcc_hit', 0)
         result['pcc_real_hit'] = real_hit
         result.pop('pcc_hit', None)
         result.pop('pcc_attach', None)
@@ -423,8 +439,7 @@ class LpccMonitor:
         total_read_bytes = result.get('read_bytes', 0) \
                            + result.get('pcc_hit_bytes', 0) \
                            - result.get('pcc_attach_bytes', 0)
-        real_hit_bytes = result.get('pcc_hit_bytes', 0) \
-                         - result.get('pcc_attach_bytes', 0)
+        real_hit_bytes = result.get('pcc_hit_bytes', 0)
         result['pcc_real_hit_bytes'] = real_hit_bytes;
         result['total_read_bytes'] = total_read_bytes;
         result.pop('read_bytes', None)
@@ -565,16 +580,16 @@ class LpccCli:
         """
         Communicate with server and run a sub command
         """
-        try:
-            sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
-            sock.connect(LISTEN_SOCK_FN)
-        except socket.error:
+        if not os.path.exists(LISTEN_SOCK_FN):
             eprint("Socket file '%s' does not exist, " % LISTEN_SOCK_FN + \
                 "please check whether the monitor service started!")
             sys.exit(1)
 
+        sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
+        sock.connect(LISTEN_SOCK_FN)
         sock.sendall(json.dumps(cmd).encode())
         sock.shutdown(socket.SHUT_WR)
+
         response = sock.makefile().readline()
         sock.close()
 
index ba514b6..9a4f9da 100644 (file)
@@ -4852,13 +4852,14 @@ test_210() {
 
        do_facet $SINGLEAGT cat $config_file
 
+       stack_trap "do_facet $SINGLEAGT journalctl -u lpcc -n 100"
        do_facet $SINGLEAGT systemctl start lpcc
        stack_trap "do_facet $SINGLEAGT systemctl stop lpcc"
+       sleep 1
 
        do_facet $SINGLEAGT systemctl status lpcc ||
                error "lpcc system service did not start correctly"
 
-       sleep 2
        do_facet $SINGLEAGT lpcc status | grep '"status": "running"' ||
                error "lpcc status is wrong"
 
@@ -4867,7 +4868,6 @@ test_210() {
                error "lpcc status is wrong"
 
        do_facet $SINGLEAGT lpcc start-all
-       sleep 2
        do_facet $SINGLEAGT lpcc status | grep '"status": "running"' ||
                error "lpcc status is wrong"