From: Lei Feng Date: Fri, 24 Jun 2022 02:34:31 +0000 (+0800) Subject: EX-5419 lipe: retry to get lpcc_purge stats file X-Git-Url: https://git.whamcloud.com/?a=commitdiff_plain;h=cdd499f7c9279c288b466179450e464ef4d51521;p=fs%2Flustre-release.git EX-5419 lipe: retry to get lpcc_purge stats file Sleep and retry to get lpcc_purge stats file after sending signal Correct statistics data algorithm. Signed-off-by: Lei Feng Test-Parameters: clientdistro=el8.5 testlist=sanity-pcc env=ONLY=210 Change-Id: I79e4b7de871f8ffd290e0c30da4780265fa7e9fa Reviewed-on: https://review.whamcloud.com/47729 Tested-by: jenkins Tested-by: Maloo Reviewed-by: Andreas Dilger --- diff --git a/lipe/lpcc b/lipe/lpcc index 1200793..a706d41 100755 --- a/lipe/lpcc +++ b/lipe/lpcc @@ -154,24 +154,40 @@ class LpccService: self._kill_process_by_pidfile(self.lpcc_purge_prog, pidfile) def _stats_lpcc_purge(self): + statsfile = '/var/run/lpcc_purge-%d.stats' % self.lpcc_roid + try: + os.unlink(statsfile) + except FileNotFoundError: + pass + pidfile = '/var/run/lpcc_purge-%d.pid' % self.lpcc_roid cmdline = ['pkill', '--signal', 'USR1', '--pidfile', pidfile, '--', 'lpcc_purge'] subprocess.call(cmdline) - statsfile = '/var/run/lpcc_purge-%d.stats' % self.lpcc_roid stats_obj = None - try: - with open(statsfile) as file_handler: - stats_obj = json.load(file_handler) - except Exception as err: - eprint(err) + retry_times = 5 + for i in range(0, retry_times + 1): + try: + with open(statsfile) as file_handler: + stats_obj = json.load(file_handler) + except FileNotFoundError: + if (i == retry_times): + eprint("cannot get stat file '%s' after trying for %d times" % + (statsfile, retry_times + 1)) + break + delay = 0.1 * pow(2, i) + time.sleep(delay) + except Exception as err: + eprint(err) + break return stats_obj def _dump_config(self): - eprint("========== Config ==========") - yaml.safe_dump(self.lpcc_config, sys.stdout, default_flow_style=False) - eprint("============================") + eprint("========== Single PCC Config ==========") + yaml.safe_dump(self.lpcc_config, stream=sys.stderr, default_flow_style=False) + eprint("=======================================") + sys.stderr.flush() def start(self): """ @@ -409,7 +425,7 @@ class LpccMonitor: result['read_bytes'] = int(words[6]) # adjust open times and calc hit ratio - real_hit = result.get('pcc_hit', 0) - result.get('pcc_attach', 0); + real_hit = result.get('pcc_hit', 0) result['pcc_real_hit'] = real_hit result.pop('pcc_hit', None) result.pop('pcc_attach', None) @@ -423,8 +439,7 @@ class LpccMonitor: total_read_bytes = result.get('read_bytes', 0) \ + result.get('pcc_hit_bytes', 0) \ - result.get('pcc_attach_bytes', 0) - real_hit_bytes = result.get('pcc_hit_bytes', 0) \ - - result.get('pcc_attach_bytes', 0) + real_hit_bytes = result.get('pcc_hit_bytes', 0) result['pcc_real_hit_bytes'] = real_hit_bytes; result['total_read_bytes'] = total_read_bytes; result.pop('read_bytes', None) @@ -565,16 +580,16 @@ class LpccCli: """ Communicate with server and run a sub command """ - try: - sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) - sock.connect(LISTEN_SOCK_FN) - except socket.error: + if not os.path.exists(LISTEN_SOCK_FN): eprint("Socket file '%s' does not exist, " % LISTEN_SOCK_FN + \ "please check whether the monitor service started!") sys.exit(1) + sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + sock.connect(LISTEN_SOCK_FN) sock.sendall(json.dumps(cmd).encode()) sock.shutdown(socket.SHUT_WR) + response = sock.makefile().readline() sock.close() diff --git a/lustre/tests/sanity-pcc.sh b/lustre/tests/sanity-pcc.sh index ba514b6..9a4f9da 100644 --- a/lustre/tests/sanity-pcc.sh +++ b/lustre/tests/sanity-pcc.sh @@ -4852,13 +4852,14 @@ test_210() { do_facet $SINGLEAGT cat $config_file + stack_trap "do_facet $SINGLEAGT journalctl -u lpcc -n 100" do_facet $SINGLEAGT systemctl start lpcc stack_trap "do_facet $SINGLEAGT systemctl stop lpcc" + sleep 1 do_facet $SINGLEAGT systemctl status lpcc || error "lpcc system service did not start correctly" - sleep 2 do_facet $SINGLEAGT lpcc status | grep '"status": "running"' || error "lpcc status is wrong" @@ -4867,7 +4868,6 @@ test_210() { error "lpcc status is wrong" do_facet $SINGLEAGT lpcc start-all - sleep 2 do_facet $SINGLEAGT lpcc status | grep '"status": "running"' || error "lpcc status is wrong"