Whamcloud - gitweb
LU-15428 contrib: add branch_comm 31/46031/3
author John L. Hammond <jhammond@whamcloud.com>
Mon, 10 Jan 2022 16:59:02 +0000 (10:59 -0600)
committer Oleg Drokin <green@whamcloud.com>
Mon, 7 Feb 2022 04:43:17 +0000 (04:43 +0000)
Add a branch comparison script (branch_comm) to contrib/scripts.

Test-Parameters: trivial
Signed-off-by: John L. Hammond <jhammond@whamcloud.com>
Change-Id: I13c0b90a48d6d3215bf9959242c5671e83d27d7a
Reviewed-on: https://review.whamcloud.com/46031
Tested-by: jenkins <devops@whamcloud.com>
Tested-by: Maloo <maloo@whamcloud.com>
Tested-by: Jian Yu <yujian@whamcloud.com>
Reviewed-by: Jian Yu <yujian@whamcloud.com>
Reviewed-by: Peter Jones <pjones@whamcloud.com>
Reviewed-by: Oleg Drokin <green@whamcloud.com>
contrib/scripts/branch_comm [new file with mode: 0755]

diff --git a/contrib/scripts/branch_comm b/contrib/scripts/branch_comm
new file mode 100755 (executable)
index 0000000..dd12725
--- /dev/null
@@ -0,0 +1,339 @@
+#!/usr/bin/env python2
+
+import re
+import subprocess
+import sys
+
+class Change(object):
+    def __init__(self):
+        self.commit = ''
+        self.author_name = ''
+        self.author_email = ''
+        self.author_date = 0
+        self.subject = ''
+        self.body = ''
+        self.number = 0
+        self.change_id = ''
+        self.reviewed_on = ''
+        self.lustre_commit = ''
+        self.lustre_change = ''
+        self.lustre_change_number = 0
+        self.cray_bug_id = ''
+        self.hpe_bug_id = ''
+        self._parent = self
+        self._rank = 0
+
+    def _find(self):
+        if self._parent != self:
+            self._parent = self._parent._find()
+
+        return self._parent
+
+    def _union(self, c2):
+        r1 = self._find()
+        r2 = c2._find()
+        if r1._rank > r2._rank:
+            r2._parent = r1
+        elif r1._rank < r2._rank:
+            r1._parent = r2
+        elif r1 != r2:
+            r2._parent = r1
+            r1._rank += 1
+
+
+# Names of the Change attributes filled in from one 'git log' record.
+# The order must match the placeholders in GIT_LOG_KEYS, because the two
+# lists are zipped together when a record is parsed.
+GIT_LOG_FIELDS = ['commit', 'author_name', 'author_email', 'author_date', 'subject', 'body']
+# git pretty-format placeholders: commit hash, author name, author email,
+# author date (UNIX timestamp), subject and body.
+GIT_LOG_KEYS = ['%H', '%an', '%ae', '%at', '%s', '%b']
+# Fields within a record are separated by byte 0x1f (%x1f) and every
+# record is terminated by byte 0x1e (%x1e), so that newlines inside the
+# commit body do not break parsing.
+GIT_LOG_FORMAT = '%x1f'.join(GIT_LOG_KEYS) + '%x1e'
+
+def _change_from_record(rec):
+    change = Change()
+    change.__dict__.update(dict(zip(GIT_LOG_FIELDS, rec.split('\x1f'))))
+    change.author_date = long(change.author_date)
+    for line in change.body.splitlines():
+        # Sometimes we have 'key : value' so we strip both sides.
+        lis = line.split(':', 1)
+        if len(lis) == 2:
+            key = lis[0].strip()
+            val = lis[1].strip()
+            if key in ['Change-Id', 'Reviewed-on', 'Lustre-commit', 'Lustre-change', 'Cray-bug-id', 'HPE-bug-id']:
+                change.__dict__[key.replace('-', '_').lower()] = val
+
+    obj = re.match(r'[A-Za-z]+://[\w\.]+/(\d+)$', change.reviewed_on)
+    if obj:
+        change.number = int(obj.group(1))
+
+    obj = re.match(r'[A-Za-z]+://[\w\.]+/(\d+)$', change.lustre_change)
+    if obj:
+        change.lustre_change_number = int(obj.group(1))
+
+    return change
+
+
+def _head(lis):
+    if lis:
+        return lis[0]
+    else:
+        return None
+
+
+class Branch(object):
+    def __init__(self, name, paths):
+        self.name = name
+        self.paths = paths
+        self.log = [] # Oldest commit is first.
+        self.by_commit = {} # str -> change
+        self.by_subject = {} # str -> list of changes
+        self.by_change_id = {} # str -> list of changes
+        self.by_number = {} # str -> list of changes
+
+    def _add_change_from_record(self, rec):
+        # TODO Handle reverted commits.
+        change = _change_from_record(rec)
+        self.log.append(change)
+        assert change.commit
+        assert change.commit not in self.by_commit
+        self.by_commit[change.commit] = change
+
+        assert change.subject
+        lis = self.by_subject.setdefault(change.subject, [])
+        # XXX Do we want this?
+        # if lis:
+        #    lis[0]._union(change)
+        lis.append(change)
+
+        for bug_id in (change.cray_bug_id, change.hpe_bug_id):
+            if bug_id and (' ' in change.subject):
+                # Split subject in to issue and rest.
+                issue, rest = change.subject.split(None, 1)
+                # Make new subject using external bug id
+                subject = ' '.join((bug_id, rest))
+                lis = self.by_subject.setdefault(subject, [])
+                lis.append(change)
+
+        # Equivalate by change_id.
+        if change.change_id:
+            lis = self.by_change_id.setdefault(change.change_id, [])
+            if lis:
+                lis[0]._union(change)
+            lis.append(change)
+
+        # Equivalate by number (from reviewed_on).
+        if change.number:
+            lis = self.by_number.setdefault(change.number, [])
+            if lis:
+                lis[0]._union(change)
+            lis.append(change)
+
+    def load(self):
+        self.log = []
+        self.by_commit = {}
+        self.by_subject = {}
+        self.by_change_id = {}
+        self.by_number = {}
+
+        git_base = ['git'] # [, '--git-dir=' + self.path + '/.git']
+        # rc = subprocess.call(git_base + ['fetch', 'origin'])
+        # assert rc == 0
+
+        pipe = subprocess.Popen(git_base + ['log',
+                                            '--format=' + GIT_LOG_FORMAT,
+                                            '--reverse',
+                                            self.name
+                                            ] + self.paths,
+                                stdout=subprocess.PIPE)
+        out, _ = pipe.communicate()
+        rc = pipe.wait()
+        assert rc == 0
+
+        for rec in out.split('\x1e\n'):
+            if rec:
+                self._add_change_from_record(rec)
+
+    def find_port(self, change):
+        # Try to find a port of change in this branch. change may or
+        # may not belong to branch.
+        #
+        # TODO Return oldest member of equivalence class.
+        port = (self.by_commit.get(change.commit) or
+                self.by_commit.get(change.lustre_commit) or
+                self.by_commit.get(change.lustre_change) or # Handle misuse.
+                _head(self.by_change_id.get(change.change_id)) or
+                _head(self.by_change_id.get(change.lustre_commit)) or # ...
+                _head(self.by_change_id.get(change.lustre_change)) or
+                _head(self.by_number.get(change.number)) or # Do we need this?
+                _head(self.by_number.get(change.lustre_change_number)) or
+                _head(self.by_subject.get(change.subject))) # Do we want this?
+        if port:
+            return port._find()
+        else:
+            return None
+
+
+def branch_comm(b1, b2):
+    """Print a comm(1)-style comparison of two loaded branches.
+
+    b1 and b2 are Branch objects whose logs have been loaded. One line
+    is printed to stdout per reported commit, with four tab separated
+    columns: the commit hash is placed in column 1 if the commit is
+    unique to b1, in column 2 if it is unique to b2 and in column 3 if
+    it is common to both branches; column 4 holds the subject.
+
+    The leading run of commits with identical hashes in both logs is
+    skipped. After that the two logs are walked in parallel (oldest
+    first), using Branch.find_port() to decide whether a commit of one
+    branch has a counterpart in the other.
+    """
+    n1 = len(b1.log)
+    n2 = len(b2.log)
+    i1 = 0
+    i2 = 0
+    printed = set() # commits
+
+    # A change counts as printed if its own hash or the hash named by
+    # its Lustre-commit trailer has been recorded.
+    def change_is_printed(c):
+        return (c.commit in printed) or (c.lustre_commit in printed)
+
+    # Record both the hash of c and, if present, its Lustre-commit hash.
+    def change_set_printed(c):
+        printed.add(c.commit)
+        if c.lustre_commit:
+            printed.add(c.lustre_commit)
+
+    # Suppress initial common commits.
+    while i1 < n1 and i2 < n2:
+        # XXX Should we use _find() on c1 and c2 here?
+        # XXX Or c1 = b1.find_port(c1)?
+        c1 = b1.log[i1]
+        c2 = b2.log[i2]
+        if c1.commit == c2.commit:
+            i1 += 1
+            i2 += 1
+            continue
+        else:
+            break
+
+    # Merge the remaining commits. Every iteration advances i1, i2 or
+    # both.
+    while i1 < n1 and i2 < n2:
+        c1 = b1.log[i1]
+        if change_is_printed(c1):
+            i1 += 1
+            continue
+
+        c2 = b2.log[i2]
+        if change_is_printed(c2):
+            i2 += 1
+            continue
+
+        # p1 is the port of c2 in b1 (or None). If it was already
+        # reported then c2 is accounted for and is skipped silently.
+        p1 = b1.find_port(c2)
+        if p1 and change_is_printed(p1):
+            change_set_printed(c2)
+            i2 += 1
+            continue
+
+        # Likewise p2 is the port of c1 in b2 (or None).
+        p2 = b2.find_port(c1)
+        if p2 and change_is_printed(p2):
+            change_set_printed(c1)
+            i1 += 1
+            continue
+
+        # Neither of c1 and c2 has been printed, nor has any port of either.
+
+        # XXX Do we need c1._find() here?
+        # NOTE(review): find_port() returns the root of the port's
+        # equivalence class, so this test only succeeds when c1 (or c2)
+        # is itself that root -- confirm that this is intended.
+        if c1 == p1 or c2 == p2:
+            # c1 and c2 are ports of the same change.
+            change_set_printed(c1)
+            change_set_printed(c2)
+            if p1:
+                change_set_printed(p1)
+            if p2:
+                change_set_printed(p2)
+            i1 += 1
+            i2 += 1
+            # c1 is common to both branches.
+            print '\t\t%s\t%s' % (c1.commit, c1.subject) # TODO Add a '*' if subjects different...
+            continue
+
+        if p1 and not p2:
+            # b1 has c2, b2 does not have c1, (port of c2 must be after c1).
+            change_set_printed(c1)
+            i1 += 1
+            # c1 is unique to b1.
+            print '%s\t\t\t%s' % (c1.commit, c1.subject)
+            continue
+
+        if p2 and not p1:
+            # b2 has c1, b1 does not have c2, (port of c1 must be after c2).
+            change_set_printed(c2)
+            i2 += 1
+            # c2 is unique to b2.
+            print '\t%s\t\t%s' % (c2.commit, c2.subject)
+            continue
+
+        # Now neither is ported or both are ported (and the order is weird).
+        if p2:
+            change_set_printed(c1)
+            change_set_printed(p2)
+            i1 += 1
+            # c1 is common to both branches.
+            print '\t\t%s\t%s' % (c1.commit, c1.subject)
+            continue
+        else:
+            change_set_printed(c1)
+            i1 += 1
+            # c1 is unique to b1.
+            print '%s\t\t\t%s' % (c1.commit, c1.subject)
+            continue
+
+    # The loop above ends when at least one log is exhausted, so at most
+    # one of the two loops below has unprinted commits left to report.
+    for c1 in b1.log[i1:]:
+        if change_is_printed(c1):
+            continue
+
+        assert i2 == n2
+        # All commits from b2 have been printed. Therefore if c1 has
+        # been ported to b2 then the port has already been printed. So
+        # c1 is unique to b1 and must be printed.
+
+        change_set_printed(c1)
+        print '%s\t\t\t%s' % (c1.commit, c1.subject)
+
+    for c2 in b2.log[i2:]:
+        if change_is_printed(c2):
+            continue
+
+        assert i1 == n1
+        # ...
+        change_set_printed(c2)
+        print '\t%s\t\t%s' % (c2.commit, c2.subject)
+
+
+USAGE = """usage: '_PROGNAME_ BRANCH1 BRANCH2 [PATH]...'
+
+Compare commits to Lustre branches.
+
+Prints commits unique to BRANCH1 in column 1.
+Prints commits unique to BRANCH2 in column 2.
+Prints commits common to both branches in column 3.
+Prints commit subject in column 4.
+Skips initial common commits.
+
+The output format is inspired by comm(1). To filter commits by branch,
+pipe the output to awk. For example:
+  $ ... | awk -F'\\t' '$1 != ""' # only commits unique to BRANCH1
+  $ ... | awk -F'\\t' '$2 != ""' # only commits unique to BRANCH2
+  $ ... | awk -F'\\t' '$3 != ""' # only common commits
+  $ ... | awk -F'\\t' '$3 == ""' # exclude common commmits
+
+This assumes that both branches are in the repository that contains
+the current directory. To compare branches from different upstream
+repositories (for example 'origin/master' and 'other/b_post_cmd3') do:
+
+  $ cd fs/lustre-release
+  $ git fetch origin
+  $ git remote add other ...
+  $ git fetch other
+  $ _PROGNAME_ origin/master other/b_post_cmd3"""
+
+
+def main():
+    if len(sys.argv) < 3:
+        print >> sys.stderr, USAGE.replace('_PROGNAME_', sys.argv[0])
+        sys.exit(1)
+
+    paths = sys.argv[3:]
+
+    b1 = Branch(sys.argv[1], paths)
+    b1.load()
+
+    b2 = Branch(sys.argv[2], paths)
+    b2.load()
+
+    branch_comm(b1, b2)
+
+
+if __name__ == '__main__':
+    main()
+