LUDOC-13: enhanced diff to work with larger changes.

author Richard Henwood <rhenwood@whamcloud.com>

Fri, 10 Feb 2012 20:20:28 +0000 (14:20 -0600)

committer Richard Henwood <rhenwood@whamcloud.com>

Fri, 10 Feb 2012 20:20:28 +0000 (14:20 -0600)
author Richard Henwood <rhenwood@whamcloud.com>
Fri, 10 Feb 2012 20:20:28 +0000 (14:20 -0600)
committer Richard Henwood <rhenwood@whamcloud.com>
Fri, 10 Feb 2012 20:20:28 +0000 (14:20 -0600)
diff --git a/tools/diff.py b/tools/diff.py

index f407784..582b465 100755 (executable)
--- a/tools/diff.py
+++ b/tools/diff.py
@@ -2,46 +2,118 @@
  """HTML Diff: http://www.aaronsw.com/2002/diff
  Rough code, badly documented. Send me comments and patches."""
  
-__author__ = 'Aaron Swartz <me@aaronsw.com>, Richard Henwood <rhenwood@whamcloud.com>'
-__copyright__ = '(C) 2003 Aaron Swartz. GNU GPL 2 or 3.'
-__version__ = '0.23'
+__author__ = 'Aaron Swartz <me@aaronsw.com>, \
+Richard Henwood <rhenwood@whamcloud.com>'
+__copyright__ = '(C) 2003 Aaron Swartz, (C) 2011,2012 Whamcloud, Inc \
+GNU GPL 2 or 3.'
+__version__ = '1.0'
  
-import difflib, string
+import difflib
+import string
+import re
  
  def isTag(x): return x[0] == "<" and x[-1] == ">"
  
  def textDiff(a, b):
      """Takes in strings a and b and returns a human-readable HTML diff."""
+    changeCounter = 0
  
      out = []
      a, b = html2list(a), html2list(b)
      s = difflib.SequenceMatcher(None, a, b)
+    gap = 0;
+    wait = 7; # fudge factor to avoid having too many changes anchors.
      for e in s.get_opcodes():
          if e[0] == "replace":
              # @@ need to do something more complicated here
              # call textDiff but not for html, but for some html... ugh
              # gonna cop-out for now
-            # I've put this hack in to try accomodate id's generated by docbook compilation. RJH 30/12/2011
-            if '<a id="id' in a[e[1]:e[2]][0] or \
-                    '<a href="#id' in a[e[1]:e[2]][0] or \
-                    '<a id="ftn.id' in a[e[1]:e[2]][0] or \
-                    '<a class="indexterm" href="#id' in a[e[1]:e[2]][0] or \
-                    '<a id="id' in b[e[3]:e[4]][0] or \
-                    '<a href="#id' in b[e[3]:e[4]][0] or \
-                    '<a id="ftn.id' in b[e[3]:e[4]][0] or \
-                    '<a class="indexterm" href="#id' in b[e[3]:e[4]][0]:
+            # I've put this hack in to try to accommodate ids generated by
+            # docbook compilation. RJH 30/12/2011
+            if ('<a id="id' in a[e[1]:e[2]][0] or
+                    '<a href="#id' in a[e[1]:e[2]][0] or
+                    '<a id="ftn.id' in a[e[1]:e[2]][0] or
+                    '<a class="indexterm" href="#id' in a[e[1]:e[2]][0] or
+                    '<a id="id' in b[e[3]:e[4]][0] or
+                    '<a href="#id' in b[e[3]:e[4]][0] or
+                    '<a id="ftn.id' in b[e[3]:e[4]][0] or
+                    '<a class="indexterm" href="#id' in b[e[3]:e[4]][0]):
                  out.append(''.join(b[e[3]:e[4]]))
              else:
-                out.append('<a name="change"><span class="replace" style="background-color: Pink; text-decoration: line-through;">'+''.join(a[e[1]:e[2]]) + '</span><span class="insert" style="background-color: PaleGreen;">'+''.join(b[e[3]:e[4]])+"</span></a>")
+                if gap > wait:
+                    out.append('<a name="change_%s"></a>' % changeCounter)
+                    changeCounter += 1
+                withopen = addOpen(a[e[1]:e[2]])
+                out.append(annotateTags(withopen, 'delete'));
+                out.append(annotateTags(b[e[3]:e[4]], 'insert'));
+                gap = 0
          elif e[0] == "delete":
-            out.append('<a name="change"><span class="del" style="background-color: Pink; text-decoration: line-through;">' + ''.join(a[e[1]:e[2]]) + "</span></a>")
+            if gap > wait:
+                out.append('<a name="change_%s"></a>' % changeCounter)
+                changeCounter += 1
+            out.append(annotateTags(a[e[1]:e[2]], 'delete'));
+            gap = 0
          elif e[0] == "insert":
-            out.append('<a name="change"><span class="ins" style="background-color: PaleGreen;">'+''.join(b[e[3]:e[4]]) + "</span></a>")
+            if gap > wait:
+                out.append('<a name="change_%s"></a>' % changeCounter)
+                changeCounter += 1
+            out.append(annotateTags(b[e[3]:e[4]], 'insert'));
+            gap = 0
          elif e[0] == "equal":
              out.append(''.join(b[e[3]:e[4]]))
+            gap += 1
          else:
              raise "Um, something's broken. I didn't expect a '" + `e[0]` + "'."
-    return ''.join(out)
+    return [''.join(out), changeCounter]
+
+# This function finds all unmatched
+# close tags in htmllist and prepends
+# corresponding open tags to htmllist.
+def addOpen(htmllist):
+    unmatched = []
+    tmpstack = []
+    for item in htmllist:
+        if "<" in item:
+            if "</" in item:
+                try:
+                    if tmpstack[-1] == item:
+                        tmpstack.pop()
+                    else:
+                        tmpstack.append(item)
+                except IndexError:
+                    unmatched.append(item)
+                    pass
+                pass
+            else:
+                tmpstack.append(re.sub(r'<(\w+).*', r'</\g<1>>', item, flags=re.IGNORECASE))
+                pass
+        else:
+            pass
+    unmatched = map(lambda x: x.replace('/', ''), unmatched)
+    unmatched.reverse()
+    return unmatched + htmllist
+
+def annotateTags(htmllist, className):
+    res = map(lambda tag: addClass(tag, className), htmllist)
+    #the first element should be surrounded with a span + class
+    #provided it is not a tag.
+    if '<' not in htmllist[0]:
+        res[0] = '<span class="%s">%s</span>' % (className, htmllist[0])
+    return ''.join(res)
+
+# This function adds a new class to an HTML tag, e.g.
+# <a id="idp9794432"> -> <a id="idp9794432" class=" test">
+# <a id="idp9794432" class='test'> -> <a id="idp9794432" class="test test2">
+def addClass (tag, newClass):
+    if "<" in tag and "</" not in tag:
+        if 'class' not in tag:
+            # assume all tags end in '>', so chop it off:
+            tag = tag[:-1]
+            # add in an empty class element.
+            tag = '%s class="">' % tag
+        tag = re.sub(r'(.*)class=[\'"]([ \w]*)[\'"]',
+                r'\g<1>class="\g<2> '+newClass+'"', tag , flags=re.IGNORECASE)
+    return tag
  
  def html2list(x, b=0):
      mode = 'char'
@@ -65,6 +137,103 @@ def html2list(x, b=0):
      out.append(cur)
      return filter(lambda x: x is not '', out)
  
+def test ():
+    print "\nrunning tests..."
+    test1res = addClass('<a id="idp9794432">', 'test')
+    test1suc = '<a id="idp9794432" class=" test">'
+    if test1res == test1suc:
+        print "SUCCESS: %s == %s" % (test1res, test1suc)
+    else:
+        print "FAIL: %s != %s" % (test1res, test1suc)
+    test2res = addClass('<a id="idp9794432" class="test">', 'test2')
+    test2suc = '<a id="idp9794432" class="test test2">'
+    if test2res == test2suc:
+        print "SUCCESS: %s == %s" % (test2res, test2suc)
+    else:
+        print "FAIL: %s != %s" % (test2res, test2suc)
+
+    test3res = addOpen(['hello','</div>', '</p>'])
+    test3suc = ['<p>', '<div>', 'hello','</div>', '</p>']
+    if test3res == test3suc:
+        print "SUCCESS: %s == %s" % (test3res, test3suc)
+    else:
+        print "FAIL: %s != %s" % (test3res, test3suc)
+
+    test4res = addOpen(['hello','</div>', '<p class="test">', 'test', '</p>', '</p>'])
+    test4suc = ['<p>', '<div>', 'hello','</div>', '<p class="test">', 'test', '</p>', '</p>']
+    if test4res == test4suc:
+        print "SUCCESS: %s == %s" % (test4res, test4suc)
+    else:
+        print "FAIL: %s != %s" % (test4res, test4suc)
+
+    test5res = addOpen(['0','</p>', '</div>', '</div>', '<div>', 'test', '</div>'])
+    test5suc = ['<div>', '<div>', '<p>', '0','</p>', '</div>', '</div>', '<div>', 'test', '</div>']
+    if test5res == test5suc:
+        print "SUCCESS: %s == %s" % (test5res, test5suc)
+    else:
+        print "FAIL: %s != %s" % (test5res, test5suc)
+
+    test6res = annotateTags(['<div class="hello">', '0', '</div>'], 'test')
+    test6suc = '<div class="hello test">0</div>'
+    if test6res == test6suc:
+        print "SUCCESS: %s == %s" % (test6res, test6suc)
+    else:
+        print "FAIL: %s != %s" % (test6res, test6suc)
+
+    test7res = addOpen(['0','</p>', '</div>', '</div>', '<div>', '<div hello="world">', 'test', '</div>', '</div>', '</div>'])
+    test7suc = ['<div>', '<div>', '<div>', '<p>', '0','</p>', '</div>', '</div>', '<div>', '<div hello="world">', 'test', '</div>', '</div>', '</div>']
+    if test7res == test7suc:
+        print "SUCCESS: %s == %s" % (test7res, test7suc)
+    else:
+        print "FAIL: %s != %s" % (test7res, test7suc)
+
+    pass
+
+def jsHeader (changeCounter):
+    jscript = """
+<script type='text/javascript'>
+    var changes = %s;
+    function showChanges()
+    {
+        var diffList, link, warn;
+        diffList=document.getElementById('changeDiffs');
+        for (var i = 0; i < changes; i += 1) {
+            if (i == 0) {
+                diffList.innerHTML = "";
+            }
+            link = document.createElement('a');
+            link.setAttribute('class', 'change_link');
+            link.setAttribute('href', '#change_'+i);
+            link.innerHTML = ' change #'+i;
+            diffList.appendChild(link);
+        }
+        warn = document.createTextNode("NOTE: these changes are automatically detected and may be inaccurate.");
+        diffList.appendChild(warn);
+    }
+</script>
+    """ % changeCounter
+    return jscript;
+
+def cssHeader ():
+
+    css = """<style type="text/css">
+    .delete {
+    background-color: Pink; text-decoration: line-through;
+    }
+
+    .insert {
+    background-color: PaleGreen;
+    }
+
+    .change_link {
+    padding:5px;
+    }
+
+    </style>
+    """
+    return css
+
+
  if __name__ == '__main__':
      import sys
      try:
@@ -72,6 +241,14 @@ if __name__ == '__main__':
      except ValueError:
          print "htmldiff: highlight the differences between two html files"
          print "usage: " + sys.argv[0] + " a b"
+        test()
          sys.exit(1)
-    print textDiff(open(a).read(), open(b).read())
-
+    changeCounter = 0
+    (diffxhtml, changeCounter) = textDiff(open(a).read(), open(b).read())
+    # It is, regrettably, possible that the result of textDiff is not
+    # valid XHTML; I have noticed issues with nested <p>'s, for example.
+    # Because of this, we use a plain search and replace to insert our
+    # CSS, JS and DOM elements:
+    diffxhtml = diffxhtml.replace('</head><body>', cssHeader() + jsHeader(changeCounter) +
+            '</head><body onload="showChanges()"><div id="changeDiffs">No changes detected.</div>')
+    print diffxhtml
author	Richard Henwood <rhenwood@whamcloud.com>
	Fri, 10 Feb 2012 20:20:28 +0000 (14:20 -0600)
committer	Richard Henwood <rhenwood@whamcloud.com>
	Fri, 10 Feb 2012 20:20:28 +0000 (14:20 -0600)