"""HTML Diff: http://www.aaronsw.com/2002/diff
Rough code, badly documented. Send me comments and patches."""
-__author__ = 'Aaron Swartz <me@aaronsw.com>, Richard Henwood <rhenwood@whamcloud.com>'
-__copyright__ = '(C) 2003 Aaron Swartz. GNU GPL 2 or 3.'
-__version__ = '0.23'
+__author__ = 'Aaron Swartz <me@aaronsw.com>, \
+Richard Henwood <rhenwood@whamcloud.com>'
+__copyright__ = '(C) 2003 Aaron Swartz, (C) 2011,2012 Whamcloud, Inc \
+GNU GPL 2 or 3.'
+__version__ = '1.0'
-import difflib, string
+import difflib
+import string
+import re
def isTag(x):
    """Return True when the string x starts with "<" and ends with ">".

    Uses startswith/endswith rather than indexing so that an empty string
    yields False instead of raising IndexError.
    """
    return x.startswith("<") and x.endswith(">")
def textDiff(a, b):
"""Takes in strings a and b and returns a human-readable HTML diff."""
# changeCounter supplies the N in each <a name="change_N"> anchor appended
# below and is handed back as the second item of the added return value.
+ changeCounter = 0
out = []
a, b = html2list(a), html2list(b)
s = difflib.SequenceMatcher(None, a, b)
# gap is incremented once per "equal" opcode (not per token) and set back to 0
# in the change branches; an anchor is appended only when gap > wait.
+ gap = 0;
+ wait = 7; # fudge factor to avoid having too many changes anchors.
# Each opcode e is used as (tag, i1, i2, j1, j2): a[e[1]:e[2]] is the old
# slice and b[e[3]:e[4]] is the new slice.
for e in s.get_opcodes():
if e[0] == "replace":
# @@ need to do something more complicated here
# call textDiff but not for html, but for some html... ugh
# gonna cop-out for now
- # I've put this hack in to try accomodate id's generated by docbook compilation. RJH 30/12/2011
- if '<a id="id' in a[e[1]:e[2]][0] or \
- '<a href="#id' in a[e[1]:e[2]][0] or \
- '<a id="ftn.id' in a[e[1]:e[2]][0] or \
- '<a class="indexterm" href="#id' in a[e[1]:e[2]][0] or \
- '<a id="id' in b[e[3]:e[4]][0] or \
- '<a href="#id' in b[e[3]:e[4]][0] or \
- '<a id="ftn.id' in b[e[3]:e[4]][0] or \
- '<a class="indexterm" href="#id' in b[e[3]:e[4]][0]:
+ # I've put this hack in to try accomodate id's generated by
+ # docbook compilation. RJH 30/12/2011
+ if ('<a id="id' in a[e[1]:e[2]][0] or
+ '<a href="#id' in a[e[1]:e[2]][0] or
+ '<a id="ftn.id' in a[e[1]:e[2]][0] or
+ '<a class="indexterm" href="#id' in a[e[1]:e[2]][0] or
+ '<a id="id' in b[e[3]:e[4]][0] or
+ '<a href="#id' in b[e[3]:e[4]][0] or
+ '<a id="ftn.id' in b[e[3]:e[4]][0] or
+ '<a class="indexterm" href="#id' in b[e[3]:e[4]][0]):
# NOTE(review): the statements run when the id test above is true, and the
# else: that presumably guards the lines below, are not visible here.
- out.append('<a name="change"><span class="replace" style="background-color: Pink; text-decoration: line-through;">'+''.join(a[e[1]:e[2]]) + '</span><span class="insert" style="background-color: PaleGreen;">'+''.join(b[e[3]:e[4]])+"</span></a>")
+ if gap > wait:
+ out.append('<a name="change_%s"></a>' % changeCounter)
+ changeCounter += 1
# The removed tokens are first passed through addOpen(), then tagged with the
# 'delete' class; the replacement tokens are tagged with the 'insert' class.
+ withopen = addOpen(a[e[1]:e[2]])
+ out.append(annotateTags(withopen, 'delete'));
+ out.append(annotateTags(b[e[3]:e[4]], 'insert'));
+ gap = 0
elif e[0] == "delete":
- out.append('<a name="change"><span class="del" style="background-color: Pink; text-decoration: line-through;">' + ''.join(a[e[1]:e[2]]) + "</span></a>")
+ if gap > wait:
+ out.append('<a name="change_%s"></a>' % changeCounter)
+ changeCounter += 1
+ out.append(annotateTags(a[e[1]:e[2]], 'delete'));
+ gap = 0
elif e[0] == "insert":
- out.append('<a name="change"><span class="ins" style="background-color: PaleGreen;">'+''.join(b[e[3]:e[4]]) + "</span></a>")
+ if gap > wait:
+ out.append('<a name="change_%s"></a>' % changeCounter)
+ changeCounter += 1
+ out.append(annotateTags(b[e[3]:e[4]], 'insert'));
+ gap = 0
elif e[0] == "equal":
+ gap += 1
# NOTE(review): the rest of the "equal" branch and the else: guarding this
# raise are not visible here. Raising a plain string and the backtick repr
# are Python 2-only constructs (string exceptions were removed in Python 2.6).
raise "Um, something's broken. I didn't expect a '" + `e[0]` + "'."
# The removed return gave back only the joined HTML string; the added return
# gives back a two-item list: [joined HTML, changeCounter].
- return ''.join(out)
+ return [''.join(out), changeCounter]
def addOpen(htmllist):
    """Prepend an opening tag for every unmatched closing tag in htmllist.

    htmllist is a sequence of strings, each either a tag or a piece of
    text. A closing tag is "unmatched" when no opening tag is pending at
    the point where it appears. For each such tag the "/" is removed to
    form a bare opener (</div> -> <div>), and the openers are placed in
    front of the original items in reverse order of appearance so that the
    result nests correctly.

    Returns a new list; htmllist itself is not modified.
    """
    items = list(htmllist)
    unmatched = []  # closing tags met while no opener was pending
    pending = []    # closing tags expected for the openers seen so far

    for item in items:
        if "<" not in item:
            continue  # plain text
        if item.lstrip().startswith(("<!", "<?")):
            continue  # comments, doctypes and processing instructions
        if "</" in item:
            if not pending:
                unmatched.append(item)
            elif pending[-1] == item:
                pending.pop()
            else:
                # Heuristic kept as-is: a closer that does not match the
                # innermost pending opener is parked on the stack rather
                # than reported as unmatched.
                pending.append(item)
        elif item.rstrip().endswith("/>"):
            continue  # self-closing tags never get a separate closer
        else:
            # Turn '<p class="x">' into the closer we expect to meet: '</p>'.
            # DOTALL lets the pattern swallow attributes that span lines.
            pending.append(
                re.sub(r'<(\w+).*', r'</\g<1>>', item, flags=re.DOTALL))

    # A list comprehension (not map) so the result is a real list on any
    # Python version.
    openers = [tag.replace('/', '') for tag in reversed(unmatched)]
    return openers + items
def annotateTags(htmllist, className):
    """Mark every opening tag in htmllist with className and join the result.

    Each item is passed through addClass(). If the first item is not a tag
    (it contains no "<"), it is wrapped in a <span class="className"> so the
    leading text is marked as well. Only the first item gets this
    treatment.

    Returns the joined string, or '' for an empty htmllist.
    """
    if not htmllist:
        return ''
    # Built as a real list (not map) so the first element can be replaced
    # on any Python version.
    res = [addClass(tag, className) for tag in htmllist]
    if '<' not in htmllist[0]:
        res[0] = '<span class="%s">%s</span>' % (className, htmllist[0])
    return ''.join(res)
# Matches a real class attribute (preceded by whitespace) whose value is
# double-quoted, single-quoted or unquoted. Exactly one group is non-None.
_CLASS_ATTR_RE = re.compile(
    r'''(?<=\s)class\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'>]+))''',
    re.IGNORECASE)


def addClass(tag, newClass):
    """Add newClass to the class attribute of an opening HTML tag.

    Examples:
        <a id="idp9794432">               -> <a id="idp9794432" class=" test">
        <a id="idp9794432" class="test">  -> <a id="idp9794432" class="test test2">
        <br/>                             -> <br class=" test"/>

    Text, closing tags, comments, doctypes and processing instructions are
    returned unchanged. An existing class attribute is detected as an
    attribute, not by searching for the substring "class", so a tag such as
    <a href="classes.html"> still receives a new class attribute.
    """
    # Text nodes and closing tags carry no attributes.
    if "<" not in tag or "</" in tag:
        return tag
    # Comments, doctypes and processing instructions cannot take attributes.
    if tag.lstrip().startswith(('<!', '<?')):
        return tag

    match = _CLASS_ATTR_RE.search(tag)
    if match:
        value = [g for g in match.groups() if g is not None][0]
        # Re-quote with double quotes unless the value itself contains one.
        quote = "'" if '"' in value else '"'
        merged = 'class=%s%s %s%s' % (quote, value, newClass, quote)
        return tag[:match.start()] + merged + tag[match.end():]

    # No class attribute yet: insert one just before the closing '>' (or
    # before '/>' for a self-closing tag).
    end = tag.rfind('>')
    if end == -1:
        return tag  # malformed tag, leave untouched
    head = tag[:end]
    if head.endswith('/'):
        end -= 1
        head = tag[:end].rstrip()
    # The leading space inside the quotes matches the output this function
    # has always produced for tags without a class attribute.
    return '%s class=" %s"%s' % (head, newClass, tag[end:])
def html2list(x, b=0):
# NOTE(review): only the first and last statements of this function are
# visible here; `out` is assigned in lines that are not shown.
mode = 'char'
# NOTE(review): `is not ''` is an identity test, not an equality test, so it
# only drops empty strings that are the same object as the '' literal.
# Presumably `!= ''` was intended; confirm before changing.
return filter(lambda x: x is not '', out)
def test():
    """Run the built-in self-checks for addClass, addOpen and annotateTags.

    Prints one "SUCCESS: ..." or "FAIL: ..." line per case and returns the
    number of failed cases (0 when everything passed).
    """
    # (function, positional arguments, expected result)
    cases = (
        (addClass,
         ('<a id="idp9794432">', 'test'),
         '<a id="idp9794432" class=" test">'),
        (addClass,
         ('<a id="idp9794432" class="test">', 'test2'),
         '<a id="idp9794432" class="test test2">'),
        (addOpen,
         (['hello', '</div>', '</p>'],),
         ['<p>', '<div>', 'hello', '</div>', '</p>']),
        (addOpen,
         (['hello', '</div>', '<p class="test">', 'test', '</p>', '</p>'],),
         ['<p>', '<div>', 'hello', '</div>', '<p class="test">', 'test',
          '</p>', '</p>']),
        (addOpen,
         (['0', '</p>', '</div>', '</div>', '<div>', 'test', '</div>'],),
         ['<div>', '<div>', '<p>', '0', '</p>', '</div>', '</div>', '<div>',
          'test', '</div>']),
        (annotateTags,
         (['<div class="hello">', '0', '</div>'], 'test'),
         '<div class="hello test">0</div>'),
        (addOpen,
         (['0', '</p>', '</div>', '</div>', '<div>', '<div hello="world">',
           'test', '</div>', '</div>', '</div>'],),
         ['<div>', '<div>', '<div>', '<p>', '0', '</p>', '</div>', '</div>',
          '<div>', '<div hello="world">', 'test', '</div>', '</div>',
          '</div>']),
    )

    # print(...) with a single string argument behaves the same under the
    # Python 2 print statement and the Python 3 print function.
    print("\nrunning tests...")
    failures = 0
    for func, args, expected in cases:
        actual = func(*args)
        if actual == expected:
            print("SUCCESS: %s == %s" % (actual, expected))
        else:
            failures += 1
            print("FAIL: %s != %s" % (actual, expected))
    return failures
def jsHeader(changeCounter):
    """Return the <script> element that lists the detected changes.

    The script stores changeCounter in the JavaScript variable `changes`
    and defines showChanges(). That function fills the element whose id is
    "changeDiffs" with one link per change (href "#change_0" up to
    "#change_<changeCounter - 1>") and then appends a note that the
    changes were detected automatically.

    The element is closed with </script> so that markup placed after the
    returned string is not parsed as script text.
    """
    jscript = """
<script type='text/javascript'>
  var changes = %s;
  function showChanges()
  {
    var diffList, link, warn;
    diffList=document.getElementById('changeDiffs');
    for (var i = 0; i < changes; i += 1) {
      if (i == 0) {
        diffList.innerHTML = "";
      }
      link = document.createElement('a');
      link.setAttribute('class', 'change_link');
      link.setAttribute('href', '#change_'+i);
      link.innerHTML = ' change #'+i;
      diffList.appendChild(link);
    }
    warn = document.createTextNode("NOTE: these changes are automatically detected and may be inaccurate.");
    diffList.appendChild(warn);
  }
</script>
""" % (changeCounter,)
    return jscript
def cssHeader():
    """Return the <style> element with the rules used by the diff output.

    Defines the .delete rule (pink background, struck through), the
    .insert rule (pale green background) and the .change_link rule
    (5px padding).
    """
    lines = (
        '<style type="text/css">',
        '.delete {',
        'background-color: Pink; text-decoration: line-through;',
        '}',
        '.insert {',
        'background-color: PaleGreen;',
        '}',
        '.change_link {',
        'padding:5px;',
        '}',
        '</style>',
        '',  # keeps the trailing newline after </style>
    )
    return "\n".join(lines)
if __name__ == '__main__':
import sys
# NOTE(review): the try: that pairs with this except, and the code that
# assigns a and b, are not visible here.
except ValueError:
print "htmldiff: highlight the differences between two html files"
print "usage: " + sys.argv[0] + " a b"
# Runs the built-in self-checks, which print one SUCCESS/FAIL line per case.
+ test()
- print textDiff(open(a).read(), open(b).read())
# This initial 0 is overwritten by the tuple unpacking on the next line.
+ changeCounter = 0
# NOTE(review): the two open() handles are never explicitly closed.
+ (diffxhtml, changeCounter) = textDiff(open(a).read(), open(b).read())
+ # it is, regrettably, possible that the result of textDiff is not
+ # valid xhtml. I have noticed issues with nested <p>'s for example.
+ # Because of this, search and replace to insert our css, js and dom
+ # elements:
# str.replace only matches the exact text '</head><body>'; if the diff output
# has anything between the two tags (whitespace, attributes on <body>), the
# CSS, script and "changeDiffs" element are not inserted.
+ diffxhtml = diffxhtml.replace('</head><body>', cssHeader() + jsHeader(changeCounter) +
+ '</head><body onload="showChanges()"><div id="changeDiffs">No changes detected.</div>')
+ print diffxhtml