#!/usr/bin/python
"""HTML Diff: http://www.aaronsw.com/2002/diff
Rough code, badly documented. Send me comments and patches."""
__author__ = 'Aaron Swartz , \
Richard Henwood '
__copyright__ = '(C) 2003 Aaron Swartz, (C) 2011,2012 Whamcloud, Inc \
GNU GPL 2 or 3.'
__version__ = '1.0'
import difflib
import string
import re
def isTag(x):
    """Return True when the string x starts with "<" and ends with ">".

    Slices are used instead of x[0] / x[-1] so that an empty string gives
    False rather than raising IndexError.
    """
    return x[:1] == "<" and x[-1:] == ">"
def textDiff(a, b):
"""Takes in strings a and b and returns a human-readable HTML diff.

Both inputs are tokenised with html2list and compared with
difflib.SequenceMatcher; the output is built up opcode by opcode.

Returns a two-element list: the joined output string and the final
value of changeCounter.
"""
changeCounter = 0
out = []
a, b = html2list(a), html2list(b)
s = difflib.SequenceMatcher(None, a, b)
# gap counts the "equal" opcodes seen since the last change; it is reset
# to 0 after every replace, delete or insert.
gap = 0;
wait = 7; # fudge factor to avoid having too many changes anchors.
for e in s.get_opcodes():
if e[0] == "replace":
# @@ need to do something more complicated here
# call textDiff but not for html, but for some html... ugh
# gonna cop-out for now
# I've put this hack in to try to accommodate id's generated by
# docbook compilation. RJH 30/12/2011
# NOTE(review): the next line is not a complete statement as written
# (no colon, an empty format string, and no "gap > wait" guard like the
# delete and insert branches have). Part of it appears to be missing --
# restore it from the upstream source before running.
if ('' % changeCounter)
changeCounter += 1
# Old tokens get open tags prepended for any unmatched close tags, then
# are marked 'delete'; the new tokens are marked 'insert'.
withopen = addOpen(a[e[1]:e[2]])
out.append(annotateTags(withopen, 'delete'));
out.append(annotateTags(b[e[3]:e[4]], 'insert'));
gap = 0
elif e[0] == "delete":
if gap > wait:
# NOTE(review): '' % changeCounter has no conversion specifier;
# presumably this literal was an anchor carrying the change number --
# confirm against upstream.
out.append('' % changeCounter)
changeCounter += 1
out.append(annotateTags(a[e[1]:e[2]], 'delete'));
gap = 0
elif e[0] == "insert":
if gap > wait:
# NOTE(review): same empty template as in the delete branch.
out.append('' % changeCounter)
changeCounter += 1
out.append(annotateTags(b[e[3]:e[4]], 'insert'));
gap = 0
elif e[0] == "equal":
# Unchanged tokens are copied through from b without annotation.
out.append(''.join(b[e[3]:e[4]]))
gap += 1
else:
# NOTE(review): raising a str and the backtick repr are Python 2-only
# forms.
raise "Um, something's broken. I didn't expect a '" + `e[0]` + "'."
return [''.join(out), changeCounter]
def addOpen(htmllist):
    """Prepend an open tag for every unmatched close tag in htmllist.

    htmllist is a list of tokens (tags and text). The list is scanned once:
    each open tag pushes the close tag it expects, and each close tag either
    pops a matching expectation or, when nothing is pending, is recorded as
    unmatched.

    Returns a new list made of the open-tag counterparts of the unmatched
    close tags (innermost last) followed by the original tokens. htmllist
    itself is not modified.
    """
    unmatched = []
    # Close tags still expected for the open tags seen so far.
    pending = []
    for item in htmllist:
        if "<" not in item:
            continue
        # The close-tag marker must be "</"; an empty string is a substring
        # of everything and would send every tag down this branch.
        if "</" in item:
            if not pending:
                unmatched.append(item)
            elif pending[-1] == item:
                pending.pop()
            else:
                # A close tag that does not match the innermost pending one
                # is pushed onto the stack as-is.
                pending.append(item)
        else:
            # Turn '<name attrs...>' into '</name>' so it can be compared
            # directly with the close tag when that is met.
            pending.append(re.sub(r'<(\w+).*', r'</\g<1>>', item,
                                  flags=re.IGNORECASE))
    # Dropping the slash turns each unmatched close tag into its open tag.
    opens = [tag.replace('/', '') for tag in unmatched]
    opens.reverse()
    return opens + htmllist
def annotateTags(htmllist, className):
    """Mark every token in htmllist with className and join them.

    Each token is passed through addClass. In addition, when the first
    token is not a tag (contains no "<") it is wrapped in a span carrying
    className.

    Returns the joined string; an empty htmllist gives ''.
    """
    if not htmllist:
        return ''
    res = [addClass(tag, className) for tag in htmllist]
    # the first element should be surrounded with a span + class
    # provided it is not a tag.
    if '<' not in htmllist[0]:
        res[0] = '<span class="%s">%s</span>' % (className, htmllist[0])
    return ''.join(res)
def addClass(tag, newClass):
    """Append newClass to the class attribute of an opening HTML tag.

    Examples:
        <p>            ->  <p class=" newClass">
        <p class="a">  ->  <p class="a newClass">
        <br/>          ->  <br class=" newClass"/>

    Close tags (anything containing "</") and plain text are returned
    unchanged. Returns the resulting string.
    """
    if "<" in tag and "</" not in tag:
        if 'class' not in tag:
            # assume all tags end in '>' (or '/>' when self-closing) and
            # splice an empty class attribute in just before that closer.
            closer = '/>' if tag.endswith('/>') else '>'
            tag = '%s class=""%s' % (tag[:-len(closer)], closer)
        # Append the new class after whatever the attribute already holds.
        tag = re.sub(r'(.*)class=[\'"]([ \w]*)[\'"]',
                     r'\g<1>class="\g<2> ' + newClass + '"', tag,
                     flags=re.IGNORECASE)
    return tag
def html2list(x, b=0):
    """Split the string x into a list of tags and words.

    A tag runs from "<" to the next ">" and becomes one item. Outside tags,
    text is cut after every whitespace character, and that character stays
    attached to the end of its word. Empty items are dropped.

    When b is true, the "<" and ">" delimiting each tag are written as "["
    and "]" in the output.

    Returns a list of strings.
    """
    mode = 'char'
    cur = ''
    out = []
    for c in x:
        if mode == 'tag':
            if c == '>':
                cur += ']' if b else c
                out.append(cur)
                cur = ''
                mode = 'char'
            else:
                cur += c
        elif mode == 'char':
            if c == '<':
                # Flush the text collected so far and start a new tag.
                out.append(cur)
                cur = '[' if b else c
                mode = 'tag'
            elif c in string.whitespace:
                out.append(cur + c)
                cur = ''
            else:
                cur += c
    # Whatever is left (trailing text or an unterminated tag) is kept.
    out.append(cur)
    # Compare by value: identity against a '' literal is not a reliable
    # emptiness test.
    return [piece for piece in out if piece != '']
def test ():
"""Run the built-in self-checks and print SUCCESS or FAIL for each one.

Seven comparisons are made: two against addClass, four against addOpen
and one against annotateTags. Nothing is returned and a failing
comparison is only reported on stdout.
"""
# NOTE(review): several fixture literals below are empty or broken across
# lines and look like they have lost their markup -- restore them from the
# upstream source before relying on these checks.
print "\nrunning tests..."
# addClass checks.
test1res = addClass('', 'test')
test1suc = ''
if test1res == test1suc:
print "SUCCESS: %s == %s" % (test1res, test1suc)
else:
print "FAIL: %s != %s" % (test1res, test1suc)
test2res = addClass('', 'test2')
test2suc = ''
if test2res == test2suc:
print "SUCCESS: %s == %s" % (test2res, test2suc)
else:
print "FAIL: %s != %s" % (test2res, test2suc)
# addOpen checks.
test3res = addOpen(['hello','', '
'])
test3suc = ['', '
', 'hello','
', '']
if test3res == test3suc:
print "SUCCESS: %s == %s" % (test3res, test3suc)
else:
print "FAIL: %s != %s" % (test3res, test3suc)
test4res = addOpen(['hello','', '', 'test', '
', ''])
test4suc = ['', '
', 'hello','
', '', 'test', '
', '']
if test4res == test4suc:
print "SUCCESS: %s == %s" % (test4res, test4suc)
else:
print "FAIL: %s != %s" % (test4res, test4suc)
test5res = addOpen(['0','', '', '', '', 'test', '
'])
test5suc = ['', '', 'test', '
']
if test5res == test5suc:
print "SUCCESS: %s == %s" % (test5res, test5suc)
else:
print "FAIL: %s != %s" % (test5res, test5suc)
# annotateTags check.
test6res = annotateTags(['', '0', '
'], 'test')
test6suc = '0
'
if test6res == test6suc:
print "SUCCESS: %s == %s" % (test6res, test6suc)
else:
print "FAIL: %s != %s" % (test6res, test6suc)
# One more addOpen check.
test7res = addOpen(['0','', '', '', '', ''])
test7suc = ['']
if test7res == test7suc:
print "SUCCESS: %s == %s" % (test7res, test7suc)
else:
print "FAIL: %s != %s" % (test7res, test7suc)
pass
def jsHeader (changeCounter):
"""Return the header template below formatted with changeCounter.

Presumably this is the page's JavaScript, going by the names -- confirm.
"""
# NOTE(review): the template contains only a newline and no conversion
# specifier, so formatting it with a number fails; its content appears to
# be missing -- restore it from the upstream source.
jscript = """
""" % changeCounter
return jscript;
def cssHeader ():
"""Return the header string defined below.

Presumably this is the page's stylesheet, going by the names -- confirm.
"""
# NOTE(review): the literal currently holds only a newline; its content
# appears to be missing -- restore it from the upstream source.
css = """
"""
return css
# Script entry point: diff the two files named on the command line and
# print the result.
if __name__ == '__main__':
import sys
try:
a, b = sys.argv[1:3]
except ValueError:
# Fewer than two arguments were given, so the unpacking above failed:
# show the usage, run the self-checks and exit with status 1.
print "htmldiff: highlight the differences between two html files"
print "usage: " + sys.argv[0] + " a b"
test()
sys.exit(1)
changeCounter = 0
# NOTE(review): both files are opened here without being closed explicitly.
(diffxhtml, changeCounter) = textDiff(open(a).read(), open(b).read())
# it is, regrettably, possible that the result of textDiff is not
# valid xhtml. I have noticed issues with nested 's for example.
# Because of this, search and replace to insert our css, js and dom
# elements:
# NOTE(review): the replace() target and the trailing literal below are
# split across lines and look truncated -- restore them from the upstream
# source.
diffxhtml = diffxhtml.replace('
', cssHeader() + jsHeader(changeCounter) +
'No changes detected.
')
print diffxhtml