#!/usr/bin/python
"""HTML Diff: http://www.aaronsw.com/2002/diff
Rough code, badly documented. Send me comments and patches."""

# NOTE(review): the author contact details look truncated in this copy
# (dangling space before the comma) -- restore them from upstream if known.
__author__ = ('Aaron Swartz , '
              'Richard Henwood ')
__copyright__ = ('(C) 2003 Aaron Swartz, (C) 2011,2012 Whamcloud, Inc '
                 'GNU GPL 2 or 3.')
__version__ = '1.0'

import difflib
import string
import re


def isTag(x):
    """Return True if string *x* starts with "<" and ends with ">".

    An empty string is reported as "not a tag" instead of raising
    IndexError, which indexing x[0] / x[-1] would do.
    """
    return x.startswith("<") and x.endswith(">")


def textDiff(a, b):
    """Take strings a and b and return a human-readable HTML diff.

    Both inputs are tokenised with html2list() and compared with
    difflib.SequenceMatcher. Unchanged runs are copied through from *b*;
    removed tokens are passed to annotateTags(..., 'delete') and added
    tokens to annotateTags(..., 'insert'). For replacements the removed
    tokens first go through addOpen().

    A change anchor is emitted in front of a change only when more than
    `wait` unchanged runs have been seen since the previous change, so
    that closely spaced edits share one anchor.

    Returns:
        A two-element list ``[html, changeCounter]``: the joined output
        string and the number of change anchors that were emitted.

    Raises:
        ValueError: if SequenceMatcher reports an opcode other than
            'replace', 'delete', 'insert' or 'equal'.
    """
    changeCounter = 0
    out = []
    # list() keeps the slicing below valid even when html2list hands back
    # a lazy iterable rather than a list.
    a, b = list(html2list(a)), list(html2list(b))
    matcher = difflib.SequenceMatcher(None, a, b)
    gap = 0
    wait = 7  # fudge factor to avoid having too many change anchors.

    # NOTE(review): the anchor markup was missing from this copy of the file
    # (the literal was '' and '' % changeCounter raises TypeError). The
    # value below is a stand-in -- TODO confirm against the pristine source
    # and against whatever jsHeader() expects to find.
    anchor = '<a name="change%d"></a>'

    for tag, a_lo, a_hi, b_lo, b_hi in matcher.get_opcodes():
        if tag == "equal":
            out.append(''.join(b[b_lo:b_hi]))
            gap += 1
            continue

        if tag not in ("replace", "delete", "insert"):
            # String exceptions are not raisable; use a real exception type.
            raise ValueError(
                "Um, something's broken. I didn't expect a '"
                + repr(tag) + "'.")

        if gap > wait:
            out.append(anchor % changeCounter)
            changeCounter += 1

        if tag == "replace":
            # @@ need to do something more complicated here
            # call textDiff but not for html, but for some html... ugh
            # gonna cop-out for now
            # I've put this hack in to try accommodate id's generated by
            # docbook compilation. RJH 30/12/2011
            # NOTE(review): the condition implementing that hack is missing
            # from this copy; this branch presumably used the same
            # gap > wait anchor rule as delete/insert -- TODO restore the
            # original check from upstream.
            withopen = addOpen(a[a_lo:a_hi])
            out.append(annotateTags(withopen, 'delete'))
            out.append(annotateTags(b[b_lo:b_hi], 'insert'))
        elif tag == "delete":
            out.append(annotateTags(a[a_lo:a_hi], 'delete'))
        else:  # "insert"
            out.append(annotateTags(b[b_lo:b_hi], 'insert'))
        gap = 0

    return [''.join(out), changeCounter]


# this function finds all unmatched
# close tags in htmllist and prepends
# corresponding open tags to htmllist.
# Helper functions (addOpen, annotateTags, addClass, html2list), an ad-hoc
# self-test (test), the page header builders (jsHeader, cssHeader) and the
# command-line entry point.
#
# NOTE(review): this stretch looks extraction-damaged and is not runnable as
# it stands -- several statements share one physical line and markup inside
# string literals appears to be missing. Visible symptoms: annotateTags
# formats two values into '%s'; jsHeader applies % to a literal without a
# placeholder; the conditions in addOpen and addClass stop mid-string; the
# expected values in test() are blank or broken across lines. Restore these
# from a pristine copy instead of guessing -- TODO confirm against upstream.
#
# NOTE(review): the code uses Python 2 constructs (print statements; map()
# results that are indexed and reversed like lists). html2list filters with
# "x is not ''", an identity test against a literal; "!=" is presumably what
# was meant -- verify before changing.
def addOpen(htmllist): unmatched = [] tmpstack = [] for item in htmllist: if "<" in item: if ">', item, flags=re.IGNORECASE)) pass else: pass unmatched = map(lambda x: x.replace('/', ''), unmatched) unmatched.reverse() return unmatched + htmllist def annotateTags(htmllist, className): res = map(lambda tag: addClass(tag, className), htmllist) #the first element should be surrounded with a span + class #provided it is not a tag. if '<' not in htmllist[0]: res[0] = '%s' % (className, htmllist[0]) return ''.join(res) # this method adds a new class to a html tag. i.e. # -> # -> def addClass (tag, newClass): if "<" in tag and "', so chop it off: tag = tag[:-1] # add in an empty class element. tag = '%s class="">' % tag tag = re.sub(r'(.*)class=[\'"]([ \w]*)[\'"]', r'\g<1>class="\g<2> '+newClass+'"', tag , flags=re.IGNORECASE) return tag def html2list(x, b=0): mode = 'char' cur = '' out = [] for c in x: if mode == 'tag': if c == '>': if b: cur += ']' else: cur += c out.append(cur); cur = ''; mode = 'char' else: cur += c elif mode == 'char': if c == '<': out.append(cur) if b: cur = '[' else: cur = c mode = 'tag' elif c in string.whitespace: out.append(cur+c); cur = '' else: cur += c out.append(cur) return filter(lambda x: x is not '', out) def test (): print "\nrunning tests..." test1res = addClass('', 'test') test1suc = '' if test1res == test1suc: print "SUCCESS: %s == %s" % (test1res, test1suc) else: print "FAIL: %s != %s" % (test1res, test1suc) test2res = addClass('', 'test2') test2suc = '' if test2res == test2suc: print "SUCCESS: %s == %s" % (test2res, test2suc) else: print "FAIL: %s != %s" % (test2res, test2suc) test3res = addOpen(['hello','', '

']) test3suc = ['

', '

', 'hello','
', '

'] if test3res == test3suc: print "SUCCESS: %s == %s" % (test3res, test3suc) else: print "FAIL: %s != %s" % (test3res, test3suc) test4res = addOpen(['hello','', '

', 'test', '

', '

']) test4suc = ['

', '

', 'hello','
', '

', 'test', '

', '

'] if test4res == test4suc: print "SUCCESS: %s == %s" % (test4res, test4suc) else: print "FAIL: %s != %s" % (test4res, test4suc) test5res = addOpen(['0','

', '', '', '
', 'test', '
']) test5suc = ['
', '
', '

', '0','

', '
', '
', '
', 'test', '
'] if test5res == test5suc: print "SUCCESS: %s == %s" % (test5res, test5suc) else: print "FAIL: %s != %s" % (test5res, test5suc) test6res = annotateTags(['
', '0', '
'], 'test') test6suc = '
0
' if test6res == test6suc: print "SUCCESS: %s == %s" % (test6res, test6suc) else: print "FAIL: %s != %s" % (test6res, test6suc) test7res = addOpen(['0','

', '', '', '
', '
', 'test', '
', '
', '']) test7suc = ['
', '
', '
', '

', '0','

', '
', '
', '
', '
', 'test', '
', '
', '
'] if test7res == test7suc: print "SUCCESS: %s == %s" % (test7res, test7suc) else: print "FAIL: %s != %s" % (test7res, test7suc) pass def jsHeader (changeCounter): jscript = """ """ % changeCounter return jscript; def cssHeader (): css = """ """ return css if __name__ == '__main__': import sys try: a, b = sys.argv[1:3] except ValueError: print "htmldiff: highlight the differences between two html files" print "usage: " + sys.argv[0] + " a b" test() sys.exit(1) changeCounter = 0 (diffxhtml, changeCounter) = textDiff(open(a).read(), open(b).read()) # it is, regrettably, possible that the result of textDiff is not # valid xhtml. I have noticed issues with nested

's for example. # Because of this, search and replace to insert our css, js and dom # elements: diffxhtml = diffxhtml.replace('', cssHeader() + jsHeader(changeCounter) + '

No changes detected.
') print diffxhtml