tools/diff.py

   1 #!/usr/bin/python
   2 """HTML Diff: http://www.aaronsw.com/2002/diff
   3 Rough code, badly documented. Send me comments and patches."""
   4
   5 __author__ = 'Aaron Swartz <me@aaronsw.com>, \
   6 Richard Henwood <rhenwood@whamcloud.com>'
   7 __copyright__ = '(C) 2003 Aaron Swartz, (C) 2011,2012 Whamcloud, Inc \
   8 GNU GPL 2 or 3.'
   9 __version__ = '1.0'
  10
  11 import difflib
  12 import string
  13 import re
  14
  15 def isTag(x): return x[0] == "<" and x[-1] == ">"
  16
  17 def textDiff(a, b):
  18     """Takes in strings a and b and returns a human-readable HTML diff."""
  19     changeCounter = 0
  20
  21     out = []
  22     a, b = html2list(a), html2list(b)
  23     s = difflib.SequenceMatcher(None, a, b)
  24     gap = 0;
  25     wait = 7; # fudge factor to avoid having too many changes anchors.
  26     for e in s.get_opcodes():
  27         if e[0] == "replace":
  28             # @@ need to do something more complicated here
  29             # call textDiff but not for html, but for some html... ugh
  30             # gonna cop-out for now
  31             # I've put this hack in to try accomodate id's generated by
  32             # docbook compilation. RJH 30/12/2011
  33             if ('<a id="id' in a[e[1]:e[2]][0] or
  34                     '<a href="#id' in a[e[1]:e[2]][0] or
  35                     '<a id="ftn.id' in a[e[1]:e[2]][0] or
  36                     '<a class="indexterm" href="#id' in a[e[1]:e[2]][0] or
  37                     '<a id="id' in b[e[3]:e[4]][0] or
  38                     '<a href="#id' in b[e[3]:e[4]][0] or
  39                     '<a id="ftn.id' in b[e[3]:e[4]][0] or
  40                     '<a class="indexterm" href="#id' in b[e[3]:e[4]][0]):
  41                 out.append(''.join(b[e[3]:e[4]]))
  42             else:
  43                 if gap > wait:
  44                     out.append('<a name="change_%s"></a>' % changeCounter)
  45                     changeCounter += 1
  46                 withopen = addOpen(a[e[1]:e[2]])
  47                 out.append(annotateTags(withopen, 'delete'));
  48                 out.append(annotateTags(b[e[3]:e[4]], 'insert'));
  49                 gap = 0
  50         elif e[0] == "delete":
  51             if gap > wait:
  52                 out.append('<a name="change_%s"></a>' % changeCounter)
  53                 changeCounter += 1
  54             out.append(annotateTags(a[e[1]:e[2]], 'delete'));
  55             gap = 0
  56         elif e[0] == "insert":
  57             if gap > wait:
  58                 out.append('<a name="change_%s"></a>' % changeCounter)
  59                 changeCounter += 1
  60             out.append(annotateTags(b[e[3]:e[4]], 'insert'));
  61             gap = 0
  62         elif e[0] == "equal":
  63             out.append(''.join(b[e[3]:e[4]]))
  64             gap += 1
  65         else:
  66             raise "Um, something's broken. I didn't expect a '" + `e[0]` + "'."
  67     return [''.join(out), changeCounter]
  68
  69 # this functiona finds all unmatched
  70 # close tags in htmllist and prepends
  71 # corresponding open tags to htmllist.
  72 def addOpen(htmllist):
  73     unmatched = []
  74     tmpstack = []
  75     for item in htmllist:
  76         if "<" in item:
  77             if "</" in item:
  78                 try:
  79                     if tmpstack[-1] == item:
  80                         tmpstack.pop()
  81                     else:
  82                         tmpstack.append(item)
  83                 except IndexError:
  84                     unmatched.append(item)
  85                     pass
  86                 pass
  87             else:
  88                 tmpstack.append(re.sub(r'<(\w+).*', r'</\g<1>>', item, flags=re.IGNORECASE))
  89                 pass
  90         else:
  91             pass
  92     unmatched = map(lambda x: x.replace('/', ''), unmatched)
  93     unmatched.reverse()
  94     return unmatched + htmllist
  95
  96 def annotateTags(htmllist, className):
  97     res = map(lambda tag: addClass(tag, className), htmllist)
  98     #the first element should be surrounded with a span + class
  99     #provided it is not a tag.
 100     if '<' not in htmllist[0]:
 101         res[0] = '<span class="%s">%s</span>' % (className, htmllist[0])
 102     return ''.join(res)
 103
 104 # this method adds a new class to a html tag. i.e.
 105 # <a id="idp9794432"> -> <a id="idp9794432" class='test'>
 106 # <a id="idp9794432" class='test'> -> <a id="idp9794432" class='test test2'>
 107 def addClass (tag, newClass):
 108     if "<" in tag and "</" not in tag:
 109         if 'class' not in tag:
 110             # assume all tags end in '>', so chop it off:
 111             tag = tag[:-1]
 112             # add in an empty class element.
 113             tag = '%s class="">' % tag
 114         tag = re.sub(r'(.*)class=[\'"]([ \w]*)[\'"]',
 115                 r'\g<1>class="\g<2> '+newClass+'"', tag , flags=re.IGNORECASE)
 116     return tag
 117
 118 def html2list(x, b=0):
 119     mode = 'char'
 120     cur = ''
 121     out = []
 122     for c in x:
 123         if mode == 'tag':
 124             if c == '>':
 125                 if b: cur += ']'
 126                 else: cur += c
 127                 out.append(cur); cur = ''; mode = 'char'
 128             else: cur += c
 129         elif mode == 'char':
 130             if c == '<':
 131                 out.append(cur)
 132                 if b: cur = '['
 133                 else: cur = c
 134                 mode = 'tag'
 135             elif c in string.whitespace: out.append(cur+c); cur = ''
 136             else: cur += c
 137     out.append(cur)
 138     return filter(lambda x: x is not '', out)
 139
 140 def test ():
 141     print "\nrunning tests..."
 142     test1res = addClass('<a id="idp9794432">', 'test')
 143     test1suc = '<a id="idp9794432" class=" test">'
 144     if test1res == test1suc:
 145         print "SUCCESS: %s == %s" % (test1res, test1suc)
 146     else:
 147         print "FAIL: %s != %s" % (test1res, test1suc)
 148     test2res = addClass('<a id="idp9794432" class="test">', 'test2')
 149     test2suc = '<a id="idp9794432" class="test test2">'
 150     if test2res == test2suc:
 151         print "SUCCESS: %s == %s" % (test2res, test2suc)
 152     else:
 153         print "FAIL: %s != %s" % (test2res, test2suc)
 154
 155     test3res = addOpen(['hello','</div>', '</p>'])
 156     test3suc = ['<p>', '<div>', 'hello','</div>', '</p>']
 157     if test3res == test3suc:
 158         print "SUCCESS: %s == %s" % (test3res, test3suc)
 159     else:
 160         print "FAIL: %s != %s" % (test3res, test3suc)
 161
 162     test4res = addOpen(['hello','</div>', '<p class="test">', 'test', '</p>', '</p>'])
 163     test4suc = ['<p>', '<div>', 'hello','</div>', '<p class="test">', 'test', '</p>', '</p>']
 164     if test4res == test4suc:
 165         print "SUCCESS: %s == %s" % (test4res, test4suc)
 166     else:
 167         print "FAIL: %s != %s" % (test4res, test4suc)
 168
 169     test5res = addOpen(['0','</p>', '</div>', '</div>', '<div>', 'test', '</div>'])
 170     test5suc = ['<div>', '<div>', '<p>', '0','</p>', '</div>', '</div>', '<div>', 'test', '</div>']
 171     if test5res == test5suc:
 172         print "SUCCESS: %s == %s" % (test5res, test5suc)
 173     else:
 174         print "FAIL: %s != %s" % (test5res, test5suc)
 175
 176     test6res = annotateTags(['<div class="hello">', '0', '</div>'], 'test')
 177     test6suc = '<div class="hello test">0</div>'
 178     if test6res == test6suc:
 179         print "SUCCESS: %s == %s" % (test6res, test6suc)
 180     else:
 181         print "FAIL: %s != %s" % (test6res, test6suc)
 182
 183     test7res = addOpen(['0','</p>', '</div>', '</div>', '<div>', '<div hello="world">', 'test', '</div>', '</div>', '</div>'])
 184     test7suc = ['<div>', '<div>', '<div>', '<p>', '0','</p>', '</div>', '</div>', '<div>', '<div hello="world">', 'test', '</div>', '</div>', '</div>']
 185     if test7res == test7suc:
 186         print "SUCCESS: %s == %s" % (test7res, test7suc)
 187     else:
 188         print "FAIL: %s != %s" % (test7res, test7suc)
 189
 190     pass
 191
 192 def jsHeader (changeCounter):
 193     jscript = """
 194 <script type='text/javascript'>
 195     var changes = %s;
 196     function showChanges()
 197     {
 198         var diffList, link, warn;
 199         diffList=document.getElementById('changeDiffs');
 200         for (var i = 0; i < changes; i += 1) {
 201             if (i == 0) {
 202                 diffList.innerHTML = "";
 203             }
 204             link = document.createElement('a');
 205             link.setAttribute('class', 'change_link');
 206             link.setAttribute('href', '#change_'+i);
 207             link.innerHTML = ' change #'+i;
 208             diffList.appendChild(link);
 209         }
 210         warn = document.createTextNode("NOTE: these changes are automatically detected and may be inaccurate.");
 211         diffList.appendChild(warn);
 212     }
 213 </script>
 214     """ % changeCounter
 215     return jscript;
 216
 217 def cssHeader ():
 218
 219     css = """<style type="text/css">
 220     .delete {
 221     background-color: Pink; text-decoration: line-through;
 222     }
 223
 224     .insert {
 225     background-color: PaleGreen;
 226     }
 227
 228     .change_link {
 229     padding:5px;
 230     }
 231
 232     </style>
 233     """
 234     return css
 235
 236
 237 if __name__ == '__main__':
 238     import sys
 239     try:
 240         a, b = sys.argv[1:3]
 241     except ValueError:
 242         print "htmldiff: highlight the differences between two html files"
 243         print "usage: " + sys.argv[0] + " a b"
 244         test()
 245         sys.exit(1)
 246     changeCounter = 0
 247     (diffxhtml, changeCounter) = textDiff(open(a).read(), open(b).read())
 248     # it is, regrettably, possible that the result of textDiff is not
 249     # valid xhtml. I have noticed issues with nested <p>'s for example.
 250     # Because of this, search and replace to insert our css, js and dom
 251     # elements:
 252     diffxhtml = diffxhtml.replace('</head><body>', cssHeader() + jsHeader(changeCounter) +
 253             '</head><body onload="showChanges()"><div id="changeDiffs">No changes detected.</div>')
 254     print diffxhtml