2 """HTML Diff: http://www.aaronsw.com/2002/diff
3 Rough code, badly documented. Send me comments and patches."""
5 __author__ = 'Aaron Swartz <me@aaronsw.com>, \
6 Richard Henwood <rhenwood@whamcloud.com>'
7 __copyright__ = '(C) 2003 Aaron Swartz, (C) 2011,2012 Whamcloud, Inc \
def isTag(x):
    """Return True if the string x looks like a complete HTML tag.

    A tag is any string that starts with "<" and ends with ">".
    startswith/endswith are used instead of indexing so that an empty
    string is reported as "not a tag" rather than raising IndexError.
    """
    return x.startswith("<") and x.endswith(">")
18 """Takes in strings a and b and returns a human-readable HTML diff."""
22 a, b = html2list(a), html2list(b)
23 s = difflib.SequenceMatcher(None, a, b)
25 wait = 7; # fudge factor to avoid having too many changes anchors.
26 for e in s.get_opcodes():
28 # @@ need to do something more complicated here
29 # call textDiff but not for html, but for some html... ugh
30 # gonna cop-out for now
# I've put this hack in to try to accommodate the ids generated by
# docbook compilation. RJH 30/12/2011
33 if ('<a id="id' in a[e[1]:e[2]][0] or
34 '<a href="#id' in a[e[1]:e[2]][0] or
35 '<a id="ftn.id' in a[e[1]:e[2]][0] or
36 '<a class="indexterm" href="#id' in a[e[1]:e[2]][0] or
37 '<a id="id' in b[e[3]:e[4]][0] or
38 '<a href="#id' in b[e[3]:e[4]][0] or
39 '<a id="ftn.id' in b[e[3]:e[4]][0] or
40 '<a class="indexterm" href="#id' in b[e[3]:e[4]][0]):
41 out.append(''.join(b[e[3]:e[4]]))
44 out.append('<a name="change_%s"></a>' % changeCounter)
46 withopen = addOpen(a[e[1]:e[2]])
47 out.append(annotateTags(withopen, 'delete'));
48 out.append(annotateTags(b[e[3]:e[4]], 'insert'));
50 elif e[0] == "delete":
52 out.append('<a name="change_%s"></a>' % changeCounter)
54 out.append(annotateTags(a[e[1]:e[2]], 'delete'));
56 elif e[0] == "insert":
58 out.append('<a name="change_%s"></a>' % changeCounter)
60 out.append(annotateTags(b[e[3]:e[4]], 'insert'));
63 out.append(''.join(b[e[3]:e[4]]))
66 raise "Um, something's broken. I didn't expect a '" + `e[0]` + "'."
67 return [''.join(out), changeCounter]
# this function finds all unmatched
# close tags in htmllist and prepends
# the corresponding open tags to htmllist.
def addOpen(htmllist):
    """Prepend an open tag for every unmatched close tag in htmllist.

    A fragment of a document can contain close tags (e.g. '</p>') whose
    open tag is not part of the fragment.  For each such close tag a bare
    open tag ('<p>') is prepended, outermost element first, so that the
    fragment is balanced.

    htmllist -- sequence of string tokens (tags and text).

    Returns a new list: the generated open tags followed by the original
    tokens.  htmllist itself is not modified.
    """
    tokens = list(htmllist)
    open_names = []   # elements opened inside the fragment, innermost last
    unmatched = []    # close tags with no open tag inside the fragment

    for item in tokens:
        if item.startswith('</'):
            found = re.match(r'</\s*([\w:.-]+)', item)
            name = found.group(1).lower() if found else None
            if name in open_names:
                # Closes an element opened in this fragment; anything that
                # was opened after it and never closed is dropped with it.
                while open_names.pop() != name:
                    pass
            else:
                unmatched.append(item)
        elif item.startswith('<') and not item.endswith('/>'):
            # Compare on the tag name only, so attributes (including ones
            # that span several lines) cannot affect the matching.
            # Comments, doctypes and processing instructions have no tag
            # name here and are ignored.
            found = re.match(r'<([\w:.-]+)', item)
            if found:
                open_names.append(found.group(1).lower())

    # The last unmatched close tag belongs to the outermost element, so
    # its open tag has to come first.
    unmatched.reverse()
    # A real list (not a lazy map object) so it can be concatenated.
    opens = [tag.replace('/', '', 1) for tag in unmatched]
    return opens + tokens
def annotateTags(htmllist, className):
    """Mark a run of tokens with className and return it as one string.

    Every token is passed through addClass(token, className).  If the
    first token is not a tag (it contains no '<') it is wrapped in
    <span class="className">...</span> instead.

    htmllist  -- sequence of string tokens (tags and text).
    className -- class name to apply, e.g. 'insert' or 'delete'.

    Returns the annotated tokens joined into a single string; an empty
    htmllist gives an empty string.
    """
    if not htmllist:
        return ''
    # Build a real list: the first element may be replaced below.
    res = [addClass(tag, className) for tag in htmllist]
    # the first element should be surrounded with a span + class
    # provided it is not a tag.
    if '<' not in htmllist[0]:
        res[0] = '<span class="%s">%s</span>' % (className, htmllist[0])
    return ''.join(res)
# this function adds a new class to an html tag, e.g.
# <a id="idp9794432"> -> <a id="idp9794432" class=" test">
# <a id="idp9794432" class="test"> -> <a id="idp9794432" class="test test2">
def addClass (tag, newClass):
    """Return tag with newClass added to its class attribute.

    Only open tags are changed.  Plain text, close tags, comments,
    doctype declarations and processing instructions are returned as
    they are.

    If the tag already has a quoted class attribute, newClass is appended
    to its value:   <a class="test">  ->  <a class="test test2">
    Otherwise a class attribute is added before the end of the tag:
                    <a id="x">        ->  <a id="x" class=" test">
    (the leading space inside the quotes is kept for compatibility with
    existing output).

    tag      -- one string token, normally a single HTML tag.
    newClass -- the class name to add.
    """
    if "<" not in tag or "</" in tag:
        return tag
    if tag.lstrip().startswith(('<!', '<?')):
        return tag

    # Look for a real class attribute (preceded by whitespace), so that
    # values such as id="subclass" are not mistaken for one.  Any
    # characters are accepted inside the quotes, including hyphens.
    found = re.search(r'''(?<=\s)class\s*=\s*(["'])(.*?)\1''', tag,
                      flags=re.IGNORECASE | re.DOTALL)
    if found:
        merged = '%s %s' % (found.group(2), newClass)
        return '%sclass="%s"%s' % (tag[:found.start()], merged,
                                   tag[found.end():])

    # No class attribute yet: insert one just before the end of the tag,
    # keeping the '/' of a self-closing tag after the new attribute.
    if tag.endswith('/>'):
        head, tail = tag[:-2].rstrip(), '/>'
    elif tag.endswith('>'):
        head, tail = tag[:-1], '>'
    else:
        # Not a terminated tag; do not risk chopping real content.
        return tag
    return '%s class=" %s"%s' % (head, newClass, tail)
def html2list(x, b=0):
    """Split the HTML string x into a flat list of tag and word tokens.

    The string is scanned one character at a time in one of two modes:

    'char' -- outside a tag.  Text is cut after every whitespace
              character, so each word keeps its trailing whitespace.
    'tag'  -- between '<' and '>'.  Everything up to and including the
              closing '>' becomes a single token.

    x -- the HTML text to split.
    b -- if true, tags are written with '[' and ']' instead of '<' and
         '>'.

    Returns a list of the non-empty tokens, in document order.
    """
    mode = 'char'
    cur = ''
    out = []
    for c in x:
        if mode == 'tag':
            if c == '>':
                if b:
                    cur += ']'
                else:
                    cur += c
                out.append(cur)
                cur = ''
                mode = 'char'
            else:
                cur += c
        elif mode == 'char':
            if c == '<':
                # Flush the text collected so far and start a tag.
                out.append(cur)
                if b:
                    cur = '['
                else:
                    cur = c
                mode = 'tag'
            elif c in string.whitespace:
                out.append(cur + c)
                cur = ''
            else:
                cur += c
    out.append(cur)
    # Drop empty tokens.  Compare by value (truthiness), not identity, and
    # return a real list so callers can slice and index the result.
    return [token for token in out if token]
141 print "\nrunning tests..."
142 test1res = addClass('<a id="idp9794432">', 'test')
143 test1suc = '<a id="idp9794432" class=" test">'
144 if test1res == test1suc:
145 print "SUCCESS: %s == %s" % (test1res, test1suc)
147 print "FAIL: %s != %s" % (test1res, test1suc)
148 test2res = addClass('<a id="idp9794432" class="test">', 'test2')
149 test2suc = '<a id="idp9794432" class="test test2">'
150 if test2res == test2suc:
151 print "SUCCESS: %s == %s" % (test2res, test2suc)
153 print "FAIL: %s != %s" % (test2res, test2suc)
155 test3res = addOpen(['hello','</div>', '</p>'])
156 test3suc = ['<p>', '<div>', 'hello','</div>', '</p>']
157 if test3res == test3suc:
158 print "SUCCESS: %s == %s" % (test3res, test3suc)
160 print "FAIL: %s != %s" % (test3res, test3suc)
162 test4res = addOpen(['hello','</div>', '<p class="test">', 'test', '</p>', '</p>'])
163 test4suc = ['<p>', '<div>', 'hello','</div>', '<p class="test">', 'test', '</p>', '</p>']
164 if test4res == test4suc:
165 print "SUCCESS: %s == %s" % (test4res, test4suc)
167 print "FAIL: %s != %s" % (test4res, test4suc)
169 test5res = addOpen(['0','</p>', '</div>', '</div>', '<div>', 'test', '</div>'])
170 test5suc = ['<div>', '<div>', '<p>', '0','</p>', '</div>', '</div>', '<div>', 'test', '</div>']
171 if test5res == test5suc:
172 print "SUCCESS: %s == %s" % (test5res, test5suc)
174 print "FAIL: %s != %s" % (test5res, test5suc)
176 test6res = annotateTags(['<div class="hello">', '0', '</div>'], 'test')
177 test6suc = '<div class="hello test">0</div>'
178 if test6res == test6suc:
179 print "SUCCESS: %s == %s" % (test6res, test6suc)
181 print "FAIL: %s != %s" % (test6res, test6suc)
183 test7res = addOpen(['0','</p>', '</div>', '</div>', '<div>', '<div hello="world">', 'test', '</div>', '</div>', '</div>'])
184 test7suc = ['<div>', '<div>', '<div>', '<p>', '0','</p>', '</div>', '</div>', '<div>', '<div hello="world">', 'test', '</div>', '</div>', '</div>']
185 if test7res == test7suc:
186 print "SUCCESS: %s == %s" % (test7res, test7suc)
188 print "FAIL: %s != %s" % (test7res, test7suc)
192 def jsHeader (changeCounter):
194 <script type='text/javascript'>
196 function showChanges()
198 var diffList, link, warn;
199 diffList=document.getElementById('changeDiffs');
200 for (var i = 0; i < changes; i += 1) {
202 diffList.innerHTML = "";
204 link = document.createElement('a');
205 link.setAttribute('class', 'change_link');
206 link.setAttribute('href', '#change_'+i);
207 link.innerHTML = ' change #'+i;
208 diffList.appendChild(link);
210 warn = document.createTextNode("NOTE: these changes are automatically detected and may be inaccurate.");
211 diffList.appendChild(warn);
219 css = """<style type="text/css">
221 background-color: Pink; text-decoration: line-through;
225 background-color: PaleGreen;
237 if __name__ == '__main__':
242 print "htmldiff: highlight the differences between two html files"
243 print "usage: " + sys.argv[0] + " a b"
247 (diffxhtml, changeCounter) = textDiff(open(a).read(), open(b).read())
248 # it is, regrettably, possible that the result of textDiff is not
249 # valid xhtml. I have noticed issues with nested <p>'s for example.
250 # Because of this, search and replace to insert our css, js and dom
252 diffxhtml = diffxhtml.replace('</head><body>', cssHeader() + jsHeader(changeCounter) +
253 '</head><body onload="showChanges()"><div id="changeDiffs">No changes detected.</div>')