From eabb3cf9803fd2dc8ecc88b9f8e3e254d592366f Mon Sep 17 00:00:00 2001
From: Richard Henwood <rhenwood@whamcloud.com>
Date: Wed, 20 Jul 2011 13:01:47 -0500
Subject: [PATCH] LUDOC-13 render diff of html manual to enhance reviewability.

Reviewing manual changes is challenging when looking only at the DocBook
XML. This change adds a script (./tools/diff.py) and a 'diff' make
target that generate an HTML page with annotated differences. An
example screenshot is attached to LUDOC-13.

Signed-off-by: Richard Henwood <rhenwood@whamcloud.com>
Change-Id: Ib19bae429235f9bf2b2b8d15a597baac8f12cc6e
---
 Makefile      | 32 ++++++++++++++++++++++++++++-
 tools/diff.py | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 97 insertions(+), 1 deletion(-)
 create mode 100755 tools/diff.py
diff --git a/Makefile b/Makefile
index 441b96f..b733cb3 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,13 @@
 SRC_XML=$(wildcard *.xml)
 SRC_IMG=$(wildcard figures/*.png)
 SRCS=$(SRC_XML) $(SRC_IMG)
+TEMP=/tmp
 
 TGT_BASE=lustre_manual
+# Last successful build of the published manual; MASTER_XHTML accepts MASTER_URL with or without a trailing '/'.
+MASTER_URL=http://build.whamcloud.com/job/lustre-manual/lastSuccessfulBuild/
+MASTER_XHTML=$(patsubst %/,%,$(MASTER_URL))/artifact/_out/$(TGT_BASE).xhtml
+TGT_MASTER=$(TEMP)/mastermanual
 
 RNG_LIN=/usr/share/xml/docbook/schema/rng/5.0/docbookxi.rng
 RNG_MAC=/opt/local/share/xml/docbook/5.0/rng/docbookxi.rng
@@ -18,7 +23,7 @@ check: $(SRC_XML)
 	xmllint --noout --xinclude --noent --relaxng $(RNG) ./index.xml
 
 # Note: can't use "suffix" instead of "subst", because it keeps the '.'
-$(TGT_BASE).html $(TGT_BASE).fo: $(SRCS)
+$(TGT_BASE).html $(TGT_BASE).xhtml $(TGT_BASE).fo: $(SRCS)
 	xsltproc --stringparam fop1.extensions  1 \
 		--stringparam section.label.includes.component.label 1 \
 		--stringparam section.autolabel 1 \
@@ -32,9 +37,34 @@ $(TGT_BASE).pdf: $(TGT_BASE).fo
 .PHONY: html
 html: $(TGT_BASE).html
 
+.PHONY: xhtml
+xhtml: $(TGT_BASE).xhtml
+
 .PHONY: pdf
 pdf: $(TGT_BASE).pdf
 
+# Fetch the page at MASTER_URL and save the last field of each "Revision" line (expected: the git hash); phony, so it runs every time.
+.PHONY: mastermanual.revision
+mastermanual.revision:
+	wget -O mastermanual.index $(MASTER_URL)
+	awk '/Revision/ { print $$NF }' mastermanual.index > mastermanual.revision
+
+# Fetch the full master manual only when its revision changed or the cached copy is missing or empty.
+$(TGT_MASTER).xhtml: mastermanual.revision
+	if ! cmp -s mastermanual.revision $(TGT_MASTER).revision || [ ! -s $(TGT_MASTER).xhtml ]; then\
+		wget -O $(TGT_MASTER).xhtml $(MASTER_XHTML) && \
+		mv mastermanual.revision $(TGT_MASTER).revision;\
+	fi
+
+# Run tools/diff.py on the master manual and the local build; its output is saved as $(TGT_BASE).diff.
+.PHONY: diff
+diff: $(TGT_BASE).xhtml $(TGT_MASTER).xhtml
+	./tools/diff.py $(TGT_MASTER).xhtml $(TGT_BASE).xhtml > $(TGT_BASE).diff
+
 .PHONY: push
 push:
 	git push ssh://review.whamcloud.com:29418/doc/manual HEAD:refs/for/master
+# Remove generated files; 'rm -f' so that files which were never built are not an error.
+.PHONY: clean
+clean:
+	rm -f $(TGT_BASE).html $(TGT_BASE).xhtml $(TGT_BASE).fo $(TGT_BASE).pdf $(TGT_BASE).diff mastermanual.index mastermanual.revision
diff --git a/tools/diff.py b/tools/diff.py
new file mode 100755
index 0000000..0c8036c
--- /dev/null
+++ b/tools/diff.py
@@ -0,0 +1,66 @@
+#!/usr/bin/python
+"""HTML Diff: http://www.aaronsw.com/2002/diff
+Rough code, badly documented. Send me comments and patches."""
+
+__author__ = 'Aaron Swartz <me@aaronsw.com>, Richard Henwood <rhenwood@whamcloud.com>'
+__copyright__ = '(C) 2003 Aaron Swartz. GNU GPL 2 or 3.'
+__version__ = '0.23'
+
+import difflib, string
+
+def isTag(x): return x[:1] == "<" and x[-1:] == ">"  # True if x starts with '<' and ends with '>'; slices make '' give False instead of IndexError
+
+def textDiff(a, b):
+    """Takes in strings a and b and returns a human-readable HTML diff.
+    Text only in a is struck through on a pink background; text only in b gets a green background."""
+    out = []
+    a, b = html2list(a), html2list(b)
+    s = difflib.SequenceMatcher(None, a, b)
+    for e in s.get_opcodes():
+        if e[0] == "replace":
+            # Known limitation: a replaced run is not diffed any further.
+            # The whole old run is emitted as deleted, followed by the whole new run as inserted.
+            # Any tags inside either run are copied into the <span> unchanged.
+            out.append('<span class="replace" style="background-color: Pink; text-decoration: line-through;">'+''.join(a[e[1]:e[2]]) + '</span><span class="insert" style="background-color: PaleGreen;">'+''.join(b[e[3]:e[4]])+"</span>")
+        elif e[0] == "delete":
+            out.append('<span class="del" style="background-color: Pink; text-decoration: line-through;">' + ''.join(a[e[1]:e[2]]) + "</span>")
+        elif e[0] == "insert":
+            out.append('<span class="ins" style="background-color: PaleGreen;">'+''.join(b[e[3]:e[4]]) + "</span>")
+        elif e[0] == "equal":
+            out.append(''.join(b[e[3]:e[4]]))
+        else:
+            raise ValueError("Um, something's broken. I didn't expect a '" + repr(e[0]) + "'.")
+    return ''.join(out)
+
+def html2list(x, b=0):
+    """Tokenize html text x: each tag is one token; other text is split after every whitespace character.
+    If b is true, tags are written with '[' and ']' in place of '<' and '>'. Empty tokens are dropped."""
+    mode, cur, out = 'char', '', []
+    for c in x:
+        if mode == 'tag':
+            if c == '>':
+                if b: cur += ']'
+                else: cur += c
+                out.append(cur); cur = ''; mode = 'char'
+            else: cur += c
+        elif mode == 'char':
+            if c == '<':
+                out.append(cur)
+                if b: cur = '['
+                else: cur = c
+                mode = 'tag'
+            elif c in string.whitespace: out.append(cur+c); cur = ''
+            else: cur += c
+    out.append(cur)
+    return [tok for tok in out if tok != '']  # compare by value, not identity ("is not" relied on interning)
+
+if __name__ == '__main__':
+    # Command-line use: diff.py a b -- prints the marked-up html diff of files a and b to stdout.
+    import sys
+    try:
+        a, b = sys.argv[1:3]
+    except ValueError:
+        print("htmldiff: highlight the differences between two html files")
+        print("usage: " + sys.argv[0] + " a b")
+        sys.exit(1)
+    print(textDiff(open(a).read(), open(b).read()))
-- 
1.8.3.1