;;; diff.ss -- displays differences between two files ;;; John David Stone ;;; Department of Mathematics and Computer Science ;;; Grinnell College ;;; stone@math.grin.edu ;;; Created April 5, 1998 ;;; Last revised June 10, 1998 ;; The object of this program is to compare the contents of two files, line ;; by line, and to report any lines that differ between the files in a ;; convenient format. The program not only detects changes within a line, ;; but also detects insertions and deletions that alter the positions of ;; lines within the file. ;; The DIFF procedure takes two strings as arguments, giving the names of ;; the files to be compared. It reads in each file, obtaining a list of ;; the lines in the file, and submits the resulting lists to the ;; PLAN-EDITING procedure. PLAN-EDITING returns a list of ``editing ;; instructions'' for converting the first file into the second one. This ;; list is the forwarded to the DISPLAY-READABLY procedure to be output in ;; a more readable format. (define diff (lambda (source-name target-name) ;; Check that both arguments are strings. (if (not (and (string? source-name) (string? target-name))) (error 'diff "Both arguments must be strings")) ;; In Chez Scheme, it is possible to check whether both arguments name ;; existing files. (if (not (and (file-exists? source-name) (file-exists? target-name))) (error 'diff "Both arguments must name existing files")) ;; Read in the files, construct a list of editing instructions ;; describing their differences, and display that list readably. (display-readably (plan-editing (read-file source-name) (read-file target-name))))) ;; The READ-FILE procedure takes the name of a file as its argument, opens ;; the file, reads in the file line by line, and stores the lines in a ;; list, which it returns. (define read-file (lambda (file-name) ;; Open the file. (let ((in (open-input-file file-name))) ;; Read it in a line at a time. (let loop ((so-far '()) (next-line (read-line in))) (if (eof-object? next-line) ;; When the end of the file is reached, close the input port ;; and return the list of lines read, reversing it to restore ;; their original order. (begin (close-input-port in) (reverse so-far)) ;; Until then, add each line to the list of lines read and read ;; in another. (loop (cons next-line so-far) (read-line in))))))) ;; Given an input port, the READ-LINE procedure recovers all of the ;; characters on one line of text that can be read in through that port and ;; returns those characters as a single string. The terminating newline ;; character is discarded. If no more text can be read in, READ-LINE ;; returns the end-of-file object. (define read-line (lambda (in) ;; Read a character at a time from the input port. (let loop ((so-far '()) (next-char (read-char in))) ;; When the data run out, return the end-of-file object. (cond ((eof-object? next-char) next-char) ;; At the end of a line, reverse the list of characters read ;; and package them as a string. ((char=? next-char #\newline) (list->string (reverse so-far))) ;; Until then, add the character to the list and read in the ;; next character. (else (loop (cons next-char so-far) (read-char in))))))) ;; The PLAN-EDITING procedure takes as arguments two lists, SOURCE and ;; TARGET, and returns a list of ``editing instructions'' for converting ;; SOURCE into a duplicate of TARGET. An editing instruction is a list of ;; four elements; the first is a non-negative integer indicating the ;; starting and ending positions within (the original version of) the list ;; of lines from the first file at which the editing operation is to take ;; place, the second a list of objects to be removed from the first file at ;; that point, and the third a list of objects to be inserted into the ;; first file at that point. For example, here is a sequence of editing ;; instructions for converting '("A" "B" "C" "D" "E" "F") into '("B" "C" ;; "F" "D" "E" "G"): ;; ((0 1 ("A") ()) ;; (3 3 () ("F")) ;; (5 6 ("F") ("G"))) ;; Of course, it is always possible to convert SOURCE into TARGET in more ;; than one way. The PLAN-EDITING procedure always returns a plan that ;; involves the smallest possible total of objects removed and objects ;; inserted. Moreover, once this plan is chosen, it is compressed into the ;; smallest number of independent editing instructions. ;; The procedure works by building up a variety of partial plans, which are ;; gradually extended one step at a time until the optimal plan can be ;; identified. The value of local variable PLANS is a list of such partial ;; plans. Within this list, the first plan indicates how to convert some ;; part of SOURCE into a null list, the second indicates how to convert it ;; into a list containing just the last element of TARGET, the third ;; indicates how to convert it into a list containing just the last two ;; elements of TARGET, and so on -- the last element of PLANS indicates how ;; to convert some part of SOURCE into the whole of TARGET. ;; Initially, the part of SOURCE that is the starting point for the plans ;; stored in PLANS is the empty list. (The BUILD-INITIAL-PLANS procedure ;; constructs these plans, which involve only insertions.) On the first ;; call to the internal procedure LOOP, the base is a list containing just ;; the last element of SOURCE; on the second call, the base is a list ;; containing just the last two elements of SOURCE, and so on -- on the ;; last iteration, the plans indicate how to convert the whole of SOURCE ;; into the various parts of TARGET, and the last element of this list, ;; which is the plan for converting the whole of SOURCE into the whole of ;; TARGET, is the optimal plan to do so. ;; The PLAN-EDITING procedure invokes REVISE-PLANS on each iteration to ;; extend the partial plans for one base to partial plans for a new base ;; containing one more element from SOURCE. (define plan-editing (lambda (source target) ;; Make sure that both arguments are lists. (if (not (and (list? source) (list? target))) (error 'plan-editing "Both arguments must be lists")) (let ((source-length (length source)) (reversed-target (reverse target))) ;; Build an initial set of partial plans, reverse the SOURCE list to ;; make it easier to access its elements in the opposite of their ;; original order, and set POSITION-IN-SOURCE initially to the ;; zero-based position of the last element. (let loop ((plans (build-initial-plans source-length target)) (rest-of-source (reverse source)) (position-in-source (- source-length 1))) (if (null? rest-of-source) ;; When there are no more elements of SOURCE to consider, the ;; partial plans deal with the whole of SOURCE. Select the ;; last one (which converts the whole of SOURCE into the whole ;; of TARGET), compress it, and return the result. (compress-plan (last plans)) ;; Until then, replace the collection of partial plans with a ;; new collection, reflecting the presence of one more element ;; of SOURCE; discard that element of SOURCE and subtract one ;; from POSITION-OF-SOURCE to reflect the fact that on the next ;; iteration we'll back up one more position in the original ;; (unreversed) list. (loop (revise-plans plans (car rest-of-source) position-in-source reversed-target) (cdr rest-of-source) (- position-in-source 1))))))) ;; The BUILD-INITIAL-PLANS procedure takes two arguments, the first of ;; which is the number of elements in the source list and the second the ;; target list, and returns a set of initial plans, each consisting of a ;; sequence of editing instructions for inserting a subset of the elements ;; of the target list. For instance, given the number 3 and the target ;; list ("A" "B" "C"), BUILD-INITIAL-PLANS would return the following list ;; of four plans: ;; (() ;; ((3 3 () ("C"))) ;; ((3 3 () ("B")) (3 3 () ("C"))) ;; ((3 3 () ("A")) (3 3 () ("B")) (3 3 () ("C")))) ;; The first of these plans converts an empty source list into an empty ;; target list. The second converts an empty source list into the partial ;; target list ("C"). The third converts an empty source list into the ;; partial target list ("B" "C"). Finally, the fourth converts an empty ;; source list into the whole target list ("A" "B" "C"). ;; BUILD-INITIAL-PLANS constructs the list of plans from right to left; it ;; starts by building the longest plan (this is what the call to MAP does), ;; and proceeds by repeatedly prepending a plan that has been reduced by ;; the removal of one editing instruction. (define build-initial-plans (lambda (len target) (let loop ((result (list (map (lambda (elm) (list len len '() (list elm))) target))) (rest target)) (if (null? rest) result (loop (cons (cdar result) result) (cdr rest)))))) ;; The REVISE-PLANS procedure takes four arguments: the previous collection ;; of partial plans, the new element of SOURCE to be taken into account in ;; editing, its (zero-based) position within SOURCE, and the entire target ;; list. It constructs and returns a new collection of partial plans; the ;; partial plan in position k in this list describes how to convert the ;; part of SOURCE that begins with the new element into the part of TARGET ;; that consists of its last k elements. (define revise-plans (lambda (old-plans source-element position target) ;; The partial plan that will wind up in position 0 consists entirely ;; of deletion steps and is formed (during the initialization of the ;; local variable NEW-PLANS) by prepending a new editing instruction, ;; in which the new element from SOURCE is deleted, to the partial plan ;; in position 0 of OLD-PLANS (which deletes all of the subsequent ;; elements of SOURCE). ;; The LOOP internal procedure contains two additional parameters, ;; REST-OF-TARGET and REST-OF-OLD-PLANS, because on each iteration one ;; element of TARGET and one of the old partial plans becomes ;; irrelevant to the computation of the next new partial plan. These ;; parameters are adjusted on each iteration to eliminate the items ;; that have become irrelevant. (let loop ((new-plans (list (cons (list position (+ position 1) (list source-element) '()) (car old-plans)))) (rest-of-target target) (rest-of-old-plans old-plans)) ;; The elements of TARGET are then considered one at a time until ;; they are exhausted. At that point, NEW-PLANS is reversed to bring ;; the partial plans into the correct order, and the reversed list is ;; returned. (cond ((null? rest-of-target) (reverse new-plans)) ;; Until then, each element of target is compared to the new ;; element from SOURCE. If it matches (as determined by ;; STRING=?), no additional editing is needed, and the ;; appropriate partial plan is recovered from OLD-PLANS and ;; added to NEW-PLANS. ((string=? source-element (car rest-of-target)) (loop (cons (car rest-of-old-plans) new-plans) (cdr rest-of-target) (cdr rest-of-old-plans))) ;; However, if the new element of SOURCE does not match the ;; current element from TARGET, the difference might be ;; accommodated in either of three ways: by deleting the new ;; element of SOURCE, by inserting the current element from ;; TARGET, or by simultaneously doing both -- in effect, ;; changing the new element of SOURCE into the current element ;; of TARGET. The SELECT-PLAN procedure is invoked to ;; determine which of these alternatives yields the shortest ;; and simplest overall plan. Whichever plan it returns is ;; added to NEW-PLANS. ;; The tricky part is getting the first three arguments right ;; in the call to SELECT-PLAN. (CAR NEW-PLANS) is the plan for ;; converting the part of SOURCE that begins with ;; SOURCE-ELEMENT into REST-OF-TARGET, which is the plan that ;; will be needed after an insertion step is performed. (CAR ;; REST-OF-OLD-PLANS) at this point is the plan for converting ;; the part of SOURCE that _follows_ SOURCE-ELEMENT into the ;; part of REST-OF-TARGET that follows its first element; this ;; is the plan that will be needed after a change step is ;; performed. Finally, (CADR REST-OF-OLD-PLANS) is the plan ;; for converting the part of SOURCE that follows ;; SOURCE-ELEMENT into REST-OF-TARGET; this is the plan that ;; will be needed after a deletion step is performed. ;; An element is trimmed from the beginning of REST-OF-TARGET ;; and also from REST-OF-OLD-PLANS on each iteration of LOOP ;; precisely in order to maintain the relative positions of ;; these needed plans. (else (loop (cons (select-plan (car new-plans) (car rest-of-old-plans) (cadr rest-of-old-plans) position source-element (car rest-of-target)) new-plans) (cdr rest-of-target) (cdr rest-of-old-plans))))))) ;; The SELECT-PLAN procedure compares three alternative ways of extending ;; previously constructed plans to determine which of them is the most ;; efficient. It takes six arguments: ;; * INSERTION-PLAN is the sequence of editing instructions to be followed ;; after an object has been inserted into the source list, to match a ;; given object in the target list. This insertion plan causes the part ;; of the source list that begins with the current element to match the ;; part of the target list that follows the inserted element. ;; * CHANGE-PLAN is the sequence of editing instructions to be followed ;; after an object from the source list has been replaced with an object ;; that matches a given object in the target list. This change plan ;; causes the part of the source list that follows the element that has ;; been replaced to match the part of the target list the follows the ;; object that replaces it. ;; * DELETION-PLAN is the sequence of editing instructions to be followed ;; after an object has been removed from the source list. This deletion ;; plan causes the part of the source list that follows the deleted ;; element to match the part of the target list that begins with the ;; given object in the target list. ;; * POSITION indicates the (zero-based) position in SOURCE of the element ;; currently being considered. ;; * OLD is that element of SOURCE. ;; * NEW is the current element of TARGET. ;; SELECT-PLAN works by comparing the lengths of the various plans and ;; selecting the shortest. In case of a tie, a change step is preferred to ;; a step of either of the other kinds. ;; Having determined which of the plans to build on, SELECT-PLAN ;; synthesizes the appropriate change, insertion, or deletion step and ;; prepends it to the selected plan, returning the result. (define select-plan (lambda (insertion-plan change-plan deletion-plan position old new) ;; Determine the lengths of the various plans. (let ((insertion-length (length insertion-plan)) (change-length (length change-plan)) (deletion-length (length deletion-plan))) ;; If the change plan is at least as short as either of the others, ;; use it. (cond ((and (<= change-length insertion-length) (<= change-length deletion-length)) (cons (list position (+ position 1) (list old) (list new)) change-plan)) ;; Otherwise, use whichever of the remaining plans is the ;; shorter. ((<= insertion-length deletion-length) (cons (list position position '() (list new)) insertion-plan)) (else (cons (list position (+ position 1) (list old) '()) deletion-plan)))))) ;; The LAST procedure finds and returns the last element of a non-empty ;; list. (define last (lambda (ls) ;; Check the preconditions. (if (not (list? ls)) (error 'last "The argument must be a list")) (if (null? ls) (error 'last "The argument must not be the empty list")) ;; Find the last element. (let loop ((rest ls)) (if (null? (cdr rest)) (car rest) (loop (cdr rest)))))) ;; The COMPRESS-PLAN procedure takes a list of editing instructions and ;; combines those than affect adjacent positions in the source list so as ;; to produce a shorter sequence that has exactly the same effect. (define compress-plan (lambda (plan) ;; Make sure that PLAN is indeed a list of editing instructions. (if (not (list? plan)) (error 'compress-plan "The argument must be a list")) (if (not (every? editing-instruction? plan)) (error 'compress-plan "Each element of the argument must be an editing instruction")) ;; It is impossible to compress an empty plan, so return it unchanged. (if (null? plan) '() ;; Otherwise, let STARTER initially be the first step of the plan, ;; REST the remaining steps, and RESULT the part of the compressed ;; plan that has so far been constructed. (let loop ((starter (car plan)) (rest (cdr plan)) (result '())) ;; If no more steps remain in the plan, add STARTER to the ;; compressed plan, reverse it to bring the steps into forwards ;; order, and return the result. (cond ((null? rest) (reverse (cons starter result))) ;; If the ending position of the STARTER step is the same ;; as the starting position of the next step in the ;; original plan, form a new STARTER step that replaces ;; both of them. The starting position will be that of ;; STARTER, the ending position that of the next step of ;; the original plan, and the lists of elements to be ;; deleted and to be inserted are formed by applying ;; APPEND to the corresponding components of both steps. ((= (cadr starter) (caar rest)) (loop (list (car starter) (cadar rest) (append (caddr starter) (caddar rest)) (append (cadddr starter) (cadddr (car rest)))) (cdr rest) result)) ;; If the ending position of STARTER is not equal to the ;; starting position of the next step, no compression can ;; be done; add STARTER to the compressed plan and begin a ;; new iteration with the first of the remaining steps as ;; the starter. (else (loop (car rest) (cdr rest) (cons starter result)))))))) ;; The EVERY? procedure takes as arguments a predicate and a list. It ;; determines whether every element of the list satisfies the predicate, ;; returning #T if they all do and #F if even one failure is discovered. (define every? (lambda (test? ls) ;; Make sure the TEST? is a procedure and LS a list. (if (not (procedure? test?)) (error 'every? "The first argument must be a predicate")) (if (not (list? ls)) (error 'every? "The second argument must be a list")) (let loop ((rest ls)) (or (null? rest) (and (test? (car rest)) (loop (cdr rest))))))) ;; The EDITING-INSTRUCTION? predicate takes one argument and determines ;; whether it has the correct structure to be an editing instruction. An ;; editing instruction must be a four-element list in which the first two ;; elements are non-negative integers, the first less than or equal to the ;; second, and the remaining two elements are lists, not both empty. (define editing-instruction? (lambda (obj) (and (list? obj) (= (length obj) 4) (integer? (car obj)) (not (negative? (car obj))) (integer? (cadr obj)) (<= (car obj) (cadr obj)) (list? (caddr obj)) (list? (cadddr obj)) (not (and (null? (caddr obj)) (null? (cadddr obj))))))) ;; The DISPLAY-READABLY procedure takes a sequence of editing instructions ;; and writes out a human-readable description of the differences indicated ;; by those instructions. Each line that would have to be removed from the ;; source file in the process of making it match the target file is ;; reprinted with a less-than sign in front of it; each line that would ;; have to be inserted is reprinted with a greater-than sign. Each group ;; of insertions and/or deletions affecting a different position in the ;; source file is preceded by a line describing the nature and position of ;; the editing operation -- `delete' if only deletions are proposed, ;; `insert' if only insertions are proposed, and `change' if both are ;; proposed. ;; To display the editing instructions ;; ((0 1 ("A") ()) ;; (3 3 () ("F")) ;; (5 6 ("F") ("G"))) ;; DISPLAY-READABLY would output ;; delete at 0 ;; < A ;; ;; insert at 3 ;; > F ;; ;; change at 5 ;; < F ;; > G ;; The procedure actually works by invoking a single-instruction version, ;; DISPLAY-ONE-STEP, to each step in the proposed sequence. (define display-readably (lambda (seq) (for-each display-one-step seq))) (define display-one-step (lambda (instruction) ;; Make sure the argument is an editing instruction. (if (not (editing-instruction? instruction)) (error 'display-one-step "The argument must be an editing instruction")) ;; Output the nature of the change. (cond ((null? (caddr instruction)) (display "insert")) ((null? (cadddr instruction)) (display "delete")) (else (display "change"))) ;; Output the position of the change. (display " at ") (display (car instruction)) (newline) ;; Output each line to be removed. (for-each (lambda (delend) (display "< ") (display delend) (newline)) (caddr instruction)) ;; Output each line to be inserted. (for-each (lambda (insertend) (display "> ") (display insertend) (newline)) (cadddr instruction)) ;; Leave a blank line after each step. (newline))) ;; To compare the files frogs.txt and amphibians.txt, load this file and ;; give the command (diff "frogs.txt" "amphibians.txt").