{ This program is a solution to exercise #5 for CSC 206, ``Fundamentals of
computer science II,'' offered at Grinnell College in fall semester,
1996.
It performs automatic indexing on a text that has been divided into
pages, recording the positions of marked words and phrases in the source
text and producing an alphabetized index of them.
Specifically, the text to be indexed is read from standard input; it is
assumed to begin with page 1, and each occurrence of the ASCII form-feed
character is assumed to mark the end of a page and possibly the beginning
of a new one. Each name, word, or phrase that is to be recorded in the
index must be marked in the input with the ASCII commerical-at character,
@. If the character immediately following the commercial-at is a
left-parenthesis, then the text of the index entry is the string of
characters between that left-parenthesis and the next following
right-parenthesis. If the character after the commercial-at is not a
left-parenthesis, then the index entry is the string of characters
between the commercial-at and the next following space character or end
of a line of text.
However, it is an error (1) for the text of an index entry to include a
commercial-at character, a left-parenthesis, or a form-feed character;
(2) for the source text to contain a commercial-at character followed by
a left-parenthesis, if there is no subsequent right-parenthesis; (3) for
the source text to contain a commercial-at character not followed by a
left-parenthesis if there is no subsequent space or end of line; or (4)
for the index entry to be the null string (i.e., for the commercial-at
character to be immediately followed by a space or the end of the line,
or for the commercial-at to be followed immediately by a left-parenthesis
and then an immediate right-parenthesis).
If the program encounters any of these errors, it produces only a list of
syntax errors in the source, giving the page and line number at which
each error is detected.
On the other hand, if the input is free from these errors, the program
produces an alphabetized list of the index entries it encounters, one per
line, each followed by a list of the page numbers on which the entry
occurs. The entries are alphabetized without regard to case
(specifically, as if all were entirely in upper case). Non-letter
characters in the index entries are arranged according to their ASCII
values.
Programmer: John Stone, Grinnell College.
Date of this version: November 17-24, 1996.
}
$heap_dispose on$
$assert_halt on$
program Indexer (Input, Output, StdErr);
$search 'strings.o'$
import
Strings;
const
{ A code number is provided for every kind of internal programming error
that the program checks for. }
FirstExceptionCode = 1;
CreateSingletonPageNumberListException = 1;
UndefinedPageNumberListException = 2;
NilTailInPageNumberListException = 3;
AppendToPageNumberListException = 4;
InvalidPageNumberListArgumentException = 5;
ExceptionException = 6;
LastExceptionCode = 6;
Debug = True;
{ True during debugging, False (for greater speed) when the program
is released }
type
PageNumber = 1 .. MaxInt;
{ A page number must be a positive integer. }
PageNumberLink = ^PageNumberComponent;
PageNumberComponent = record
Datum: PageNumber;
Next: PageNumberLink
end;
PageNumberList = record
Head: PageNumberLink;
Tail: PageNumberLink
end;
{ The numbers of the pages on which a given index entry appears are
maintained in a list -- not, strictly speaking, a queue, because
we need the operation of peeking at the last element added. }
Entry = record
EntryText: String;
Locations: PageNumberList
end;
{ A component of the index consists of the text of the entry (a string
of characters) and a list of the page numbers on which it occurs. }
EntryLink = ^EntryComponent;
EntryComponent = record
Datum: Entry;
Next: EntryLink
end;
EntryList = EntryLink;
{ The complete list of index entries is kept as a singly-linked list,
maintained at all times in alphabetical order (case-insensitive, as
prescribed by the problem specification. }
var
Index: EntryList;
{ a list containing all of the index entries, with the page numbers on
which each one occurs }
ErrorFound: Boolean;
{ indicates whether any syntax errors are encountered during the
construction of the index }
{ The IndexerExceptionHandler reports an internal programming error and
halts the program. All such errors are of the ``this can't happen''
sort -- the programmer believes that this procedure will never be
invoked. }
procedure IndexerExceptionHandler (ExceptionCode: Integer);
begin
if (ExceptionCode < FirstExceptionCode) or
(LastExceptionCode < ExceptionCode) then
ExceptionCode := ExceptionException;
WriteLn (StdErr, 'Exception #', ExceptionCode : 1,
' in program Indexer:');
case ExceptionCode of
CreateSingletonPageNumberListException:
WriteLn (StdErr, 'An incorrectly constructed PageNumberList was ',
'returned by the CreateSingletonPageNumberList function.');
UndefinedPageNumberListException:
WriteLn (StdErr, 'An undefined PageNumberList variable was given ',
'as argument to a procedure.');
NilTailInPageNumberListException:
WriteLn (StdErr, 'A nil pointer was encountered in the Tail field ',
'of a PageNumberList.');
AppendToPageNumberListException:
WriteLn (StdErr, 'An incorrectly constructed PageNumberList was ',
'returned by the AppendToPageNumberList procedure.');
InvalidPageNumberListArgumentException:
WriteLn (StdErr, 'An incorrectly constructed PageNumberList was ',
'provided as an argument to a procedure.');
ExceptionException:
WriteLn (StdErr, 'The IndexerExceptionHandler procedure received ',
'an unknown exception code.')
end
end;
{ The next few procedures and functions define the operations on the
PageNumberList type. In this program, there's never a need for an
empty list of page numbers, so a page number list is created with
its first element already in place, and this element is never
removed; consequently, many of the procedures have the precondition
that the list is not empty. }
{ The ValidPageNumberList function determines whether a PageNumberList
is correctly allocated and linked. }
function ValidPageNumberList (Given: PageNumberList): Boolean;
var
Traverser: PageNumberLink;
{ points successively to each component of the linked list in the
supposed PageNumberList structure }
begin
if Given.Head = nil then
ValidPageNumberList := False
else begin
Traverser := Given.Head;
while Traverser^.Next <> nil do
Traverser := Traverser^.Next;
ValidPageNumberList := (Traverser = Given.Tail)
end
end;
{ The CreateSingletonPageNumberList function constructs and returns a
list of page numbers containing only one page number -- the one
specified by the argument. }
function CreateSingletonPageNumberList (NewPage: PageNumber):
PageNumberList;
var
Result: PageNumberList;
{ the page number list to be returned, as it is being constructed }
begin
New (Result.Head);
Result.Head^.Datum := NewPage;
Result.Head^.Next := nil;
Result.Tail := Result.Head;
if Debug then
Assert (ValidPageNumberList (Result),
CreateSingletonPageNumberListException,
IndexerExceptionHandler);
CreateSingletonPageNumberList := Result
end;
{ The AppendToPageNumberList procedure extends a given page number list
to include an additional page number at the end, provided that that
page number is not already at the end of the list. (If it is, the page
number list is left unchanged. }
procedure AppendToPageNumberList (var Locations: PageNumberList;
NewPage: PageNumber);
var
Appendix: PageNumberLink;
{ a pointer to a newly allocated component, to be attached at the
end of the list }
begin
Assert (Locations.Tail <> nil, NilTailInPageNumberListException,
IndexerExceptionHandler);
if Locations.Tail^.Datum <> NewPage then begin
New (Appendix);
Appendix^.Datum := NewPage;
Appendix^.Next := nil;
Locations.Tail^.Next := Appendix;
Locations.Tail := Appendix
end;
if Debug then
Assert (ValidPageNumberList (Locations),
AppendToPageNumberListException,
IndexerExceptionHandler);
end;
{ The WritePageNumberList procedure writes, to a specified output file,
all of the page numbers in the list, with a comma and a space before
each one except the first. }
procedure WritePageNumberList (var Target: Text;
Scribend: PageNumberList);
const
Separator = ', ';
{ the string to be used to separate page numbers in the printed
list }
var
Traverser: PageNumberLink;
{ points to each successive component of Scribend }
begin
Assert (ValidPageNumberList (Scribend),
InvalidPageNumberListArgumentException,
IndexerExceptionHandler);
Traverser := Scribend.Head;
Write (Target, Traverser^.Datum : 1);
Traverser := Traverser^.Next;
while Traverser <> nil do begin
Write (Target, Separator, Traverser^.Datum : 1);
Traverser := Traverser^.Next
end
end;
{ The DeallocatePageNumberList procedure recycles all the dynamically
allocated storage associated with a PageNumberList. }
procedure DeallocatePageNumberList (var Delend: PageNumberList);
var
Traverser: PageNumberLink;
{ points successively to each component of the PageNumberList to be
deleted }
Temporary: PageNumberLink;
{ a pointer to a component that is about to be deleted, immediately
after Traverser has moved on }
begin
Assert (ValidPageNumberList (Delend),
InvalidPageNumberListArgumentException,
IndexerExceptionHandler);
Traverser := Delend.Head;
while Traverser <> nil do begin
Temporary := Traverser;
Traverser := Traverser^.Next;
Dispose (Temporary)
end;
Delend.Head := nil;
Delend.Tail := nil
end;
{ The next few procedures and functions deal with lists of entries. }
{ The CreateEntryList procedure constructs and returns an empty
EntryList. }
function CreateEntryList: EntryList;
begin
CreateEntryList := nil
end;
{ The AddToEntryList procedure looks up a given string in a given list of
entries. If the string is already present, a given page number is
added to the list of page numbers in the entry for that string; if not,
a new entry is constructed and inserted at the appropriate point, with
the given page number as the only item in its list of page numbers. }
procedure AddToEntryList (NewEntryText: String; NewPage: PageNumber;
var Index: EntryList);
var
Traverser, Trailer: EntryLink;
{ pointers to successive components of Index; Trailer consistently
points to the component preceding the one Traverser points to }
Continue: Boolean;
{ indicates whether the search for the given string can and should
continue }
{ The BuildNewEntry procedure constructs a new component at a specified
site, which is presumed to be inside an EntryList, and links it
to the following element of that EntryList, if any. }
procedure BuildNewEntry (var Site: EntryLink; NewEntryText: String;
NewPage: PageNumber);
var
Successor: EntryLink;
{ temporary storage for a pointer to the component (if any) at the
insertion site, which will become the successor of the inserted
component }
begin
Successor := Site;
New (Site);
Site^.Datum.EntryText := NewEntryText;
Site^.Datum.Locations := CreateSingletonPageNumberList (NewPage);
Site^.Next := Successor
end;
begin { procedure AddToEntryList }
Traverser := Index;
Continue := True;
while Continue do
{ If the end of the list of entries is reached, add a new entry
at the end. }
if Traverser = nil then begin
if Traverser = Index then
BuildNewEntry (Index, NewEntryText, NewPage)
else
BuildNewEntry (Trailer^.Next, NewEntryText, NewPage);
Continue := False
end
{ If Traverser is pointing to a string that is alphabetically
prior to the given string, advance Traverser and Trailer (and
continue the search). }
else if CaseInsensitivePrecedesString (Traverser^.Datum.EntryText,
NewEntryText) then begin
Trailer := Traverser;
Traverser := Traverser^.Next;
end
{ If Traverser is pointing to a string that is alphabetically
posterior to the given string, stop and insert a new entry;
the given string is not in the list of entries and the correct
insertion point has been reached. }
else if CaseInsensitivePrecedesString (NewEntryText,
Traverser^.Datum.EntryText) then begin
if Traverser = Index then
BuildNewEntry (Index, NewEntryText, NewPage)
else
BuildNewEntry (Trailer^.Next, NewEntryText, NewPage);
Continue := False
end
{ Otherwise, Traverser is pointing to a string that matches the
given string; add the new page number to the list of page numbers
in the entry we've found. The new copy of the string can be
discarded, since it duplicates the string that already exists in
the entry. }
else begin
DeallocateString (NewEntryText);
AppendToPageNumberList (Traverser^.Datum.Locations, NewPage);
Continue := False
end
end;
{ The DeallocateEntryList procedure recycles all of the storage
associated with a given EntryList, including the storage for the string
and the list of page numbers inside the components of the EntryList. }
procedure DeallocateEntryList (var Delend: EntryList);
var
Traverser: EntryLink;
{ points successively to each component of the EntryList to be
deleted }
Temporary: EntryLink;
{ a pointer to a component that is about to be deleted, immediately
after Traverser has moved on }
begin
Traverser := Delend;
while Traverser <> nil do begin
DeallocateString (Traverser^.Datum.EntryText);
DeallocatePageNumberList (Traverser^.Datum.Locations);
Temporary := Traverser;
Traverser := Traverser^.Next;
Dispose (Temporary)
end;
Delend := nil
end;
{ The CompileIndex procedure builds and returns a complete list of the
marked strings in the input text, with their page numbers, unless a
syntax error is encountered in the input, in which case it prints out
the location of each syntax error it detects (but does not return an
EntryList structure). The ErrorFound argument indicates whether any
syntax errors were detected. }
procedure CompileIndex (var Index: EntryList; var ErrorFound: Boolean);
const
EntryMarker = '@';
{ the character that signals the beginning of an entry }
type
LineNumber = 1 .. MaxInt;
{ a positive integer to indicate which line of the current page the
next character to be read notionally appears on }
var
CurrentPage: PageNumber;
{ the number of the page currently being read }
CurrentLine: LineNumber;
{ the number of the line currently being read }
CurrentEntryText: String;
{ the text of an item to be indexed }
Success: Boolean;
{ indicates whether the text of an index entry was collected without
an error }
{ The FormFeed function constructs and returns the form-feed
character. }
function FormFeed: Char;
begin
FormFeed := Chr (12);
end;
{ The ReadEntry procedure attempts to collect the text of an entry
from standard input. If it is successful, the Success parameter is
set to True and the text of the entry is returned through
CurrentEntryText; otherwise, the Success parameter is set to False.
The value of the CurrentLine parameter is incremented each time a
line break is consumed from the input. }
procedure ReadEntry (var CurrentEntryText: String;
var CurrentLine: LineNumber; var Success: Boolean);
const
Starter = '(';
{ a marker indicating that the text of an entry may contain
several words }
Stopper = ')';
{ a marker terminating the text of an entry that may contain
several words }
Space = ' ';
{ a more legible representation of the space character }
var
Done: Boolean;
{ indicates whether the search for the end of the text of the
current index entry can and should continue }
Temporary: String;
{ temporary storage for a string to which a new character has been
added }
begin
if EOF then
{ An entry must contain at least one character. }
Success := False
else if Input^ = Starter then begin
{ The beginning of an entry that may contain several words has
been found. Discard the Starter marker. }
Get (Input);
{ Collect the text of the index entry; it ends just before the
first occurrence of Stopper. }
CurrentEntryText := NullString;
Done := False;
while not Done do
{ It is an error for the file to end before Stopper has been
encountered. }
if EOF then begin
Success := False;
Done := True
end
{ A line break in an index entry is replaced with a space
character. }
else if EOLn then begin
Temporary := AppendToString (CurrentEntryText, Space);
DeallocateString (CurrentEntryText);
CurrentEntryText := Temporary;
ReadLn;
CurrentLine := CurrentLine + 1
end
{ It is an error for the EntryMarker, a page break, or another
occurrence of the Starter character to appear inside an index
entry. }
else if Input^ in [EntryMarker, FormFeed, Starter] then begin
Success := False;
Done := True
end
{ When the stopper is reached, discard it and stop collecting
the text of the entry. It is an error for the text to be the
null string. }
else if Input^ = Stopper then begin
Get (Input);
Success := not EmptyString (CurrentEntryText);
Done := True
end
{ In any other case, just add the next character to the text
of the index entry. }
else begin
Temporary := AppendToString (CurrentEntryText, Input^);
DeallocateString (CurrentEntryText);
CurrentEntryText := Temporary;
Get (Input)
end;
{ Discard the string if a syntax error was detected. }
if not Success then
DeallocateString (CurrentEntryText)
end
else begin
{ Collect the text of the index entry; it ends just before the next
line break or space. }
CurrentEntryText := NullString;
Done := False;
while not Done do
{ It is an error for the file to end before an appropriate
terminator has been encountered. }
if EOF then begin
Success := False;
Done := True
end
{ A line break or a space terminates the text of the entry. }
else if EOLn or (Input^ = Space) then begin
Success := not EmptyString (CurrentEntryText);
Done := True
end
{ It is an error for the EntryMarker, a page break, or the
Starter character to appear inside an index entry. }
else if Input^ in [EntryMarker, FormFeed, Starter] then begin
Success := False;
Done := True
end
{ In any other case, just add the next character to the text
of the index entry. }
else begin
Temporary := AppendToString (CurrentEntryText, Input^);
DeallocateString (CurrentEntryText);
CurrentEntryText := Temporary;
Get (Input)
end;
{ Discard the string if a syntax error was detected. }
if not Success then
DeallocateString (CurrentEntryText)
end
end;
{ The ReportError procedure reports the location of a syntax error by
writing it to the standard error output. }
procedure ReportError (CurrentPage: PageNumber;
CurrentLine: LineNumber);
begin
WriteLn (StdErr, 'A syntax error was encountered on line ',
CurrentLine : 1, ' of page ', CurrentPage : 1,
' of the input.')
end;
begin { procedure CompileIndex }
Index := CreateEntryList;
ErrorFound := False;
CurrentPage := 1;
CurrentLine := 1;
while not EOF do
if EOLn then begin { start a new line }
ReadLn;
CurrentLine := CurrentLine + 1
end
else if Input^ = FormFeed then begin { start a new page }
Get (Input);
CurrentPage := CurrentPage + 1;
CurrentLine := 1
end
else if Input^ = EntryMarker then begin { collect an entry }
Get (Input);
ReadEntry (CurrentEntryText, CurrentLine, Success);
if not Success then begin
ReportError (CurrentPage, CurrentLine);
if not ErrorFound then begin
ErrorFound := True;
DeallocateEntryList (Index)
end
end
else if ErrorFound then
DeallocateString (CurrentEntryText)
else
AddToEntryList (CurrentEntryText, CurrentPage, Index)
end
else
Get (Input)
end;
{ The PrintIndex procedure writes out the alphabetical list of entries,
each followed by a list of the pages on which it occurs. }
procedure PrintIndex (Index: EntryList);
const
Space = ' ';
{ a more legible representation of the space character }
var
Traverser: EntryLink;
{ points successively to each component of Index }
begin
Traverser := Index;
while Traverser <> nil do begin
WriteString (Output, Traverser^.Datum.EntryText);
Write (Space);
WritePageNumberList (Output, Traverser^.Datum.Locations);
WriteLn;
Traverser := Traverser^.Next
end
end;
begin { main program }
CompileIndex (Index, ErrorFound);
if not ErrorFound then begin
PrintIndex (Index);
DeallocateEntryList (Index)
end
end.
created December 2, 1996
last revised December 2, 1996
John David Stone
(stone@math.grin.edu)