Solution to exercise #4

{ This program is a solution to exercise #4 for CSC 206, ``Fundamentals of
  computer science II,'' offered at Grinnell College in fall semester,
  1996.

  The program examines and compares the career batting statistics of a
  number of baseball players in order to determine which of them are most
  similar to one another, in a precisely quantified sense.  It reads the
  statistics from a specified text file, /u2/stone/datasets/hitters.dat,
  and writes its findings to standard output, in the form of an
  alphabetized list of players in which each player's name is followed
  by a list of other players that are found to be most similar to that
  player.

  Here are the main steps of the process:

  1. Read each player's batting statistics into one record within an array;
     if an error is encountered in the entry for a player in the input
     file, report it to the user and discard the entry for that player.

  2. Arrange the elements of the array in alphabetical order by player's
     name.

  3. For each player P in the array:

     a. Compare P's statistics with that of every other player Q,
        calculating a ``disparity score'' quantifying the difference
        between P and Q.  Keep track of the other players that have the
        least disparity scores when compared to P -- the five most similar
        players, if there are five or more other players, or else as many
        other players as there are.

     b. Write out P's name and the names of these most similar players,
        with the disparity score for each.

  Programmer: John Stone, Grinnell College.
  Date of this version: October 9-13, 1996.
}

program SimilarHitters (HitterData, Output, StdErr);

const
  NameLength = 22;
    { According to the specification, each player's name fits into a field
      containing this many characters. }
  MaximumNumberOfPlayers = 130;
    { the largest number of players that this program can accommodate; the
      specification says that the data base is supposed to contain
      information about this many players }
  MaximumSimilarPlayers = 5;
    { the largest number of players to be identified as ``most similar'' to
      a given player }
  HitterDataFileName = '/u2/stone/datasets/hitters.dat';
    { the operating system's name for the file containing the batting
      statistics for various players }

type
  NameString = packed array [1 .. NameLength] of Char;
    { a standard Pascal string that can accommodate any player's name }
  Natural = 0 .. MaxInt;
    { non-negative integers }
  Player = record
             Name: NameString;
             AtBats: Natural;
             Hits: Natural;
             Doubles: Natural;
             Triples: Natural;
             HomeRuns: Natural;
             RunsBattedIn: Natural;
             BattingAverage: Natural;     { in thousandths }
             SluggingPercentage: Natural; { in thousandths }
             OnBasePercentage: Natural    { in thousandths }
           end;
    { name and statistics about one player; the batting average, slugging
      percentage, and on-base percentage are scaled up by a factor of one
      thousand so that integer arithmetic can be used in computing the
      disparity scores }
  PlayerArray = array [1 .. MaximumNumberOfPlayers] of Player;
    { batting statistics about all of the players }
  PlayerTable = record
                  Data: PlayerArray;
                  Size: Natural
                end;
    { an array of player entries, together with an indication of how much
      of that array is actually occupied by valid entries }
  DisparityRecord = record
                      Name: NameString;
                      Disparity: Natural
                    end;
    { a player's name, together with the computed measure of the player's
      difference from another player }
  DisparityArray = array [1 .. MaximumSimilarPlayers] of DisparityRecord;
    { an array of player names and disparity scores }
  DisparityTable = record
                     Data: DisparityArray;
                     Size: Natural
                   end;
    { an array of player names and disparity scores, together with an
      indication of how much of that array is occupied }

var
  HitterData: Text;
    { the file containing the batting statistics for various players }
  Players: PlayerTable;
    { one player record for each entry in the source file }
  PlayerNumber: Natural;
    { counts off the player records in the array }
  SimilarPlayers: DisparityTable;
    { the table of players most similar to a given player }

  { The ReadStatistics procedure collects the batting statistics for all
    the players from the input file. }

  procedure ReadStatistics (var HitterData: Text;
    var Players: PlayerTable);
  var
    LinesRead: Natural;
      { tallies the lines of the source file as they are read in }
    Success: Boolean;
      { indicates whether the attempt to read in statistics about one
        player from one line of the file was successful }

    { The ReadPlayer procedure attempts to collect batting statistics for
      one player from the source file, returning it through the variable
      parameter P.  If a syntax error is detected in the source file, the
      Success parameter is set to False; otherwise, it is set to True.

      The correct format for the batting statistics about one player is
      as follows:

      Columns 1 through 22 contain the player's name, left-justified.

      Column 23 is blank.

      Column 24 through 28 contain the number of times the player batted,
      right-justified.

      Column 29 is blank.

      Columns 30 through 33 contain the number of hits the player made,
      right-justified.

      Column 34 is blank.

      Columns 35 through 37 contain the number of doubles the player made,
      right-justified.

      Column 38 is blank.

      Columns 39 through 41 contain the number of triples the player made,
      right-justified.

      Column 42 is blank.

      Columns 43 through 45 contain the number of home runs the player made,
      right-justified.

      Column 46 is blank.

      Columns 47 through 50 contain the number of runs the player batted in,
      right-justified.

      Column 51 is blank.

      Columns 52 through 55 contain the player's career batting average,
      written as a decimal point followed by three digits.

      Column 56 is blank.

      Columns 57 through 60 contain the player's career slugging percentage,
      written as a decimal point followed by three digits.

      Column 61 is blank.

      Columns 62 through 65 contain the player's career on-base percentage,
      written as a decimal point followed by three digits.

      The procedure enforces this format ruthlessly. }

    procedure ReadPlayer (var HitterData: Text; var P: Player; 
      var Success: Boolean);
    label 99;
      { an emergency exit from the procedure, in case a syntax error is
        detected in the source file }
    const
      Space = ' ';
      DecimalPoint = '.';
        { formatting characters in the source file }
      AtBatsWidth = 5;
        { the number of columns occupied by the number of a player's
          at-bats }
      HitsWidth = 4;
        { the number of columns occupied by the number of a player's
          hits }
      DoublesWidth = 3;
        { the number of columns occupied by the number of a player's
          doubles }
      TriplesWidth = 3;
        { the number of columns occupied by the number of a player's
          triples }
      HomeRunsWidth = 3;
        { the number of columns occupied by the number of a player's
          home runs }
      RunsBattedInWidth = 4;
        { the number of columns occupied by the number of a player's
          runs batted in }
      BattingAverageWidth = 3;
        { the number of columns occupied by the fractional part of a
          player's batting average }
      SluggingPercentageWidth = 3;
        { the number of columns occupied by the fractional part of a
          player's slugging percentage }
      OnBasePercentageWidth = 3;
        { the number of columns occupied by the fractional part of a
          player's on-base percentage }

      { The ReadName procedure tries to read in, from a specified source
        file, exactly NameStringLength characters, without encountering
        either the end of a line or the end of the file.  If it succeeds,
        the characters are stored in the parameter Legend and the parameter
        Success is set to True; otherwise, Success is set to False and the
        contents of Legend are undefined. }

      procedure ReadName (var Source: Text; var Legend: NameString;
        var Success: Boolean);
      var
        Position: Natural;
          { counts off characters as they are inserted into the NameString }
      begin
        Position := 0;
        Success := True;
        while Success and (Position < NameLength) do
          if EOF (Source) then
            Success := False
          else if EOLn (Source) then
            Success := False
          else begin
            Position := Position + 1;
            Read (Source, Legend[Position])
          end
      end;

      { The Match procedure tries to read in a specified character from a
        specified source file.  It indicates whether it has succeeded by
        setting the parameter Success.  If the character is read, it is
        discarded. }

      procedure Match (var Source: Text; Sought: Char;
        var Success: Boolean); 
      begin
        if EOF (Source) then
          Success := False
        else if EOLn (Source) then
          Success := False
        else if Source^ = Sought then begin
          Success := True;
          Get (Source)
        end
        else
          Success := False
      end;

      { The ReadFixedWidthNatural tries to read in a natural number, which
        must be right-justified in a field of specified width that is
        otherwise occupied by spaces.  If it succeeds, it stores the
        natural number in the Legend parameter and sets the Success
        parameter to True; otherwise, it sets Success to False and the
        contents of Legend are undefined.

        The procedure can fail for any of several reasons:

        * The end of the input file is encountered.
        * The end of an input line is encountered.
        * The value of the numeral being read exceeds MaxInt.
        * A character that is neither a space nor a digit is encountered.
        * A space is encountered after the numeral has begun.

        The procedure will stop as soon as any of these conditions is
        detected, without consuming any erroneous character. }

      procedure ReadFixedWidthNatural (var Source: Text; Width: Natural;
        var Legend: Natural; var Success: Boolean);
      var
        Position: Natural;
          { counts off characters as they are read in }
        DigitEncountered: Boolean;
          { indicates whether any digit characters have so far been
            encountered in the input (if so, no more spaces should be
            seen) }
        Digit: Natural;
          { the numeric value of the next character of the source file,
            known to be a digit character }

        { The IsDigit function determines whether the character it is given
          is a digit character. }

        function IsDigit (Ch: Char): Boolean;
        begin
          if (Ch < '0') then
            IsDigit := False
          else
            IsDigit := (Ch <= '9')
         end;

         { The DigitValue function takes a character that has been
           determined to be a digit and returns its numerical value. }

         function DigitValue (Ch: Char): Natural;
         begin
           DigitValue := Ord (Ch) - Ord ('0')
         end;

         { The CanBeExtended function determines whether the natural number
           that would result from an attempt to add an extra digit to a
           given natural number exceeds MaxInt.  It returns True if the
           resulting number would not exceed MaxInt and so would still fit
           in the Natural type defined above; it returns False if the
           computation would cause an overflow. }

         function CanBeExtended (Foundation: Natural; Extension: Natural):
           Boolean;
         begin
           if Foundation < MaxInt div 10 then
             CanBeExtended := True
           else if MaxInt div 10 < Foundation then
             CanBeExtended := False
           else
             CanBeExtended := (Extension <= MaxInt mod 10)
         end;

      begin { procedure ReadFixedWidthNatural }
        Position := 0;
        DigitEncountered := False;
        Legend := 0;
        Success := True;
        while (Position < Width) and Success do
          if EOF (Source) then
            Success := False
          else if EOLn (Source) then
            Success := False
          else if IsDigit (Source^) then begin
            DigitEncountered := True;
            Digit := DigitValue (Source^);
            if CanBeExtended (Legend, Digit) then begin
              Legend := Legend * 10 + Digit;
              Get (Source);
              Position := Position + 1
            end
            else
              Success := False
          end
          else if Source^ <> Space then
            Success := False
          else if DigitEncountered then
            Success := False
          else begin
            Get (Source);
            Position := Position + 1
          end;
        if not DigitEncountered then
          Success := False
      end;

    begin { procedure ReadPlayer }
      ReadName (HitterData, P.Name, Success);
      if not Success then
        goto 99;
      Match (HitterData, Space, Success);
      if not Success then
        goto 99;
      ReadFixedWidthNatural (HitterData, AtBatsWidth, P.AtBats, Success);
      if not Success then
        goto 99;
      Match (HitterData, Space, Success);
      if not Success then
        goto 99;
      ReadFixedWidthNatural (HitterData, HitsWidth, P.Hits, Success);
      if not Success then
        goto 99;
      Match (HitterData, Space, Success);
      if not Success then
        goto 99;
      ReadFixedWidthNatural (HitterData, DoublesWidth, P.Doubles, Success);
      if not Success then
        goto 99;
      Match (HitterData, Space, Success);
      if not Success then
        goto 99;
      ReadFixedWidthNatural (HitterData, TriplesWidth, P.Triples, Success);
      if not Success then
        goto 99;
      Match (HitterData, Space, Success);
      if not Success then
        goto 99;
      ReadFixedWidthNatural (HitterData, HomeRunsWidth, P.HomeRuns,
                             Success);
      if not Success then
        goto 99;
      Match (HitterData, Space, Success);
      if not Success then
        goto 99;
      ReadFixedWidthNatural (HitterData, RunsBattedInWidth, P.RunsBattedIn,
                             Success); 
      if not Success then
        goto 99;
      Match (HitterData, Space, Success);
      if not Success then
        goto 99;
      Match (HitterData, DecimalPoint, Success);
      if not Success then
        goto 99;
      ReadFixedWidthNatural (HitterData, BattingAverageWidth,
                             P.BattingAverage, Success); 
      if not Success then
        goto 99;
      Match (HitterData, Space, Success);
      if not Success then
        goto 99;
      Match (HitterData, DecimalPoint, Success);
      if not Success then
        goto 99;
      ReadFixedWidthNatural (HitterData, SluggingPercentageWidth,
                             P.SluggingPercentage, Success); 
      if not Success then
        goto 99;
      Match (HitterData, Space, Success);
      if not Success then
        goto 99;
      Match (HitterData, DecimalPoint, Success);
      if not Success then
        goto 99;
      ReadFixedWidthNatural (HitterData, OnBasePercentageWidth,
                             P.OnBasePercentage, Success); 
      if not Success then
        goto 99;
      if not EOLn (HitterData) then
        Success := False;
    99:
    end;

  begin { procedure ReadStatistics }
    Players.Size := 0;
    LinesRead := 0;
    while not EOF (HitterData) and
                        (Players.Size < MaximumNumberOfPlayers) do begin
      ReadPlayer (HitterData, Players.Data[Players.Size + 1], Success);
      ReadLn (HitterData);
      LinesRead := LinesRead + 1;
      if Success then
        Players.Size := Players.Size + 1
      else
        WriteLn (StdErr, 'Line #', LinesRead : 1, ' of the source file ',
                 'contained incorrectly formatted data and was discarded.')
    end;
    if not EOF (HitterData) then begin
      WriteLn (StdErr, 'The source file contained additional data after ',
               'the first ', MaximumNumberOfPlayers : 1, ' valid entries.');
      WriteLn (StdErr, 'These additional data were discarded.')
    end
  end;

  { The Alphabetize procedure rearranges the player records in a
    PlayerTable so that they are in alphabetical order by player name. }

  procedure Alphabetize (var Players: PlayerTable);

    { The Partition procedure runs through the elements in a specified
      segment of an array, collecting those that precede a specified pivot
      at the low-subscript end of the segment and shifting the rest to the
      high-subscript end.  The 'divider' parameter keeps track of the
      position of the last element in the low-end partition;  if there are  
      no elements in that partition, its value is set to one less than the
      lower boundary of the entire array segment. }

    procedure Partition (var Arr: PlayerArray; Start, Finish: Integer;
      Pivot: Player; var Divider: Natural);
    var
      Position: Natural;
        { counts off the positions in the array segment, from Start to
          Finish }
      Temporary: Player;
        { temporary storage for a player record being moved from one
          position to another }
    begin
      Divider := Start - 1;
      for Position := Start to Finish do
        if Arr[Position].Name < Pivot.Name then begin
          Divider := Divider + 1;
          Temporary := Arr[Position];
          Arr[Position] := Arr[Divider];
          Arr[Divider] := Temporary
        end
    end;

    { The Sort procedure rearranges the player records in the segment of
      an array lying between the (inclusive) boundaries specified by its
      Start and Finish parameters so that they are in alphabetical order
      by player name. }

    procedure Sort (var Arr: PlayerArray; Start, Finish: Natural);
    var
      Divider: Natural;
        { the highest-numbered position occupied by an element that
          precedes Arr[Start]; if there is no such position, Divider =
          Start } 
      Temporary: Player;
        { temporary storage for an element being moved from one position to
          another }
    begin
      if Start < Finish then begin
        Partition (Arr, Start + 1, Finish, Arr[Start], Divider);
        Temporary := Arr[Start];
        Arr[Start] := Arr[Divider];
        Arr[Divider] := Temporary;
        Sort (Arr, Start, Divider - 1);
        Sort (Arr, Divider + 1, Finish)
      end
    end;

  begin { procedure Alphabetize }
    Sort (Players.Data, 1, Players.Size)
  end;

  { The FindSimilarPlayers procedure compares one particular player, the
    one whose position in the Players table is given by the PlayerNumber
    parameter, with each of the other players, determines a disparity
    score for each of them, and keeps track of the names and disparity
    scores of each of the other players that are most similar to the
    one in position PlayerNumber. }

  procedure FindSimilarPlayers (Players: PlayerTable;
    PlayerNumber: Natural; var SimilarPlayers: DisparityTable);
  var
    OtherPlayerNumber: Natural;
      { the position of the other player in the Players table }

    { The InsertOrDiscard procedure determines whether a given player's
      disparity score is low enough to be placed in the table of ``most
      similar'' players and, if so, correctly positions that player's
      name and disparity score in the table.

      Here's the plan: Initially, the given player is assigned a notional
      position in the table, one greater than its current size.  Then the
      player's disparity score is compared with those of players already
      in the table, starting with the highest such score.  If the score
      of a player in the table is equal to or greater than the score of
      the newly arriving player, the comparisons stop; if the new player's
      score is less, however, the old player's score is shifted down one
      position in the table (or discarded, if it is already in the last
      real position in the table) and the next comparison is made with at
      the next lower position in the table.  The process continues until
      either it is halted in this way or all of the positions in the table
      have been examined and all the old players shifted down.

      At that point, if the new player has reached a position that is
      actually in the table, his data are inserted into the table at that
      position.  If the new player remains in the original notional
      position, he is added to the table only if it was not previously
      full.  The size of the table increases when the new player is added
      if it was not previously full. }

    procedure InsertOrDiscard (PlayerName: NameString; Score: Natural;
                               var SimilarPlayers: DisparityTable);
    var
      Position: Natural;
        { the notional position initially occupied by the new player }
      Continue: Boolean;
        { indicates whether the comparisons can and should continue }
    begin
      with SimilarPlayers do begin
        Position := Size + 1;
        Continue := (1 < Position);
        while Continue do
          if Score < Data[Position - 1].Disparity then begin
            if Position <= MaximumSimilarPlayers then
              Data[Position] := Data[Position - 1];
            Position := Position - 1;
            Continue := (1 < Position)
          end
          else
            Continue := False;
        if Position <= MaximumSimilarPlayers then begin
          Data[Position].Name := PlayerName;
          Data[Position].Disparity := Score;
          if Size < MaximumSimilarPlayers then
            Size := Size + 1
        end
      end
    end;

    { The DisparityScore function computes and returns a quantitative
      estimate of the dissimilarity of two players, based on their batting
      statistics. }

    function DisparityScore (Primero, Segundo: Player): Natural;
    const
      AtBatWeight = 1;
      HitWeight = 3;
      DoubleWeight = 10;
      TripleWeight = 15;
      HomeRunWeight = 15;
      RunsBattedInWeight = 6;
      BattingAverageWeight = 12;
      SluggingPercentageWeight = 8;
      OnBasePercentageWeight = 9;
        { multipliers for the differences between corresponding performance
          totals for the two players }
    begin
      DisparityScore :=
        AtBatWeight * Abs (Primero.AtBats - Segundo.AtBats) +
        HitWeight * Abs (Primero.Hits - Segundo.Hits) +
        DoubleWeight * Abs (Primero.Doubles - Segundo.Doubles) +
        TripleWeight * Abs (Primero.Triples - Segundo.Triples) +
        HomeRunWeight * Abs (Primero.HomeRuns - Segundo.HomeRuns) +
        RunsBattedInWeight * Abs (Primero.RunsBattedIn -
                                                Segundo.RunsBattedIn) + 
        BattingAverageWeight * Abs (Primero.BattingAverage -
                                                Segundo.BattingAverage) + 
        SluggingPercentageWeight * Abs (Primero.SluggingPercentage -
                                                Segundo.SluggingPercentage) + 
        OnBasePercentageWeight * Abs (Primero.OnBasePercentage -
                                                Segundo.OnBasePercentage)
    end;

  begin { procedure FindSimilarPlayers }
    SimilarPlayers.Size := 0;
    for OtherPlayerNumber := 1 to Players.Size do
      if OtherPlayerNumber <> PlayerNumber then
        InsertOrDiscard (Players.Data[OtherPlayerNumber].Name,
                         DisparityScore (Players.Data[PlayerNumber],
                                         Players.Data[OtherPlayerNumber]),
                         SimilarPlayers)
  end;

  { The DisplaySimilarPlayers procedure prints out the name of a given
    player and the names and disparity scores of the other players most
    similar to him. }

  procedure DisplaySimilarPlayers (Given: Player;
    SimilarPlayers: DisparityTable);
  var
    OtherPlayerNumber: Natural;
      { the position of one of the similar players in the SimilarPlayers
        table }

    { The length function computes and returns the length of a string,
      not including any trailing spaces. }

    function Length (S: NameString): Natural;
    const
      Space = ' ';
        { a legible name for the space character }
    var
      Position: Natural;
        { counts down through the positions in the string }
      Continue: Boolean;
        { indicates whether the search for a non-space character at the end
          of the string can and should continue }
    begin
      Position := NameLength;
      Continue := True;
      while Continue do
        if Position = 0 then
          Continue := False
        else if S[Position] = Space then
          Position := Position - 1
        else
          Continue := False;
      Length := Position
    end;

  begin { procedure DisplaySimilarPlayers }
    WriteLn ('Most similar to ', Given.Name : Length (Given.Name), ':');
    with SimilarPlayers do
      for OtherPlayerNumber := 1 to Size do
        with Data[OtherPlayerNumber] do
          WriteLn ('  ', Name : Length (Name), ' (disparity ',
                   Disparity : 1, ')');
    WriteLn
  end;

begin { main program }
  Reset (HitterData, HitterDataFileName);
  ReadStatistics (HitterData, Players);
  Alphabetize (Players);
  for PlayerNumber := 1 to Players.Size do begin
    FindSimilarPlayers (Players, PlayerNumber, SimilarPlayers);
    DisplaySimilarPlayers (Players.Data[PlayerNumber], SimilarPlayers)
  end
end.

created October 28, 1996
last revised October 28, 1996

John David Stone (stone@math.grin.edu)