package pascal;

import rebelsky.compiler.lexer.CharToken;
import rebelsky.compiler.lexer.Identifier;
import rebelsky.compiler.lexer.IntegerToken;
import rebelsky.compiler.lexer.RealToken;
import rebelsky.compiler.lexer.StringToken;
import rebelsky.compiler.lexer.Token;
import rebelsky.compiler.lexer.TokenException;
import rebelsky.compiler.lexer.TokenStream;
import rebelsky.compiler.misc.CharStream;
import rebelsky.compiler.misc.EndOfStream;

import java.util.Hashtable;

/**
 * A simple tokenizer for the Pascal language.
 *
 * @author Samuel A. Rebelsky
 * @version 1.2 of December 2003
 */
public class PascalTokenizer
  implements TokenStream
{
  // +----------------------+-----------------------------------------------
  // | Implementation Notes |
  // +----------------------+
/*
  This tokenizer is a fairly standard hand-coded tokenizer.  We
  repeatedly look at the next character and decide what to do based on
  that character.  When we see a sequence of letters, digits, and
  underscores, we look it up in a table (idsAndKeywords) to decide whether
  it is an identifier or a keyword.

  To support peek(), there is a one-token buffer that holds the next token
  to be returned.  peek() and next() fill that buffer before doing anything
  else, and next() also clears it.

  To keep things efficient, we never generate more than one copy of an
  equal keyword, symbol, or identifier token; the actual token comes from
  either the hash table or PascalTokens.  With this technique, such tokens
  can be compared using == rather than equals.  (Numeric, character, and
  string tokens are created fresh each time.)

  The "extra lookahead for .." problem is handled with a fairly simple
  hack: If we see ".." while scanning a number, we consume both dots and
  set dotdot_next to true, so that the following call to fillBuffer()
  produces the ".." token.

  Every read of the character stream that is not known to succeed is
  guarded by cs.hasMore(), so that a token which happens to be the last
  thing in the input is still returned.
 */

  // +-----------+----------------------------------------------------------
  // | Constants |
  // +-----------+

  /**
   * The size of the buffer used to collect a number.  Because the
   * "too long" checks fire when the buffer fills, the longest accepted
   * number is one character shorter than this.
   */
  static final int MAX_DIGITS = 64;

  /**
   * The size of the buffer used to collect an identifier.  Because the
   * "too long" check fires when the buffer fills, the longest accepted
   * identifier is one character shorter than this.
   */
  static final int MAX_IDENTIFIER = 128;

  /**
   * The size of the buffer used to collect a string.  Because the
   * "too long" check fires when the buffer fills, the longest accepted
   * string is one character shorter than this.
   */
  static final int MAX_STRING = 1024;


  // +--------+-------------------------------------------------------------
  // | Fields |
  // +--------+

  /** The character stream to be tokenized. */
  CharStream cs;

  /**
   * Maps the lower-case spelling of every keyword and every identifier
   * seen so far to its unique token.
   */
  Hashtable idsAndKeywords;

  /** The next token to return, or null if it has not been read yet. */
  Token buffer;

  /**
   * HACK!  True when the number scanner has already consumed a ".."
   * that must be delivered as the next token.
   */
  boolean dotdot_next = false;

  // +--------------+-------------------------------------------------------
  // | Constructors |
  // +--------------+

  /**
   * Build a new tokenizer that uses the given character stream.
   * Leading whitespace and comments are skipped immediately, and the
   * keyword table is loaded.
   *
   * @param cs
   *   The characters to tokenize.
   */
  public PascalTokenizer(CharStream cs) {
    // Prepare the character stream (including dropping opening spaces).
    this.cs = cs;
    try {
      this.skipSpaces();
    }
    catch (Exception ignored) {
      // Deliberately best-effort: a constructor failure would change the
      // public interface.  Presumably a genuine stream problem shows up
      // again as soon as the first token is requested.
    }
    // Prepare the keyword table.  Arguably, this table should be
    // prepared by a helper procedure in PascalTokens.
    this.idsAndKeywords = new Hashtable();
    this.idsAndKeywords.put("and", PascalTokens.TAND);
    this.idsAndKeywords.put("array", PascalTokens.TARRAY);
    this.idsAndKeywords.put("begin", PascalTokens.TBEGIN);
    this.idsAndKeywords.put("case", PascalTokens.TCASE);
    this.idsAndKeywords.put("const", PascalTokens.TCONST);
    this.idsAndKeywords.put("div", PascalTokens.TDIV);
    this.idsAndKeywords.put("do", PascalTokens.TDO);
    this.idsAndKeywords.put("downto", PascalTokens.TDOWNTO);
    this.idsAndKeywords.put("else", PascalTokens.TELSE);
    this.idsAndKeywords.put("end", PascalTokens.TEND);
    this.idsAndKeywords.put("file", PascalTokens.TFILE);
    this.idsAndKeywords.put("for", PascalTokens.TFOR);
    this.idsAndKeywords.put("function", PascalTokens.TFUNCTION);
    this.idsAndKeywords.put("goto", PascalTokens.TGOTO);
    this.idsAndKeywords.put("if", PascalTokens.TIF);
    this.idsAndKeywords.put("in", PascalTokens.TIN);
    this.idsAndKeywords.put("label", PascalTokens.TLABEL);
    this.idsAndKeywords.put("mod", PascalTokens.TMOD);
    this.idsAndKeywords.put("nil", PascalTokens.TNIL);
    this.idsAndKeywords.put("not", PascalTokens.TNOT);
    this.idsAndKeywords.put("of", PascalTokens.TOF);
    this.idsAndKeywords.put("or", PascalTokens.TOR);
    this.idsAndKeywords.put("packed", PascalTokens.TPACKED);
    this.idsAndKeywords.put("procedure", PascalTokens.TPROCEDURE);
    this.idsAndKeywords.put("program", PascalTokens.TPROGRAM);
    this.idsAndKeywords.put("record", PascalTokens.TRECORD);
    this.idsAndKeywords.put("repeat", PascalTokens.TREPEAT);
    this.idsAndKeywords.put("set", PascalTokens.TSET);
    this.idsAndKeywords.put("then", PascalTokens.TTHEN);
    this.idsAndKeywords.put("to", PascalTokens.TTO);
    this.idsAndKeywords.put("type", PascalTokens.TTYPE);
    this.idsAndKeywords.put("until", PascalTokens.TUNTIL);
    this.idsAndKeywords.put("var", PascalTokens.TVAR);
    this.idsAndKeywords.put("while", PascalTokens.TWHILE);
    this.idsAndKeywords.put("with", PascalTokens.TWITH);
    // Why not include the standard types, too?
    this.idsAndKeywords.put("boolean", PascalTokens.TBOOLEAN);
    this.idsAndKeywords.put("char", PascalTokens.TCHAR);
    this.idsAndKeywords.put("integer", PascalTokens.TINTEGER);
    this.idsAndKeywords.put("real", PascalTokens.TREAL);
    this.idsAndKeywords.put("text", PascalTokens.TTEXT);
    // And some predefined constants.
    this.idsAndKeywords.put("false", PascalTokens.TFALSE);
    this.idsAndKeywords.put("true", PascalTokens.TTRUE);
    this.idsAndKeywords.put("maxint", PascalTokens.TMAXINT);
  } // PascalTokenizer(cs)


  // +-----------------+----------------------------------------------------
  // | Local Utilities |
  // +-----------------+

  /**
   * Crash and burn with an error message.  Before throwing, skip any
   * whitespace and comments so that the stream again satisfies the
   * precondition of fillBuffer().
   *
   * @param message
   *   The text of the error.
   * @exception TokenException
   *   Always (unless skipping spaces fails first).
   * @exception Exception
   *   If skipping spaces fails.
   */
  void boom(String message) 
    throws Exception
  {
    this.skipSpaces();
    throw new TokenException(message);
  } // boom(String)

  /**
   * Fill the buffer (if it's not already full).  If no input remains,
   * the buffer is left empty (null).
   *
   * Pre: If the character stream has more characters, the next character
   *      is not whitespace.
   * Post: The same condition holds again, because trailing whitespace
   *      and comments are skipped after the token.
   *
   * @exception EndOfStream
   *   Declared for interface compatibility; this method checks hasMore()
   *   before every read, so it only propagates what the stream throws.
   * @exception TokenException
   *   If there are errors in tokenizing.
   * @exception Exception
   *   If I/O errors occurs.
   */
  void fillBuffer() 
    throws EndOfStream,TokenException,Exception
  {
    // Nothing to do if a token is already waiting.
    if (this.buffer != null) return;
    // HACK! For the stupid ".." problem
    if (this.dotdot_next) {
      this.buffer = PascalTokens.TDOTDOT;
      this.dotdot_next = false;
      return;
    }
    // Make sure there's more input to consume.
    if (!this.cs.hasMore()) return;
    // Dispatch on the first character of the token.
    char ch = this.cs.next();
    if (isAlpha(ch)) {
      this.buffer = scanWord(ch);
    }
    else if (isDigit(ch)) {
      this.buffer = scanNumber(ch);
    }
    else if (ch == '\'') {
      this.buffer = scanString();
    }
    else {
      this.buffer = scanSymbol(ch);
    }
    // Clean up: Skip over spaces to prepare for next read.
    this.skipSpaces();
  } // fillBuffer()

  /**
   * Determine whether the next unread character exists and equals
   * the given character.  Nothing is consumed.
   */
  private boolean nextIs(char expected)
    throws Exception
  {
    return this.cs.hasMore() && (this.cs.peek() == expected);
  } // nextIs(char)

  /**
   * Determine whether the next unread character exists and is a digit.
   * Nothing is consumed.
   */
  private boolean nextIsDigit()
    throws Exception
  {
    return this.cs.hasMore() && isDigit(this.cs.peek());
  } // nextIsDigit()

  /**
   * Consume the next character if (and only if) it is the given one.
   *
   * @return true if the character was consumed.
   */
  private boolean consumeIf(char expected)
    throws Exception
  {
    if (!nextIs(expected)) return false;
    this.cs.next();
    return true;
  } // consumeIf(char)

  /**
   * Describe the next unread character for use in an error message.
   */
  private String describeNext()
    throws Exception
  {
    if (!this.cs.hasMore()) return "end of input";
    return String.valueOf(this.cs.peek());
  } // describeNext()

  /**
   * Complain that a number does not fit in the digit buffer.
   */
  private void tooManyDigits()
    throws Exception
  {
    boom("Numbers are limited to " + MAX_DIGITS + " characters.");
  } // tooManyDigits()

  /**
   * Append consecutive digits from the stream to digits, starting at
   * position len and stopping when the buffer is full.
   *
   * @return the new number of characters in the buffer.
   */
  private int readDigits(char digits[], int len)
    throws Exception
  {
    while ((len < MAX_DIGITS) && nextIsDigit())
      digits[len++] = this.cs.next();
    return len;
  } // readDigits(char[], int)

  /**
   * Scan an identifier or keyword whose first letter has already been
   * consumed.  Spelling is folded to lower case, so lookups are
   * case-insensitive.
   */
  private Token scanWord(char first)
    throws Exception
  {
    char chars[] = new char[MAX_IDENTIFIER];
    int len = 0;
    chars[len++] = first;
    while ((len < MAX_IDENTIFIER) 
           && this.cs.hasMore() 
           && isIdChar(this.cs.peek())) {
      chars[len++] = this.cs.next();
    } // while
    // Sanity check: Was the identifier too long?
    if (len == MAX_IDENTIFIER) 
      boom("Identifiers are limited to " + MAX_IDENTIFIER + " chars.");
    // Fold case with a fixed locale; the default locale may map 'I'
    // to something other than 'i' (e.g., Turkish), which would hide
    // keywords such as IF.
    String str = 
      (new String(chars, 0, len)).toLowerCase(java.util.Locale.ENGLISH);
    // Look it up in the table.  This serves two purposes: (1) it tells
    // us whether the string is a keyword; (2) it ensures that two
    // identifiers with the same name get the same token.
    Token tok = (Token) this.idsAndKeywords.get(str);
    if (tok == null) {
      // Not in the table.  Must be a new identifier; remember it.
      tok = new Identifier(str);
      this.idsAndKeywords.put(str, tok);
    }
    return tok;
  } // scanWord(char)

  /**
   * Scan an integer or real whose first digit has already been consumed.
   * Accepts digits, optionally followed by ".digits", optionally
   * followed by an exponent (E or e, optional sign, digits).  If the
   * digits are followed by "..", both dots are consumed and dotdot_next
   * is set.
   */
  private Token scanNumber(char first)
    throws Exception
  {
    char digits[] = new char[MAX_DIGITS];
    int len = 0;
    boolean isReal = false;
    boolean sawDotDot = false;
    digits[len++] = first;
    // Read any more digits
    len = readDigits(digits, len);
    // Sanity check: Were there too many digits?
    if (len == MAX_DIGITS)
      tooManyDigits();
    // Could this be a real?  Look for the decimal point.
    if (nextIs('.')) {
      this.cs.next();
      if (nextIs('.')) {
        // We've consumed more than we should have.  Hack: consume the
        // extra dot and make a note that the next token is DOTDOT.
        this.cs.next();
        this.dotdot_next = true;
        sawDotDot = true;
      } // if we've seen ".."
      else if (nextIsDigit()) {
        if ((len + 2) >= MAX_DIGITS) 
          tooManyDigits();
        // Success, digits after the decimal!
        isReal = true;
        digits[len++] = '.';
        digits[len++] = this.cs.next();
        len = readDigits(digits, len);
      } // if we've seen ".[0-9]"
      else {
        // The period is already consumed and cannot be put back,
        // so all we can do is complain.
        boom("Sorry, Pascal doesn't allow number dot " + describeNext());
      } // Saw something odd after the decimal point.
    } // If the next thing after the digits was '.'
    // New sanity check!
    if (len == MAX_DIGITS) 
      tooManyDigits();
    // Determine whether we have any exponent.  (Not after "..": the
    // number ended before the dots.)
    if (!sawDotDot && (nextIs('E') || nextIs('e'))) {
      if ((len + 2) >= MAX_DIGITS)
        tooManyDigits();
      isReal = true;
      digits[len++] = this.cs.next();     // 'E' or 'e'
      // Check for the sign.
      if (nextIs('+') || nextIs('-'))
        digits[len++] = this.cs.next();   // '+' or '-'
      // Make sure there is room for at least one exponent digit.
      if ((len + 1) >= MAX_DIGITS)
        tooManyDigits();
      // Sanity check: Next character must be a digit.
      if (!nextIsDigit())
        boom("Sorry, Pascal requires digits in an exponent, not " 
             + describeNext());
      // Consume the digit and any remaining digits.
      digits[len++] = this.cs.next();
      len = readDigits(digits, len);
      if (len == MAX_DIGITS)
        tooManyDigits();
    } // exponent
    // Finally, we're done with the parts of a number.  Build the
    // appropriate kind of token.
    String number = new String(digits, 0, len);
    Token tok = null;
    if (isReal) {
      try { tok = new RealToken(number); }
      catch (NumberFormatException e) { boom("Invalid real: " + number); }
    }
    else { // if (!isReal)
      try { tok = new IntegerToken(number); }
      catch (NumberFormatException e) { boom("Invalid integer: " + number); }
    } // if (!isReal)
    return tok;
  } // scanNumber(char)

  /**
   * Scan a quoted literal whose opening quote has already been consumed.
   * A backslash makes the following character literal.  A literal of
   * exactly one character yields a CharToken; anything else yields a
   * StringToken.
   */
  private Token scanString()
    throws Exception
  {
    char chars[] = new char[MAX_STRING];
    int len = 0;
    while (len < MAX_STRING) {
      if (!this.cs.hasMore())
        boom("Unterminated string: input ended before closing quote.");
      char ch = this.cs.next();
      if (ch == '\'') break;
      if (ch == '\\') {
        if (!this.cs.hasMore())
          boom("Unterminated string: input ended after backslash.");
        ch = this.cs.next();
      }
      chars[len++] = ch;
    } // while
    // Sanity check: Was the string too long?  (The loop only leaves
    // with a full buffer when it never saw the closing quote.)
    if (len == MAX_STRING) 
      boom("Strings are limited to " + MAX_STRING + " chars.");
    // And build the token (either a character or a string)
    if (len == 1) {
      return new CharToken(chars[0]);
    } // Single-character string: A Character
    return new StringToken(new String(chars, 0, len));
  } // scanString()

  /**
   * Scan an operator or punctuation token whose first character has
   * already been consumed.
   */
  private Token scanSymbol(char ch)
    throws Exception
  {
    switch (ch) {
      // The simple ones: Only one character.
      case '+': return PascalTokens.TPLUS;
      case '-': return PascalTokens.TMINUS;
      case '*': return PascalTokens.TTIMES;
      case '/': return PascalTokens.TDIVIDE;
      case '(': return PascalTokens.TOPENPAREN;
      case ')': return PascalTokens.TCLOSEPAREN;
      case ',': return PascalTokens.TCOMMA;
      case ';': return PascalTokens.TSEMICOLON;
      case '^': return PascalTokens.TPOINTER;
      case '=': return PascalTokens.TEQUALS;
      case '[': return PascalTokens.TOPENBRACKET;
      case ']': return PascalTokens.TCLOSEBRACKET;
      // The more complex ones: One or two characters
      case ':':
        if (consumeIf('=')) return PascalTokens.TBECOMES;
        return PascalTokens.TCOLON;
      case '.':
        if (consumeIf('.')) return PascalTokens.TDOTDOT;
        return PascalTokens.TDOT;
      case '<':
        if (consumeIf('=')) return PascalTokens.TLESSEQ;
        if (consumeIf('>')) return PascalTokens.TNOTEQUALS;
        return PascalTokens.TLESSTHAN;
      case '>':
        if (consumeIf('=')) return PascalTokens.TGREATEREQ;
        return PascalTokens.TGREATERTHAN;
      default:
        boom("Choking on '" + ch + "'\n");
        return null;  // Not reached: boom always throws.
    } // switch
  } // scanSymbol(char)

  /**
   * Determine if a character is alphabetic
   */
  boolean isAlpha(char ch) {
    return Character.isLetter(ch);
  } // isAlpha(char)

  /**
   * Determine if a character is a digit.
   */
  boolean isDigit(char ch) {
    return Character.isDigit(ch);
  } // isDigit(char)

  /**
   * Determine if a character is an ID character (letter, digit, or
   * underscore).
   */
  boolean isIdChar(char ch) {
    return isAlpha(ch) || (ch == '_') || isDigit(ch);
  } // isIdChar(char ch)

  /**
   * Determine if a character is whitespace.  Carriage return and form
   * feed count, so that files with CR-LF line endings tokenize cleanly.
   */
  boolean isSpace(char ch) {
    switch (ch) {
      case ' ':
      case '\n':
      case '\t':
      case '\r':
      case '\f':
        return true;
      default:
        return false;
    } // switch
  } // isSpace(char)

  /**
   * Skip over whitespace and brace comments in the input.  Stops at the
   * first character that is neither, or at the end of the input.  An
   * unterminated comment silently swallows the rest of the input.
   *
   * @throws EndOfStream
   *   Declared for compatibility; reaching the end of the input simply
   *   returns.
   * @throws Exception
   *   If any I/O error occurs.
   */
  void skipSpaces() 
    throws EndOfStream, Exception
  {
    // Each pass drops one run of whitespace and then at most one comment.
    while (true) {
      while (this.cs.hasMore() && isSpace(this.cs.peek()))
        this.cs.next();
      // Done unless a comment starts here.
      if (!this.cs.hasMore() || (this.cs.peek() != '{')) return;
      // Drop everything up to (but not including) the end brace.
      while (this.cs.hasMore() && (this.cs.peek() != '}'))
        this.cs.next();
      // Sanity check: Have we hit the end brace?
      if (!this.cs.hasMore()) return;
      // Drop the end brace, then go back and do it all over again.
      this.cs.next();
    } // while
  } // skipSpaces()


  // +-----------+----------------------------------------------------------
  // | Observers |
  // +-----------+

  /**
   * Determine if the stream has any more tokens.  A token is available
   * if one is buffered, if a ".." is pending from the number scanner,
   * or if unread characters remain.
   */
  public boolean hasMore() {
    return (this.buffer != null) 
           || this.dotdot_next 
           || this.cs.hasMore();
  } // hasMore()

  /**
   * Peek at the next token without consuming it.
   *
   * @return the next token, or null if no tokens remain.
   * @exception EndOfStream
   *   Declared by the interface; this implementation returns null at
   *   the end of the input rather than throwing.
   * @exception TokenException
   *   If there are errors in tokenizing.
   * @exception Exception
   *   If I/O errors occurs.
   */
  public Token peek()
    throws EndOfStream,TokenException,Exception
  {
    this.fillBuffer();
    return this.buffer;
  } // peek(void)


  // +-----------+----------------------------------------------------------
  // | Modifiers |
  // +-----------+

  /**
   * Add a special token (keyword or identifier).  The name is folded to
   * lower case because that is the only spelling ever looked up.
   *
   * @param name
   *   The spelling of the token (case-insensitive).
   * @param tok
   *   The token to return whenever that spelling is read.
   */  
  public void addToken(String name, Token tok)
  {
    this.idsAndKeywords.put(name.toLowerCase(java.util.Locale.ENGLISH), tok);
  } // addToken(String,Token)

  /**
   * Get the next token, advancing the input stream to the following
   * token.
   *
   * @return the next token, or null if no tokens remain.
   * @exception EndOfStream
   *   Declared by the interface; this implementation returns null at
   *   the end of the input rather than throwing.
   * @exception TokenException
   *   If there are errors in tokenizing.
   * @exception Exception
   *   If I/O errors occurs.
   */
  public Token next()
    throws EndOfStream, TokenException, Exception
  {
    this.fillBuffer();
    Token tok = this.buffer;
    this.buffer = null;
    return tok;
  } // next(void)
} // class PascalTokenizer

