import java.util.Hashtable;

/**
 * A simple tokenizer for the Pascal language.
 *
 * @author Samuel A. Rebelsky
 * @version 1.0 of October 2002
 */
public class PascalTokenizer
  implements TokenStream
{
  // +-----------+----------------------------------------------------------
  // | Constants |
  // +-----------+

  /** The maximum number of characters permitted in an identifier. */
  static final int MAX_IDENTIFIER_LENGTH = 256;

  /** The maximum number of characters permitted in a string literal. */
  static final int MAX_STRING_LENGTH = 1024;

  // +--------+-------------------------------------------------------------
  // | Fields |
  // +--------+

  /** The character stream to be tokenized. */
  CharStream cs;

  /**
   * Maps lower-cased words to their tokens.  Preloaded with the keywords
   * and predefined type names; identifiers are added as they are first
   * seen, so that two occurrences of the same name share one token.
   */
  Hashtable idsAndKeywords;

  /** The next available token, or null if none has been read ahead. */
  Token buffer;

  /**
   * HACK!  Is the next token an ellipsis?  Nothing in this class sets
   * this flag at present; fillBuffer() and hasMore() honor it if some
   * other code in the package does.
   */
  boolean ellipses_next = false;

  // +--------------+-------------------------------------------------------
  // | Constructors |
  // +--------------+

  /**
   * Build a new tokenizer that uses the given character stream.
   * Leading whitespace and comments are skipped immediately.
   *
   * @param cs
   *   The stream of characters to tokenize.
   */
  public PascalTokenizer(CharStream cs) {
    // Prepare the character stream (including dropping opening spaces).
    this.cs = cs;
    try {
      this.skipSpaces();
    }
    catch (Exception ignored) {
      // Deliberately best-effort: this constructor declares no checked
      // exceptions, so a failure while skipping is dropped here.
      // NOTE(review): presumably the same failure resurfaces on the
      // first real read from the stream -- confirm against CharStream.
    }
    // Prepare the keyword table.
    this.idsAndKeywords = new Hashtable();
    this.idsAndKeywords.put("and", PascalTokens.TAND);
    this.idsAndKeywords.put("array", PascalTokens.TARRAY);
    this.idsAndKeywords.put("begin", PascalTokens.TBEGIN);
    this.idsAndKeywords.put("case", PascalTokens.TCASE);
    this.idsAndKeywords.put("const", PascalTokens.TCONST);
    this.idsAndKeywords.put("div", PascalTokens.TDIV);
    this.idsAndKeywords.put("do", PascalTokens.TDO);
    this.idsAndKeywords.put("downto", PascalTokens.TDOWNTO);
    this.idsAndKeywords.put("else", PascalTokens.TELSE);
    this.idsAndKeywords.put("end", PascalTokens.TEND);
    this.idsAndKeywords.put("false", PascalTokens.TFALSE);
    this.idsAndKeywords.put("file", PascalTokens.TFILE);
    this.idsAndKeywords.put("for", PascalTokens.TFOR);
    this.idsAndKeywords.put("function", PascalTokens.TFUNCTION);
    this.idsAndKeywords.put("goto", PascalTokens.TGOTO);
    this.idsAndKeywords.put("if", PascalTokens.TIF);
    this.idsAndKeywords.put("in", PascalTokens.TIN);
    this.idsAndKeywords.put("label", PascalTokens.TLABEL);
    this.idsAndKeywords.put("maxint", PascalTokens.TMAXINT);
    this.idsAndKeywords.put("mod", PascalTokens.TMOD);
    this.idsAndKeywords.put("nil", PascalTokens.TNIL);
    this.idsAndKeywords.put("not", PascalTokens.TNOT);
    this.idsAndKeywords.put("of", PascalTokens.TOF);
    this.idsAndKeywords.put("or", PascalTokens.TOR);
    this.idsAndKeywords.put("packed", PascalTokens.TPACKED);
    this.idsAndKeywords.put("program", PascalTokens.TPROGRAM);
    this.idsAndKeywords.put("record", PascalTokens.TRECORD);
    this.idsAndKeywords.put("repeat", PascalTokens.TREPEAT);
    this.idsAndKeywords.put("set", PascalTokens.TSET);
    this.idsAndKeywords.put("text", PascalTokens.TTEXT);
    this.idsAndKeywords.put("then", PascalTokens.TTHEN);
    this.idsAndKeywords.put("to", PascalTokens.TTO);
    this.idsAndKeywords.put("type", PascalTokens.TTYPE);
    this.idsAndKeywords.put("until", PascalTokens.TUNTIL);
    this.idsAndKeywords.put("var", PascalTokens.TVAR);
    this.idsAndKeywords.put("while", PascalTokens.TWHILE);
    this.idsAndKeywords.put("with", PascalTokens.TWITH);

    // Predefined type names.
    this.idsAndKeywords.put("boolean", PascalTokens.TBOOLEAN);
    this.idsAndKeywords.put("char", PascalTokens.TCHAR);
    this.idsAndKeywords.put("integer", PascalTokens.TINTEGER);
    this.idsAndKeywords.put("real", PascalTokens.TREAL);
  } // PascalTokenizer(cs)


  // +-----------------+----------------------------------------------------
  // | Local Utilities |
  // +-----------------+

  /**
   * Crash and burn with an error message.  Whitespace and comments are
   * skipped first, so that the stream is positioned at the next token
   * if the caller chooses to keep going after the error.
   *
   * @param message
   *   The text of the error.
   * @exception TokenException
   *   Always (unless skipping whitespace fails first).
   * @exception Exception
   *   If skipping whitespace fails.
   */
  void boom(String message) 
    throws Exception
  {
    this.skipSpaces();
    throw new TokenException(message);
  } // boom(String)

  /**
   * Fill the buffer (if it's not already full).  If the character
   * stream is exhausted, the buffer is left empty (null).
   *
   * Numbers are not yet handled: a digit is neither alphabetic nor a
   * quote, so it reaches the symbol case and is rejected there.
   *
   * Pre: If the character stream has more characters, the next character
   *      is not whitespace.
   *
   * @exception EndOfStream
   *   NOTE(review): this method never throws it directly; presumably it
   *   can come from the character stream -- confirm against CharStream.
   * @exception TokenException
   *   If there are errors in tokenizing.
   * @exception Exception
   *   If I/O errors occurs.
   */
  void fillBuffer() 
    throws EndOfStream,TokenException,Exception
  {
    // Make sure the buffer is empty.
    if (buffer != null) return;
    // HACK! For the stupid ellipsis problem
    if (this.ellipses_next) {
      buffer = PascalTokens.TELLIPSES;
      this.ellipses_next = false;
      return;
    }
    // Make sure there's more input to consume.
    if (!cs.hasMore()) return;
    // Get the next character and dispatch on it.
    char ch = cs.next();
    if (isAlpha(ch)) {
      // Possibility 1: Identifier or keyword
      buffer = scanWord(ch);
    }
    else if (ch == '\'') {
      // Possibility 2: String
      buffer = scanString();
    }
    else {
      // Remaining possibilities: Something fun with symbols
      buffer = scanSymbol(ch);
    }
    // Clean up: Skip over spaces to prepare for next read.
    this.skipSpaces();
  } // fillBuffer()

  /**
   * Read the rest of an identifier or keyword whose first character,
   * first, has already been consumed.
   *
   * @return
   *   The keyword token, or the (shared) identifier token for the word.
   * @exception TokenException
   *   If the word has more than MAX_IDENTIFIER_LENGTH characters.  The
   *   whole word is consumed before the error is reported.
   */
  private Token scanWord(char first)
    throws Exception
  {
    char[] chars = new char[MAX_IDENTIFIER_LENGTH];
    int len = 0;
    chars[len++] = first;
    boolean tooLong = false;
    // Consume every identifier character, even past the limit, so that
    // the tail of an over-long word is not mistaken for a new token.
    while (cs.hasMore() && isIdChar(cs.peek())) {
      char next = cs.next();
      if (len < MAX_IDENTIFIER_LENGTH)
        chars[len++] = next;
      else
        tooLong = true;
    } // while
    // Sanity check: Was the identifier too long?
    if (tooLong) {
      boom("Identifiers are limited to " + MAX_IDENTIFIER_LENGTH
           + " chars.");
    }
    // Pascal is case insensitive.  Fold case with a fixed locale so that
    // keyword lookup does not depend on the platform's default locale
    // (e.g., the Turkish dotless i).
    String str =
      (new String(chars, 0, len)).toLowerCase(java.util.Locale.ENGLISH);
    // Look the word up in the table.  This technique serves two
    // purposes: (1) it tells us whether the word is a keyword; (2) it
    // ensures that two identifiers with the same name get the same token.
    Token tok = (Token) idsAndKeywords.get(str);
    if (tok == null) {
      // Not in the table.  Must be a new identifier.  Create it and
      // add it to the table.
      tok = new PascalIdentifier(str);
      idsAndKeywords.put(str, tok);
    }
    return tok;
  } // scanWord(char)

  /**
   * Read the body of a string whose opening quote has already been
   * consumed.  A backslash causes the character after it to be taken
   * literally (so a backslash followed by a quote is a quote within
   * the string).
   *
   * @return
   *   A string token holding the characters between the quotes.
   * @exception TokenException
   *   If the input ends before the closing quote, or if the string has
   *   more than MAX_STRING_LENGTH characters.  In the latter case, the
   *   whole string is consumed before the error is reported.
   */
  private Token scanString()
    throws Exception
  {
    char[] chars = new char[MAX_STRING_LENGTH];
    int len = 0;
    boolean tooLong = false;
    while (true) {
      if (!cs.hasMore()) boom("Unterminated string.");
      char ch = cs.next();
      // An unescaped quote ends the string.
      if (ch == '\'') break;
      // A backslash means "take the next character as is".
      if (ch == '\\') {
        if (!cs.hasMore()) boom("Unterminated string.");
        ch = cs.next();
      }
      if (len < MAX_STRING_LENGTH)
        chars[len++] = ch;
      else
        tooLong = true;
    } // while
    // Sanity check: Was the string too long?
    if (tooLong) {
      boom("Strings are limited to " + MAX_STRING_LENGTH + " chars.");
    }
    // And build the token
    return new PascalString(new String(chars, 0, len));
  } // scanString()

  /**
   * Build the token for a one- or two-character symbol whose first
   * character, ch, has already been consumed.
   *
   * @exception TokenException
   *   If ch does not begin any known symbol.
   */
  private Token scanSymbol(char ch)
    throws Exception
  {
    Token tok = null;
    switch (ch) {
      // The simple ones: Only one character.
      case '+': tok = PascalTokens.TPLUS;          break;
      case '-': tok = PascalTokens.TMINUS;         break;
      case '*': tok = PascalTokens.TTIMES;         break;
      case '(': tok = PascalTokens.TOPENPAREN;     break;
      case ')': tok = PascalTokens.TCLOSEPAREN;    break;
      case ',': tok = PascalTokens.TCOMMA;         break;
      case ';': tok = PascalTokens.TSEMI;          break;
      case '^': tok = PascalTokens.TPOINTER;       break;
      case '=': tok = PascalTokens.TEQUALS;        break;
      case '[': tok = PascalTokens.TOPENBRACKET;   break;
      case ']': tok = PascalTokens.TCLOSEBRACKET;  break;
      // The more complex ones: One or two characters
      case ':':
        if (cs.hasMore() && (cs.peek() == '=')) {
          cs.next();
          tok = PascalTokens.TASSIGN;
        }
        else
          tok = PascalTokens.TCOLON;
        break;
      case '.':
        if (cs.hasMore() && (cs.peek() == '.')) {
          cs.next();
          tok = PascalTokens.TELLIPSES;
        }
        else
          tok = PascalTokens.TPERIOD;
        break;
      case '<':
        if (cs.hasMore() && (cs.peek() == '=')) {
          cs.next();
          tok = PascalTokens.TLESSEQ;
        }
        else if (cs.hasMore() && (cs.peek() == '>')) {
          cs.next();
          tok = PascalTokens.TNOTEQUALS;
        }
        else
          tok = PascalTokens.TLESSTHAN;
        break;
      case '>':
        if (cs.hasMore() && (cs.peek() == '=')) {
          cs.next();
          tok = PascalTokens.TGREATEREQ;
        }
        else
          tok = PascalTokens.TGREATERTHAN;
        break;
      default:
        boom("Choking on '" + ch + "'\n");
        break;
    } // switch
    return tok;
  } // scanSymbol(char)

  /**
   * Determine if a character is alphabetic
   */
  boolean isAlpha(char ch) {
    return Character.isLetter(ch);
  } // isAlpha(char)

  /**
   * Determine if a character is a digit.
   */
  boolean isDigit(char ch) {
    return Character.isDigit(ch);
  } // isDigit(char)

  /**
   * Determine if a character is an ID character (a letter, a digit,
   * or an underscore).
   */
  boolean isIdChar(char ch) {
    return isAlpha(ch) || (ch == '_') || isDigit(ch);
  } // isIdChar(char ch)

  /**
   * Determine if a character is whitespace.  This includes carriage
   * returns, so that files with CR-LF line endings tokenize cleanly.
   */
  boolean isSpace(char ch) {
    return Character.isWhitespace(ch);
  } // isSpace(char)

  /**
   * Skip over whitespace and brace-delimited comments in the input.
   * On return, the stream is either exhausted or positioned at a
   * character that is neither whitespace nor an opening brace.  An
   * unterminated comment silently consumes the rest of the input.
   *
   * @throws EndOfStream
   *   NOTE(review): every read here is guarded by hasMore(); presumably
   *   this can only come from the character stream itself -- confirm.
   * @throws Exception
   *   If any I/O error occurs.
   */
  void skipSpaces() 
    throws EndOfStream, Exception
  {
    // Alternate between runs of whitespace and comments until neither
    // remains.  (A loop, rather than recursion, so that a long run of
    // comments cannot exhaust the stack.)
    while (true) {
      // Deal with whitespace first
      while (cs.hasMore() && isSpace(cs.peek()))
        cs.next();
      // Done unless a comment starts here.
      if (!cs.hasMore() || (cs.peek() != '{')) return;
      // Skip the body of the comment.
      while (cs.hasMore() && (cs.peek() != '}'))
        cs.next();
      // Sanity check: Have we hit the end brace?
      if (!cs.hasMore()) return;
      // Drop the end brace, then go around again.
      cs.next();
    } // while
  } // skipSpaces()


  // +-----------+----------------------------------------------------------
  // | Observers |
  // +-----------+

  /**
   * Determine if the stream has any more tokens: one is buffered, an
   * ellipsis is pending, or unread characters remain.
   */
  public boolean hasMore() {
    return (this.buffer != null)
        || this.ellipses_next
        || this.cs.hasMore();
  } // hasMore()

  /**
   * Peek at the next token without consuming it.
   *
   * @return
   *   The next token, or null if the input is exhausted.
   * @exception EndOfStream
   *   NOTE(review): not thrown directly by this class; presumably it
   *   can come from the character stream -- confirm against CharStream.
   * @exception TokenException
   *   If there are errors in tokenizing.
   * @exception Exception
   *   If I/O errors occurs.
   */
  public Token peek()
    throws EndOfStream,TokenException,Exception
  {
    this.fillBuffer();
    return this.buffer;
  } // peek(void)


  // +-----------+----------------------------------------------------------
  // | Modifiers |
  // +-----------+

  /**
   * Get the next token, advancing the input stream to the following
   * token.
   *
   * @return
   *   The next token, or null if the input is exhausted.
   * @exception EndOfStream
   *   NOTE(review): not thrown directly by this class; presumably it
   *   can come from the character stream -- confirm against CharStream.
   * @exception TokenException
   *   If there are errors in tokenizing.
   * @exception Exception
   *   If I/O errors occurs.
   */
  public Token next()
    throws EndOfStream, TokenException, Exception
  {
    this.fillBuffer();
    Token tok = this.buffer;
    this.buffer = null;
    return tok;
  } // next(void)
} // class PascalTokenizer

