/*	***************************************************************************

	PROJECT:	Joker
	
	FILE:		TalkTokenize.cpp
	
	PURPOSE:	Tokenizer part of the HyperTalk interpreter.
		
	COPYRIGHT:	(C) Copyright 1999 by M. Uli Kusterer, all rights reserved.
				
	REACH ME AT:
				E-MAIL:		witness@weblayout.com
				URL:		http://www.weblayout.com/witness
	
	
	REVISIONS:
		1999-02-14	UK		Created.
				
	************************************************************************ */

#pragma mark [Headers]

/* --------------------------------------------------------------------------------
	Headers:
   ----------------------------------------------------------------------------- */

#include	"HyperTalk.h"
#include	"TalkTokenize.h"
#include	"parse_error.h"
#include	<iostream>
#include	<stack>
#include	"strcase.h"		// Our own strcasecmp().


#pragma mark [Globals]

/* --------------------------------------------------------------------------------
	Globals / Static variables:
		Definitions of TalkTokenizer's static data members.
   ----------------------------------------------------------------------------- */

TalkIdentifierList		TalkTokenizer::mIdentifiers;			// Registered identifiers. Init() treats a non-empty list as "already initialised".
char					TalkTokenizer::mNewlineChar = '\n';		// Starts out as line feed. Presumably what GetNewline() hands out -- confirm in TalkTokenize.h.
long					TalkTokenizer::mTokenIDSeed;			// Assigned TOKEN_TYPE_LAST at the end of Init().

#pragma mark -
#pragma mark [Implementation]


/* --------------------------------------------------------------------------------
	Init:
		Register all identifier tokens the tokenizer needs to know about.
	
	REVISIONS:
		2001-01-18	UK		Added "do".
		2001-01-15	UK		Added some loop tokens.
		2000-10-23	UK		Changed to use token list instead of manual look-up.
		2000-10-21	UK		Added "the" and "result".
		1999-12-30	UK		Added char/character.
		1999-06-22	UK		Created.
   ----------------------------------------------------------------------------- */

void	TalkTokenizer::Init()
{
	if( mIdentifiers.size() != 0 )
		return;	// Don't try to init twice.
	
	TalkTokenizer::RegisterIdentifierToken( TextMunger("on"), TOKEN_TYPE_START_HANDLER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("end"), TOKEN_TYPE_END );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("put"), TOKEN_TYPE_PUT_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("into"), TOKEN_TYPE_INTO_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("if"), TOKEN_TYPE_IF_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("then"), TOKEN_TYPE_THEN_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("else"), TOKEN_TYPE_ELSE_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("repeat"), TOKEN_TYPE_REPEAT_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("while"), TOKEN_TYPE_WHILE_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("entry"), TOKEN_TYPE_ENTRY_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("element"), TOKEN_TYPE_ENTRY_IDENTIFIER );	// element is a synonym for entry.
	TalkTokenizer::RegisterIdentifierToken( TextMunger("of"), TOKEN_TYPE_OF_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("param"), TOKEN_TYPE_PARAM_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("parameter"), TOKEN_TYPE_PARAM_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("return"), TOKEN_TYPE_RETURN_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("stdin"), TOKEN_TYPE_STDIN_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("standardinput"), TOKEN_TYPE_STDIN_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("function"), TOKEN_TYPE_START_FUNCTION );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("global"), TOKEN_TYPE_GLOBAL_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("char"), TOKEN_TYPE_CHAR_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("character"), TOKEN_TYPE_CHAR_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("to"), TOKEN_TYPE_TO_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("word"), TOKEN_TYPE_WORD_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("item"), TOKEN_TYPE_ITEM_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("line"), TOKEN_TYPE_LINE_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("the"), TOKEN_TYPE_THE_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("result"), TOKEN_TYPE_RESULT_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("keys"), TOKEN_TYPE_KEYS_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("chars"), TOKEN_TYPE_CHARS_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("characters"), TOKEN_TYPE_CHARS_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("words"), TOKEN_TYPE_WORDS_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("items"), TOKEN_TYPE_ITEMS_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("lines"), TOKEN_TYPE_LINES_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("number"), TOKEN_TYPE_NUMBER_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("url"), TOKEN_TYPE_URL_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("true"), TOKEN_TYPE_TRUE_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("false"), TOKEN_TYPE_FALSE_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("set"), TOKEN_TYPE_SET_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("add"), TOKEN_TYPE_ADD_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("subtract"), TOKEN_TYPE_SUBTRACT_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("multiply"), TOKEN_TYPE_MULTIPLY_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("divide"), TOKEN_TYPE_DIVIDE_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("from"), TOKEN_TYPE_FROM_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("by"), TOKEN_TYPE_BY_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("replace"), TOKEN_TYPE_REPLACE_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("with"), TOKEN_TYPE_WITH_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("in"), TOKEN_TYPE_IN_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("reply"), TOKEN_TYPE_REPLY_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("keyword"), TOKEN_TYPE_KEYWORD_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("error"), TOKEN_TYPE_ERROR_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("pass"), TOKEN_TYPE_PASS_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("request"), TOKEN_TYPE_REQUEST_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("ae"), TOKEN_TYPE_APPLEEVT_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("appleevent"), TOKEN_TYPE_APPLEEVT_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("stack"), TOKEN_TYPE_STACK_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("card"), TOKEN_TYPE_CARD_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("cd"), TOKEN_TYPE_CARD_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("background"), TOKEN_TYPE_BKGND_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("bkgnd"), TOKEN_TYPE_BKGND_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("bkgd"), TOKEN_TYPE_BKGND_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("bg"), TOKEN_TYPE_BKGND_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("ask"), TOKEN_TYPE_ASK_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("this"), TOKEN_TYPE_THIS_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("define"), TOKEN_TYPE_DEFINE_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("undefine"), TOKEN_TYPE_UNDEFINE_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("create"), TOKEN_TYPE_CREATE_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("menu"), TOKEN_TYPE_MENU_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("part"), TOKEN_TYPE_PART_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("menuitem"), TOKEN_TYPE_MENUITEM_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("down"), TOKEN_TYPE_DOWN_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("is"), TOKEN_TYPE_IS_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("do"), TOKEN_TYPE_DO_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("debug"), TOKEN_TYPE_DEBUG_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("checkpoint"), TOKEN_TYPE_CHECKPOINT_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("window"), TOKEN_TYPE_WINDOW_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("wd"), TOKEN_TYPE_WINDOW_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("wnd"), TOKEN_TYPE_WINDOW_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("quit"), TOKEN_TYPE_QUIT_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("center"), TOKEN_TYPE_CENTER_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("close"), TOKEN_TYPE_CLOSE_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("show"), TOKEN_TYPE_SHOW_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("hide"), TOKEN_TYPE_HIDE_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("disable"), TOKEN_TYPE_DISABLE_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("document"), TOKEN_TYPE_DOCUMENT_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("palette"), TOKEN_TYPE_PALETTE_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("exit"), TOKEN_TYPE_EXIT_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("setWindow"), TOKEN_TYPE_SETWINDOW_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("not"), TOKEN_TYPE_NOT_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("s"), TOKEN_TYPE_S_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("me"), TOKEN_TYPE_ME_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("button"), TOKEN_TYPE_BUTTON_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("btn"), TOKEN_TYPE_BUTTON_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("select"), TOKEN_TYPE_SELECT_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("delete"), TOKEN_TYPE_DELETE_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("local"), TOKEN_TYPE_LOCAL_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("and"), TOKEN_TYPE_AND_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("or"), TOKEN_TYPE_OR_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("byte"), TOKEN_TYPE_BYTE_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("bytes"), TOKEN_TYPE_BYTES_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("try"), TOKEN_TYPE_TRY_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("catch"), TOKEN_TYPE_CATCH_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("finally"), TOKEN_TYPE_FINALLY_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("throw"), TOKEN_TYPE_THROW_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("via"), TOKEN_TYPE_VIA_IDENTIFIER );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("output"), TOKEN_TYPE_OUTPUT_COMMAND );
	TalkTokenizer::RegisterIdentifierToken( TextMunger("send"), TOKEN_TYPE_SEND_COMMAND );
	
	mTokenIDSeed = TOKEN_TYPE_LAST;
}


/* --------------------------------------------------------------------------------
	Tokenize:
		Tokenize the specified script.
		
		This breaks the stream of characters that is the input script into a
		list of base units called "tokens", and performs some initial labeling
		of them. A token is kind of like a word in human language, but strings,
		which may consist of several words, make up only one token, and
		punctuation is usually considered to be a separate token, splitting
		in half any token it might be embedded in (the exception, again, being
		strings). Also note that this is where "line continuation" characters
		are stripped from the script along with comments. Since this happens
		here, the line numbers reported on errors may be slightly off.
	
	TAKES:
		inScript	-	The text of the script to be tokenized.
	
	REVISIONS:
		2002-03-27	UK		Added support for double-apostrophe as alternative
							to quote as string start/end token.
		1999-06-22	UK		Created.
   ----------------------------------------------------------------------------- */

void	HyperTalk::Tokenize( TextMunger& inScript )
{
	std::size_t	vStartOffs = 0,
				vEndOffs = 0;
	char		vCurrToken;
	std::size_t	vOldOffs;
	bool		vIsInQuote = false,
				vIsInSpecialQuote = false,
				vIsValidNumber = true;
	
	mTokens.clear();
	
	vOldOffs = inScript.GetOffset();
	inScript.SetOffset( 0 );
	
	while( inScript.GetOffset() < inScript.GetLength() )
	{
		vCurrToken = inScript.Read<char>();
		
		/* Everything between quotes mustn't be parsed; spaces mustn't end the token
			inside a quoted string, etc. That's why we clear vCurrToken. REMEMBER
			THIS WHEN PASSING THE CURRENT TOKEN SOMEWHERE WHILE QUOTES ARE OPEN! */
		if( (vIsInQuote && vCurrToken != '"' && vCurrToken != '\\')
			|| (vIsInSpecialQuote && vCurrToken != '\'' && vCurrToken != '\\') )
			vCurrToken = 0;		// Causes default to happen.
		
		switch( vCurrToken )
		{
			// Whitespace ends any token:
			case ' ':
			case '\t':
				if( vIsValidNumber )
					MakeNumberToken( inScript, vStartOffs, vEndOffs );
				else
					ProcessToken( inScript, vStartOffs, vEndOffs );	// Whitespace ends any token.
				vIsValidNumber = true;	// Reset number boolean.
				vStartOffs = inScript.GetOffset();
				vEndOffs = vStartOffs;
				break;
			
			
			case '\n':
			case '\r':
				if( vIsValidNumber )
					MakeNumberToken( inScript, vStartOffs, vEndOffs );
				else
					ProcessToken( inScript, vStartOffs, vEndOffs );	// Newline ends any token.
				AddToken( vStartOffs, vEndOffs, TOKEN_TYPE_NEWLINE );
				vIsValidNumber = true;
				vStartOffs = inScript.GetOffset();
				vEndOffs = vStartOffs;
				break;
            
			case OPERATOR_CHAR_OPEN_BRACE:		// (     either just bracket or start of multi-line comment "(* foo *)".
				if( vIsValidNumber )
					MakeNumberToken( inScript, vStartOffs, vEndOffs );
				else
					ProcessToken( inScript, vStartOffs, vEndOffs );
				if( inScript.Peek<char>() == OPERATOR_CHAR_MUL )	// Multi-line comment!
				{
					inScript.AdvanceOffset( 1 );
					
					// Skip everything until we reach an asterisk and a close bracket:
					while( inScript.GetOffset() < inScript.GetLength() )
					{
						if( inScript.Peek<char>() == OPERATOR_CHAR_MUL )
                        {
                            inScript.AdvanceOffset( 1 );
                            if( (inScript.GetOffset() < inScript.GetLength())
                                && (inScript.Peek<char>() == OPERATOR_CHAR_CLOSE_BRACE) )
                            {
                                inScript.AdvanceOffset( 1 );
                                break;	// Exit this loop.
                            }
                        }
                        else
                            inScript.AdvanceOffset( 1 );
					}
				}
				else
					AddToken( vStartOffs, vEndOffs, TOKEN_TYPE_OPERATOR, vCurrToken );
				vIsValidNumber = true;
				vStartOffs = inScript.GetOffset();
				vEndOffs = vStartOffs;
				break;
			
			case OPERATOR_CHAR_EQUAL:			// =
			case OPERATOR_CHAR_PLUS:			// +
			case OPERATOR_CHAR_MUL:				// *
			case OPERATOR_CHAR_DIV:				// /
			case OPERATOR_CHAR_CLOSE_BRACE:		// )
			case OPERATOR_CHAR_OPEN_ABRACE:		// [
			case OPERATOR_CHAR_CLOSE_ABRACE:	// ]
			case OPERATOR_CHAR_MOD:				// %
			case OPERATOR_CHAR_COMMA:			// ,
				if( vIsValidNumber )
					MakeNumberToken( inScript, vStartOffs, vEndOffs );
				else
					ProcessToken( inScript, vStartOffs, vEndOffs );	// Operator ends any token.
				AddToken( vStartOffs, vEndOffs, TOKEN_TYPE_OPERATOR, vCurrToken );
				vIsValidNumber = true;
				vStartOffs = inScript.GetOffset();
				vEndOffs = vStartOffs;
				break;
			
			case OPERATOR_CHAR_APOSTROPHE:		// '
				if( inScript.Peek<char>() == OPERATOR_CHAR_APOSTROPHE )
				{
					inScript.AdvanceOffset( 1 );
					
					if( vIsInSpecialQuote )	// This ends a string, generate a string token.
					{
						MakeStringToken( inScript, vStartOffs, vEndOffs );
						vIsValidNumber = true;
					}
					else		// Quote ends any token.
					{
						if( vIsValidNumber )
							MakeNumberToken( inScript, vStartOffs, vEndOffs );
						else
							ProcessToken( inScript, vStartOffs, vEndOffs );
					}
					vStartOffs = inScript.GetOffset();	// Just skip quotes.
					vEndOffs = vStartOffs;
					vIsInSpecialQuote = !vIsInSpecialQuote;	// Toggle state of quotes.
				}
				else if( vIsInSpecialQuote )	// It was a single apostrophe inside a string?
					vEndOffs++;	// Pretend we processed it like any other char.
				else
				{
					if( vIsValidNumber )
						MakeNumberToken( inScript, vStartOffs, vEndOffs );
					else
						ProcessToken( inScript, vStartOffs, vEndOffs );	// Operator ends any token.
					AddToken( vStartOffs, vEndOffs, TOKEN_TYPE_OPERATOR, vCurrToken );
					vIsValidNumber = true;
					vStartOffs = inScript.GetOffset();
					vEndOffs = vStartOffs;
				}
				break;
			
			case OPERATOR_CHAR_LESS_THAN:		// <
			case OPERATOR_CHAR_GREATER_THAN:	// >
				if( vIsValidNumber )
					MakeNumberToken( inScript, vStartOffs, vEndOffs );
				else
					ProcessToken( inScript, vStartOffs, vEndOffs );	// Operator ends any token.
				if( inScript.Peek<char>() == OPERATOR_CHAR_EQUAL )	// Greater/same or Less/same!
				{
					inScript.AdvanceOffset( 1 );
					
					switch( vCurrToken )
					{
						case OPERATOR_CHAR_LESS_THAN:		// <=
							vCurrToken = OPERATOR_CHAR_LT_EQUAL;
							break;
						
						case OPERATOR_CHAR_GREATER_THAN:	// >=
							vCurrToken = OPERATOR_CHAR_GT_EQUAL;
							break;
					}
				}
				else if( inScript.Peek<char>() == '>' )
				{
					inScript.AdvanceOffset( 1 );
					
					switch( vCurrToken )
					{
						case OPERATOR_CHAR_LESS_THAN:		// <>
							vCurrToken = OPERATOR_CHAR_NOT_EQUAL;
							break;
						
						case OPERATOR_CHAR_GREATER_THAN:	// >>
							vCurrToken = OPERATOR_CHAR_BSR;
							break;
					}
				}
				else if( inScript.Peek<char>() == '<' )
				{
					inScript.AdvanceOffset( 1 );
					
					switch( vCurrToken )
					{
						case OPERATOR_CHAR_LESS_THAN:	// <<
							vCurrToken = OPERATOR_CHAR_BSL;
							break;
					}
				}
				AddToken( vStartOffs, vEndOffs, TOKEN_TYPE_OPERATOR, vCurrToken );
				vIsValidNumber = true;
				vStartOffs = inScript.GetOffset();
				vEndOffs = vStartOffs;
				break;
			
			case '"':
				if( vIsInQuote )	// This ends a string, generate a string token.
				{
					MakeStringToken( inScript, vStartOffs, vEndOffs );
					vIsValidNumber = true;
				}
				else		// Quote ends any token.
				{
					if( vIsValidNumber )
						MakeNumberToken( inScript, vStartOffs, vEndOffs );
					else
						ProcessToken( inScript, vStartOffs, vEndOffs );
				}
				vStartOffs = inScript.GetOffset();	// Just skip quotes.
				vEndOffs = vStartOffs;
				vIsInQuote = !vIsInQuote;	// Toggle state of quotes.
				break;
			
			case OPERATOR_CHAR_MINUS:
				if( vIsValidNumber )
					MakeNumberToken( inScript, vStartOffs, vEndOffs );
				else
					ProcessToken( inScript, vStartOffs, vEndOffs );
				if( inScript.Peek<char>() == OPERATOR_CHAR_MINUS )	// Comment!
				{
					inScript.AdvanceOffset( 1 );
					
					// Skip remainder of line since it's just comment text.
					while( inScript.Peek<char>() != '\n'
							&& inScript.Peek<char>() != '\r'
							&& inScript.GetOffset() < inScript.GetLength() )
					{
						inScript.AdvanceOffset( 1 );
					}
					
					vEndOffs = inScript.GetOffset();
					vStartOffs = vEndOffs;
					
					// If this isn't end of text, skip return character, too.
					if( inScript.GetOffset() < inScript.GetLength() )
						inScript.AdvanceOffset( 1 );
					
					// Add end-of-line token at end of comment.
					AddToken( vStartOffs, vEndOffs, TOKEN_TYPE_NEWLINE );
				}
				else
					AddToken( vStartOffs, vEndOffs, TOKEN_TYPE_OPERATOR, vCurrToken );
				vIsValidNumber = true;
				vStartOffs = inScript.GetOffset();
				vEndOffs = vStartOffs;
				break;
			
			case OPERATOR_CHAR_CONCAT:
				if( vIsValidNumber )
					MakeNumberToken( inScript, vStartOffs, vEndOffs );
				else
					ProcessToken( inScript, vStartOffs, vEndOffs );
				if( inScript.Peek<char>() == OPERATOR_CHAR_CONCAT )	// &&, space-concat-operator!
				{
					inScript.AdvanceOffset( 1 );
					vEndOffs = inScript.GetOffset();
					
					vCurrToken = OPERATOR_CHAR_SPACE_CONCAT;	// Dummy token to indicate '&&' operator.
				}
				
				AddToken( vStartOffs, vEndOffs, TOKEN_TYPE_OPERATOR, vCurrToken );
				vIsValidNumber = true;
				vStartOffs = inScript.GetOffset();
				vEndOffs = vStartOffs;
				break;
			
			case '#':	// Alternative, Unix-style comment character:
				if( vIsValidNumber )
					MakeNumberToken( inScript, vStartOffs, vEndOffs );
				else
					ProcessToken( inScript, vStartOffs, vEndOffs );
				// Skip remainder of line since it's just comment text.
				while( inScript.Peek<char>() != '\n'
						&& inScript.Peek<char>() != '\r'
						&& inScript.GetOffset() < inScript.GetLength() )
				{
					inScript.AdvanceOffset( 1 );
				}
					
				vEndOffs = inScript.GetOffset();
				vStartOffs = vEndOffs;
				vIsValidNumber = true;
				// If this isn't end of text, skip return character, too.
				if( inScript.GetOffset() < inScript.GetLength() )
					inScript.AdvanceOffset( 1 );
				
				// Add end-of-line token at end of comment.
				AddToken( vStartOffs, vEndOffs, TOKEN_TYPE_NEWLINE );
				break;
			
			case '\\':	// Line continuation character or string escape sequence.
				if( sOptions[OPT_ESCAPE_CHARS_IN_STRINGS] && (vIsInQuote || vIsInSpecialQuote) )	// Escape sequence in string and escape sequences are on?
				{
					if( (inScript.GetOffset() +1) < inScript.GetLength() )	// There is a char after this one, right?
						vEndOffs += 2;	// This is passed to inScript.SetOffset() at loop's end. No need to AdvanceOffset() ourselves.
					else
						throw parse_error( "A backslash in a string must be followed by at least one other character.", vStartOffs, vEndOffs );
				}
				else	// Otherwise? Must be line continuation.
				{
					if( vIsValidNumber )
						MakeNumberToken( inScript, vStartOffs, vEndOffs );
					else
						ProcessToken( inScript, vStartOffs, vEndOffs );	// Continuation ends any token.
					// Now skip everything until next return, so user can have comments after continued lines:
					while( inScript.Peek<char>() != '\r'
							&& inScript.Peek<char>() != '\n'
							&& inScript.GetOffset() < inScript.GetLength() )
						inScript.AdvanceOffset(1);
					
					// If this isn't end of text, skip return character, too.
					if( inScript.GetOffset() < inScript.GetLength() )
						inScript.AdvanceOffset( 1 );
					
					// We're now past return -- go on as if this had just been a space:
					vIsValidNumber = true;
					vStartOffs = inScript.GetOffset();
					vEndOffs = vStartOffs;
				}
				break;
			
			case '1':
			case '2':
			case '3':
			case '4':
			case '5':
			case '6':
			case '7':
			case '8':
			case '9':
			case '0':
				/* We don't change vIsValidNumber here. If this is a fresh token,
					vIsValidNumber has already been inited to TRUE. If the token has
					already started the only event that can set vIsValidNumber
					to FALSE is if a character wasn't a number, in which case we
					want to keep it this way anyhow. But this separate case label
					avoids that we get into the default clause that makes it an
					invalid number. */
				vEndOffs++;
				break;
			
			default:	// This mustn't use vCurrToken since that is always 0 in quotes.
				vIsValidNumber = false;
				vEndOffs++;	// Just take this into this token, too.
		}
		
		inScript.SetOffset( vEndOffs );
	}
	
	// End last token properly:
	if( vIsInQuote || vIsInSpecialQuote )	// It's a string?
		MakeStringToken( inScript, vStartOffs, vEndOffs );
	else if( vIsValidNumber )
		MakeNumberToken( inScript, vStartOffs, vEndOffs );
	else	// Any other token:
		ProcessToken( inScript, vStartOffs, vEndOffs );	// Quote ends any token.
	
	inScript.SetOffset( vOldOffs );
}


/* --------------------------------------------------------------------------------
	ProcessToken:
		Classify the word found at [start, end) in the script by asking the
		tokenizer's identifier list for its type, then add it to the token list
		through MakeStringToken() with that type. An empty range is ignored.
	
	TAKES:
		inScript	-	The script text being tokenized.
		start		-	The starting offset of this token in the script.
		end			-	The ending offset of this token in the script.
	
	REVISIONS:
		2000-10-23	UK		Changed to use token list instead of manual look-up.
		2000-10-21	UK		Added "the".
		1999-06-22	UK		Created.
   ----------------------------------------------------------------------------- */

void	HyperTalk::ProcessToken( TextMunger& inScript, std::size_t start, std::size_t end )
{
	if( end == start )	// Nothing between the offsets, so no token.
		return;
	
	// Munger covering just this word (shares data with inScript).
	TextMunger		vWord( inScript, start, end -start );
	
	vWord.SetOffset( 0 );
	vWord.RecalcHash();	// Hash makes the compares during look-up faster.
	
	TokenTypeEnum	vWordType = TalkTokenizer::GetIdentifierType( vWord );
	
	MakeStringToken( inScript, start, end, vWordType );
}


/* --------------------------------------------------------------------------------
	MakeStringToken:
		Create a string/identifier token and append it to our list of tokens. This
		token contains the text specified in the script as a malloc()ed,
		zero-terminated C string whose pointer (cast to long) is the token's
		value. This allows using special identifiers as unquoted string literals
		or other tokens, i.e. make every identifier token using this method so the
		parser can use the token as a generic identifier in other contexts.
		
		For TOKEN_TYPE_STRING tokens, escape sequences in the text are replaced
		first if that option is turned on.
		
		If anything fails after the buffer has been allocated, the buffer is
		freed again before the exception is passed on, so it doesn't leak.
		NOTE(review): who frees the buffer of a successfully added token isn't
		visible from here -- presumably HyperToken; confirm.
	
	TAKES:
		inScript	-	The script text being tokenized. Its offset is moved to
						read the text and put back afterwards.
		start		-	The starting offset of this token in the script.
		end			-	The ending offset of this token in the script.
		type		-	The type to give this token. Defaults to TOKEN_TYPE_STRING.
	
	THROWS:
		parse_error	-	If the buffer for the text can't be allocated. Exceptions
						from reading the script or adding the token pass through.
	
	REVISIONS:
		1999-06-22	UK	Created.
   ----------------------------------------------------------------------------- */

void	HyperTalk::MakeStringToken( TextMunger& inScript, std::size_t start, std::size_t end,
									TokenTypeEnum type )
{
	std::size_t	vStringLength = end -start;
	char*		vString = (char*) malloc( vStringLength +1 );	// +1 for terminator.
	
	if( !vString )
		throw parse_error( "Couldn't allocate memory for string.", start, end );
	
	try
	{
		if( vStringLength != 0 )
		{
			std::size_t		vOldOffs = inScript.GetOffset();
			
			inScript.SetOffset( start );
			inScript.ReadData( vString, vStringLength );
			inScript.SetOffset( vOldOffs );
		}
		
		vString[vStringLength] = 0;	// Terminate string.
		
		/* If the option to support escape sequences is on and this is a string token,
			replace all escaped characters with their proper equivalent:	*/
		if( type == TOKEN_TYPE_STRING && sOptions[OPT_ESCAPE_CHARS_IN_STRINGS] == true )
			TalkTokenizer::ReplaceEscapeSequences( vString );
		
		// Actually add token to token list.
		AddToken( start, end, type, (long) vString );
	}
	catch( ... )
	{
		// No token in the list refers to the buffer, so release it and pass the error on.
		free( vString );
		throw;
	}
}


/* --------------------------------------------------------------------------------
	IsCollapsibleWhitespace:
		File-local helper for ReplaceEscapeSequences(). Returns true for the
		characters that get merged/eliminated there: line feed, return, tab
		and space.
   ----------------------------------------------------------------------------- */

static bool	IsCollapsibleWhitespace( char inChar )
{
	return( inChar == '\n' || inChar == '\r' || inChar == '\t' || inChar == ' ' );
}


/* --------------------------------------------------------------------------------
	ReplaceEscapeSequences:
		Replace escape sequences in a string. An escape sequence works like in C,
		i.e. it is indicated by a backslash, followed by a single character that
		specifies what character it represents. The character after the backslash
		is matched case-insensitively. The supported chars are listed below, with
		the equivalent xTalk constant next to them:
		
		r		-	return
		l		-	lf
		n		-	newline (whatever TalkTokenizer::GetNewline() returns)
		t		-	tab
		"		-	quote (")
		' '		-	space
		'		-	'
		\		-	\
		
		All others are re-mapped to a space, but officially this behaviour is
		undefined. A single backslash at the very end of the string is dropped.
		
		NOTE:	Escaped character sequences also replace any returns in the string
				with spaces, so when this is on, you _must_ use escape sequences
				to cause a line break. Also, double spaces/tabs will be replaced
				with a single one, use a backslash-space sequence to force
				additional ones. More precisely, every run of unescaped
				returns/line feeds/tabs/spaces becomes one space, unless the run
				is at the start of the string or directly follows a whitespace
				character (which can only have come from an escape sequence),
				in which case the run is removed entirely. This gives HTML-like
				elimination of leading tabs in lines. Runs at the very end of
				the string are treated the same way.
		
		The string is processed in a single pass with a read and a write
		position. The result is never longer than the input.
	
	TAKES:
		vString		-	A C String whose escape sequences are to be replaced.
						A NULL pointer is ignored.
		
	GIVES:
		vString		-	The contents of this string are converted in-place.
	
	REVISIONS:
		2002-03-27	UK	Added apostrophe escape sequence, changed \q to \" for ".
		2002-03-20	UK	Added double space/tab elimination and backslash-space.
		2001-06-29	UK	Created.
   ----------------------------------------------------------------------------- */

void	TalkTokenizer::ReplaceEscapeSequences( char* vString )
{
	std::size_t		vRead = 0,		// Next character of the original text to look at.
					vWrite = 0;		// Where the next character of the result goes. Never ahead of vRead.
	
	if( !vString )
		return;
	
	while( vString[vRead] != 0 )
	{
		char	vCurrChar = vString[vRead];
		
		if( vCurrChar == '\\' )
		{
			char	vEscaped = vString[vRead +1];
			char	vReplacement;
			
			if( vEscaped == 0 )	// Backslash was the last character? Drop it, we're done.
			{
				++vRead;
				continue;
			}
			
			// Cast so tolower() never gets a negative value for characters above 127.
			switch( tolower( (unsigned char) vEscaped ) )	// tolower() so it's case-insensitive.
			{
				case 'n':
					vReplacement = TalkTokenizer::GetNewline();
					break;
				
				case 'l':
					vReplacement = '\n';
					break;
				
				case 'r':
					vReplacement = '\r';
					break;
				
				case 't':
					vReplacement = '\t';
					break;
				
				case '\"':
					vReplacement = '\"';
					break;
				
				case '\\':
					vReplacement = '\\';
					break;
				
				case '\'':
					vReplacement = '\'';
					break;
				
				case ' ':
				default:	// Unknown sequences become a space, too.
					vReplacement = ' ';
					break;
			}
			
			/* Write the replacement and move past both characters. The replacement
				is never looked at again, so an escaped backslash or space is taken
				literally. */
			vString[vWrite++] = vReplacement;
			vRead += 2;
		}
		else if( IsCollapsibleWhitespace( vCurrChar ) )
		{
			/* Start of a run of unescaped whitespace. If what we've written so far
				ends in a non-whitespace character, the run turns into one space.
				At the start of the string, or after whitespace (which must have been
				set explicitly using an escape sequence), the run just goes away. */
			if( vWrite > 0 && !IsCollapsibleWhitespace( vString[vWrite -1] ) )
				vString[vWrite++] = ' ';
			
			while( IsCollapsibleWhitespace( vString[vRead] ) )	// Stops at the terminator at the latest.
				++vRead;
		}
		else
			vString[vWrite++] = vString[vRead++];	// Ordinary character, keep as is.
	}
	
	vString[vWrite] = 0;	// Terminate the (possibly shortened) string.
}


/* --------------------------------------------------------------------------------
	MakeNumberToken:
		Create a number token and append it to our list of tokens. This extracts
		the token into a temporary string and converts it to a number (integer)
		using strtol() with base 10. An empty range (start == end) produces no
		token at all.
		
		The temporary string is released before the token is added and also if
		reading the script fails, so it can't leak when an exception is thrown.
	
	TAKES:
		inScript	-	The script text being tokenized. Its offset is moved to
						read the text and put back afterwards.
		start		-	The starting offset of this token in the script.
		end			-	The ending offset of this token in the script.
	
	THROWS:
		parse_error	-	If the temporary string can't be allocated. Exceptions
						from reading the script or adding the token pass through.
	
	REVISIONS:
		1999-06-22	UK	Created.
   ----------------------------------------------------------------------------- */

void	HyperTalk::MakeNumberToken( TextMunger& inScript, std::size_t start, std::size_t end )
{
	std::size_t	vStringLength = end -start;
	char*		vString;
	long		vNumber = 0;
	
	// Was end of token where no token was?
	if( vStringLength == 0 )
		return;	// Exit.
	
	vString = (char*) malloc( vStringLength +1 );	// +1 for terminator.
	if( !vString )
		throw parse_error( "Out of memory extracting number from script.", start, end );
	
	try
	{
		std::size_t		vOldOffs = inScript.GetOffset();
		
		inScript.SetOffset( start );
		inScript.ReadData( vString, vStringLength );
		inScript.SetOffset( vOldOffs );
	}
	catch( ... )
	{
		free( vString );
		throw;
	}
	
	vString[vStringLength] = 0;	// Terminate string.
	vNumber = strtol( vString, 0, 10 );	// Convert to number. We don't need the end pointer.
	
	// We have the number, so get rid of the text before AddToken() gets a chance to throw.
	free( vString );
	
	AddToken( start, end, TOKEN_TYPE_INTEGER, vNumber );
}


/* --------------------------------------------------------------------------------
	AddToken:
		Create a token, append it to our token list and set its type and value to
		the ones passed. This also gets the offset to its text representation in
		the script which is used in error reporting. This is used by all
		MakeXXToken methods above.
	
	TAKES:
		start		-	The starting offset of this token in the script.
		end			-	The ending offset of this token in the script.
		tokenType	-	Type to assign to this token.
		tokenValue	-	Value to assign to this token, typecast to a long.
	
	REVISIONS:
		1999-06-22	UK	Created.
   ----------------------------------------------------------------------------- */

void	HyperTalk::AddToken( std::size_t start, std::size_t end, TokenTypeEnum tokenType,
							long tokenValue )
{
	HyperToken*		vToken;
	
	vToken = new HyperToken( start, end, tokenType, tokenValue );
	if( vToken )
		mTokens.insert( mTokens.end(), vToken );
	else
		throw parse_error( "Unable to add another token.", start, end );
}


/* --------------------------------------------------------------------------------
	PrintAllTokens:
		Loop over our token list and print each one to the console. This is for
		debugging the tokenizer.
	
	REVISIONS:
		1999-06-22	UK	Created.
   ----------------------------------------------------------------------------- */

void	HyperTalk::PrintAllTokens()
{
	HTLIterator		iterator;
	
	std::cout << "\n\nTokenized form of script:\n";
	
	for( iterator = mTokens.begin(); iterator != mTokens.end(); ++iterator )
	{
		HyperToken*		vToken = (*iterator);
		
		vToken->PrintToken();
	}
	
	std::cout << "\n";
}


/* --------------------------------------------------------------------------------
	RegisterIdentifierToken:
		Associate the textual representation of an identifier with a token type
		in the tokenizer's identifier list. Each text has exactly one entry, so
		registering a text again assigns the new type to that entry. Several
		different texts may be registered for the same token type.
		
		This also keeps mTokenIDSeed larger than every token type registered
		through this call.
	
	TAKES:
		vIdentifierName	-	The textual representation of the identifier.
		vTokenType		-	The token type for the token generated when
							encountering this textual representation.
	
	REVISIONS:
		2000-10-23	UK	Created.
   ----------------------------------------------------------------------------- */

void	TalkTokenizer::RegisterIdentifierToken( const TextMunger& vIdentifierName,
												TokenTypeEnum vTokenType )
{
	// Have the name's hash up to date before it is used as a key, so compares are fast:
	vIdentifierName.RecalcHash();
	mIdentifiers[vIdentifierName] = vTokenType;
	
	// Push the seed past this type if it hasn't got there yet:
	const long		vFirstIDAbove = vTokenType+1;
	
	if( mTokenIDSeed < vFirstIDAbove )
		mTokenIDSeed = vFirstIDAbove;
}


/* --------------------------------------------------------------------------------
	GetIdentifierType:
		Look up the token type that was registered for an identifier's textual
		representation in our identifier list.
	
	TAKES:
		vIdentifierName	-	The textual representation of the identifier whose
							token type you want to look up.
	
	GIVES:
		TokenTypeEnum	-	The token type stored for this identifier, or
							TOKEN_TYPE_IDENTIFIER if the list has no entry for it.
	
	REVISIONS:
		2000-10-23	UK	Created.
   ----------------------------------------------------------------------------- */

TokenTypeEnum	TalkTokenizer::GetIdentifierType( TextMunger& vIdentifierName )
{
	TalkIdentifierList::iterator	vFoundEntry = mIdentifiers.find( vIdentifierName );
	
	if( vFoundEntry != mIdentifiers.end() )	// Got an entry for this text?
		return (*vFoundEntry).second;
	
	return TOKEN_TYPE_IDENTIFIER;	// Not registered, so it's a generic identifier.
}


/* --------------------------------------------------------------------------------
	GetOperatorTokenKind:
		Translate a token into the operator type constant it stands for:
		
			TOKEN_TYPE_OPERATOR			->	the token's value, cast to an operator type.
			TOKEN_TYPE_IS_IDENTIFIER	->	OPERATOR_CHAR_EQUAL
			TOKEN_TYPE_NOT_IDENTIFIER	->	OPERATOR_CHAR_NOT
			TOKEN_TYPE_AND_IDENTIFIER	->	OPERATOR_CHAR_AND
			TOKEN_TYPE_OR_IDENTIFIER	->	OPERATOR_CHAR_OR
		
		Tokens of any other type yield OPERATOR_CHAR_INVALID. That way the
		word-style operators can remain identifier tokens and still be
		recognized as operators by whoever calls this.
	
	TAKES:
		theToken	-	The token you suspect to be an operator. It is
						dereferenced without a NULL check.
	
	GIVES:
		OperatorTypeEnum	-	The operator type equivalent to this token, or
								OPERATOR_CHAR_INVALID if this isn't an operator.
	
	REVISIONS:
		2000-10-23	UK	Created.
   ----------------------------------------------------------------------------- */

OperatorTypeEnum	TalkTokenizer::GetOperatorTokenKind( HyperToken* theToken )
{
	OperatorTypeEnum	vOperatorKind = OPERATOR_CHAR_INVALID;	// Stays like this unless a case below matches.
	
	switch( theToken->GetType() )
	{
		case TOKEN_TYPE_OPERATOR:
			vOperatorKind = (OperatorTypeEnum) theToken->GetValue();	// Value of an operator token is its operator type.
			break;
		
		case TOKEN_TYPE_IS_IDENTIFIER:
			vOperatorKind = OPERATOR_CHAR_EQUAL;
			break;
		
		case TOKEN_TYPE_NOT_IDENTIFIER:
			vOperatorKind = OPERATOR_CHAR_NOT;
			break;
		
		case TOKEN_TYPE_AND_IDENTIFIER:
			vOperatorKind = OPERATOR_CHAR_AND;
			break;
		
		case TOKEN_TYPE_OR_IDENTIFIER:
			vOperatorKind = OPERATOR_CHAR_OR;
			break;
		
		default:
			break;
	}
	
	return vOperatorKind;
}


/* --------------------------------------------------------------------------------
	GetOrCreateToken:
		Return the token type registered for an identifier given as a C string.
		If the look-up yields TOKEN_TYPE_IDENTIFIER (i.e. nothing is registered
		for this text yet), a new token type is fetched from GetNewTokenID() and
		registered for this text, and that new type is returned.
	
	TAKES:
		str		-	NUL-terminated text of the identifier. It is passed to
					strlen() unchecked, so it must not be NULL.
	
	GIVES:
		TokenTypeEnum	-	The token type associated with this text after
							the call.
   ----------------------------------------------------------------------------- */

TokenTypeEnum	TalkTokenizer::GetOrCreateToken( const char* str )
{
	/* Wrap the whole string, without its terminator. This used to pass str +1
		together with strlen(str), which skipped the first character and took
		in the terminating NUL instead, so the key never matched the real text: */
	TextMunger		vMungie( const_cast<char*>(str), strlen(str) );
	TokenTypeEnum	vNum;
	
	vMungie.RecalcHash();
	vNum = TalkTokenizer::GetIdentifierType( vMungie );
	if( vNum == TOKEN_TYPE_IDENTIFIER )	// Unknown? Make a new one!
	{
		vNum = TalkTokenizer::GetNewTokenID();
		TalkTokenizer::RegisterIdentifierToken( vMungie, vNum );
	}
	
	return vNum;
}