Extracting Tokens
Parser operates only with tokens – basic elements of the language. Tokens like identifiers and constants have also additional data structures where actual data is stored.
Type TToken = (
Token_EndOfLine,
Token_Identifier,
Token_Constant,
Token_Plus,
Token_Minus,
Token_Asterisk,
Token_Slash,
Token_Equal,
Token_NotEqual,
Token_Greater,
Token_Lower,
Token_GreaterOrEqual,
Token_LowerOrEqual,
Token_LeftParenthesis,
Token_RightParenthesis,
Token_LeftBracket,
Token_RightBracket,
Token_Period,
Token_Comma,
Token_Colon,
Token_Semicolon,
Token_Caret,
Token_Assignment,
Token_PeriodPeriod,
Token_At,
Token_INLINE_Separator,
Token_EqualAsSeparator,
Token_1B,
Token_AND,
Token_ARRAY,
Token_BEGIN,
Token_CASE,
Token_CONST,
Token_CONSTRUCTOR,
Token_DESTRUCTOR,
Token_DIV,
Token_DO,
Token_DOWNTO,
Token_ELSE,
Token_END,
Token_ASM,
Token_FILE,
Token_FOR,
Token_2B,
Token_FUNCTION,
Token_GOTO,
Token_IF,
Token_IMPLEMENTATION,
Token_IN,
Token_INLINE,
Token_INTERFACE,
Token_33,
Token_LABEL,
Token_MOD,
Token_NIL,
Token_NOT,
Token_OBJECT,
Token_OF,
Token_OR,
Token_PACKED,
Token_PROCEDURE,
Token_PROGRAM,
Token_RECORD,
Token_REPEAT,
Token_SET,
Token_SHL,
Token_SHR,
Token_STRING,
Token_THEN,
Token_TYPE,
Token_TO,
Token_UNIT,
Token_UNTIL,
Token_USES,
Token_VAR,
Token_INHERITED,
Token_WHILE,
Token_WITH,
Token_XOR,
Token_ConstantIdentifier,
Token_TypeIdentifier,
Token_VariableIdentifier,
Token_ProcedureIdentifier,
Token_UnitIdentifier,
Token_OBJECT_PROCEDURE_FUNCTION_TypeIdentifier,
Token_LabelIdentifier,
Token_SystemProcedure,
Token_SystemFunction,
Token_SystemFunctionOrProcedure,
Token_Port,
Token_Mem,
Token_OpenString,
Token_AsmSegmentReference,
Token_AsmOffsetReference,
Token_5E,
Token_5F,
Token_60,
Token_61,
Token_62,
Token_63,
Token_ABSOLUTE,
Token_PUBLIC,
Token_PRIVATE);
This is one of the most important procedures in Turbo Pascal. It reads source file by calling NextNonCommentCharacter function and extracts tokens with associated data. This procedure identifies reserved words, string constants, integer constants, real constants, symbols, identifiers and other Pascal language elements.
Procedure GetNextToken;
Var NextChar, TempPtr: PChar;
Procedure CheckIfIdentifierOrReservedWord;
Var IdLen: Byte;
DummyPtr: Pointer;
DummyWord: Word;
begin
IdLen := 0;
CurrentIdentifierHash := 0;
While NextChar^ in ['0'..'9', '_', 'A'..'Z', 'a'..'z'] do
begin
If IdLen < High (CurrentIdentifier) then
begin
Inc (IdLen);
CurrentIdentifier [IdLen] := NextChar^;
Inc (CurrentIdentifierHash, Byte (NextChar^) and $DF);
end;
Inc (NextChar);
end;
CurrentIdentifier [0] := Char (IdLen);
CurrentIdentifierHash := (CurrentIdentifierHash - IdLen) shl 1;
If not IsIdentifierInSymbolTable (@ReservedWords, Token, DummyPtr, DummyWord) then Token := Token_Identifier;
end;
Function IsRealConstant: Boolean;
Var Word64Lo, Word64Hi, TempWord64Lo, TempWord64Hi: LongRec;
MantissaExponent, MantissaExponentIncrement, Exponent: Word;
ExponentSign: Char;
SavedFirstChar: PChar;
Err: Integer;
RealStr: String [128];
begin
IsRealConstant := False;
SavedFirstChar := NextChar;
Word64Lo.Long := 0;
Word64Hi.Long := 0;
MantissaExponent := 0;
MantissaExponentIncrement := 0;
Repeat
Case NextChar^ of
'0'..'9': begin
If Word64Hi.WordH < $0CCC then
begin
{ Word64 := Word64 * 10 + Digit }
Inc (MantissaExponent, MantissaExponentIncrement);
end else Inc (MantissaExponent, MantissaExponentIncrement + 1);
end;
'.': If MantissaExponentIncrement = 0 then Dec (MantissaExponentIncrement)
else Break;
end;
Inc (NextChar);
until not (NextChar^ in ['0'..'9', '.']);
TempWord64Lo := Word64Lo;
TempWord64Hi := Word64Hi;
If UpCase (NextChar^) = 'E' then
begin
Inc (NextChar);
ExponentSign := NextChar^;
If NextChar^ in ['+', '-'] then Inc (NextChar);
Exponent := 0;
While NextChar^ in ['0'..'9'] do
If Exponent < 1000 then { If exponent is greater than 9999 TPC evaluates up to 4 digits of }
begin { the exponent and then takes the next digit for next token. }
Exponent := 10 * Exponent + Byte (NextChar [0]) - Ord ('0');
Inc (NextChar);
end else Exit; { BUG in TPC: 1e-4932 is not calculated correctly }
If ExponentSign = '-' then Exponent := MantissaExponent - Exponent
else Exponent := MantissaExponent + Exponent;
end;
RealStr [0] := Char (NextChar - SavedFirstChar);
Move (SavedFirstChar^, RealStr [1], Byte (RealStr [0]));
Val (RealStr, NumericConstant.Extended, Err);
IsRealConstant := Err = 0;
end;
Procedure CheckRealConstant;
begin
If not IsRealConstant then
begin
ErrorSourcePosition := NextChar;
Error (ErrorInRealConstant);
end;
ConstantTypeDef := Ptr (SystemUnitSegment, Extended_TypeOffset);
Token := Token_Constant;
end;
Function CheckIntegerNumericConstant (Var NextChar: PChar): TToken;
begin
If not IsIntegerNumericConstant (NextChar) then
begin
ErrorSourcePosition := NextChar;
Error (ErrorInIntegerConstant);
end;
NumericConstant.LongInt := IntegerNumericConstant;
ConstantTypeDef := Ptr (SystemUnitSegment, LongInt_TypeOffset);
CheckIntegerNumericConstant := Token_Constant;
end;
begin
CheckStack;
NextChar := NextNonCommentCharacter;
ErrorSourcePosition := NextChar;
Case NextChar [0] of
#0: begin
Token := Token_EndOfLine;
Inc (NextChar);
end;
'!', '"',
'%', '&',
'('..'/',
':'..'@',
'['..'^',
'`': begin
Case NextChar [0] of
'!',
'"',
'%',
'&',
'?',
'\',
'`': Error (SyntaxError);
'(': If NextChar [1] = '.' then
begin
Token := Token_LeftBracket;
Inc (NextChar);
end else Token := Token_LeftParenthesis;
')': Token := Token_RightParenthesis;
'*': Token := Token_Asterisk;
'+': Token := Token_Plus;
',': Token := Token_Comma;
'-': Token := Token_Minus;
'.': Case NextChar [1] of
'.': begin
Token := Token_PeriodPeriod;
Inc (NextChar);
end;
')': begin
Token := Token_RightBracket;
Inc (NextChar);
end;
else Token := Token_Period;
end;
'/': Token := TokenForSlash;
':': If NextChar [1] = '=' then
begin
Token := Token_Assignment;
Inc (NextChar);
end else Token := Token_Colon;
';': Token := Token_Semicolon;
'<': Case NextChar [1] of
'>': begin
Token := Token_NotEqual;
Inc (NextChar);
end;
'=': begin
Token := Token_LowerOrEqual;
Inc (NextChar);
end;
else Token := Token_Lower;
end;
'=': Token := TokenForEqual;
'>': If NextChar [1] = '=' then
begin
Token := Token_GreaterOrEqual;
Inc (NextChar);
end else Token := Token_Greater;
'@': Token := Token_At;
'[': Token := Token_LeftBracket;
']': Token := Token_RightBracket;
'^': Token := Token_Caret;
end;
Inc (NextChar);
end;
'#',
'''': Token := ProcessStringConstant (NextChar);
'$': Token := CheckIntegerNumericConstant (NextChar);
'0'..'9': begin
TempPtr := NextChar;
Repeat
Inc (TempPtr);
until not (TempPtr [0] in ['0'..'9']);
Case UpCase (TempPtr [0]) of
'E': CheckRealConstant;
'.': Case TempPtr [1] of
'.',
')': Token := CheckIntegerNumericConstant (NextChar);
else CheckRealConstant;
end;
else Token := CheckIntegerNumericConstant (NextChar);
end;
end;
'A'..'Z',
'a'..'z',
'_': CheckIfIdentifierOrReservedWord;
else Error (SyntaxError);
end;
CurrentSourceFile^.CurrentPosition := Ofs (NextChar^);
end;
This function expects specified token and gets the next one or reports error if the expected token is not found.
Procedure ExpectTokenAndGetNext (ExpectedToken: TToken);
Const SpecialErrors = 22;
ExpectedTokenErrors: Array [1..SpecialErrors] of TExpectedTokenError = (
(Token: Token_Identifier; Error: IdentifierExpected),
(Token: Token_LabelIdentifier; Error: LabelIdentifierExpected),
(Token: Token_BEGIN; Error: BEGIN_expected),
(Token: Token_END; Error: END_expected),
(Token: Token_DO; Error: DO_expected),
(Token: Token_OF; Error: OF_expected),
(Token: Token_INTERFACE; Error: INTERFACE_expected),
(Token: Token_THEN; Error: THEN_expected),
(Token: Token_IMPLEMENTATION; Error: IMPLEMENTATION_expected),
(Token: Token_UNIT; Error: UNIT_expected),
(Token: Token_Semicolon; Error: SemicolonExpected),
(Token: Token_Colon; Error: ColonExpected),
(Token: Token_Comma; Error: CommaExpected),
(Token: Token_LeftParenthesis; Error: LeftParenthesisExpected),
(Token: Token_RightParenthesis; Error: RightParenthesisExpected),
(Token: Token_Equal; Error: EqualExpected),
(Token: Token_EqualAsSeparator; Error: EqualExpected),
(Token: Token_Assignment; Error: AssignmentExpected),
(Token: Token_LeftBracket; Error: LeftBracketExpected),
(Token: Token_RightBracket; Error: RightBracketExpected),
(Token: Token_Period; Error: PeriodExpected),
(Token: Token_PeriodPeriod; Error: PeriodPeriodExpected));
Var N: Byte;
begin
If ExpectedToken <> Token then
begin
For N := 1 to SpecialErrors do
If ExpectedTokenErrors [N].Token = ExpectedToken then Error (ExpectedTokenErrors [N].Error);
Error (SyntaxError);
end;
GetNextToken;
end;
Another function that checks for specified token but doesn’t report error if it is not found.
Function CheckAndGetNextToken (TokenToCheck: TToken): Boolean;
begin
CheckAndGetNextToken := False;
If Token <> TokenToCheck then Exit;
CheckAndGetNextToken := True;
GetNextToken;
end;