FoxScanner.Mod 51 KB


  1. MODULE FoxScanner; (** AUTHOR "fof & fn"; PURPOSE "Oberon Compiler: Scanner"; **)
  2. (* (c) fof ETH Zürich, 2009 *)
  3. IMPORT Streams, Strings, Diagnostics, Basic := FoxBasic, D := Debugging, Commands, StringPool;
  CONST
    Trace = FALSE; (* debugging output *)

    (* overall scanner limitation *)
    MaxIdentifierLength* = 128; (* IdentifierString below is declared with one extra character *)

    (* parametrization of numeric scanner: *)
    MaxHexDigits* = 8; (* maximal hexadecimal longint length *)
    MaxHugeHexDigits* = 16; (* maximal hexadecimal hugeint length *)
    MaxRealExponent* = 38; (* maximal real exponent *)
    MaxLongrealExponent* = 308; (* maximal longreal exponent *)

    (* scanner constants: special characters; EOT is the look-ahead character at end of text *)
    EOT* = 0X; LF* = 0AX; CR* = 0DX; TAB* = 09X; ESC* = 1BX;

  TYPE
    StringType* = Strings.String; (* heap-allocated string as provided by module Strings *)
    IdentifierType* = StringPool.Index; (* identifiers are represented by their index in the string pool *)
    IdentifierString*= ARRAY MaxIdentifierLength+1 OF CHAR; (* textual form of an identifier *)
  19. CONST
  20. (** symbols *)
  21. (*
  22. note: order of symbols is important for the parser, do not modify without looking it up
  23. FoxProgTools.Enum --export --linefeed=6
  24. None
  25. (* RelationOps: Equal ... Is *)
  26. Equal DotEqual Unequal DotUnequal
  27. Less DotLess LessEqual DotLessEqual Greater DotGreater GreaterEqual DotGreaterEqual
  28. LessLessQ GreaterGreaterQ Questionmarks ExclamationMarks
  29. In Is
  30. (* MulOps: Times ... And *)
  31. Times TimesTimes DotTimes PlusTimes Slash Backslash DotSlash Div Mod And
  32. (* AddOps: Or ... Minus *)
  33. Or Plus Minus
  34. (* Prefix Unary Operators Plus ... Not *)
  35. Not
  36. (* expressions may start with Plus ... Identifier *)
  37. LeftParenthesis LeftBracket LeftBrace Number Character String Nil Imag True False Self Result New Identifier
  38. (* statements may start with Self ... Begin *)
  39. If Case While Repeat For Loop With Exit Await Return Ignore Begin
  40. (* symbols, expressions and statements cannot start with *)
  41. Semicolon Transpose RightBrace RightBracket RightParenthesis
  42. Questionmark ExclamationMark
  43. LessLess GreaterGreater
  44. Upto Arrow Period Comma Colon Of Then Do To By Becomes Bar End Else Elsif Until Finally
  45. (* declaration elements *)
  46. Code Const Type Var Out Procedure Operator Import Definition Module Cell CellNet Extern
  47. (* composite type symbols *)
  48. Array Object Record Pointer Enum Port Address Size Alias
  49. (* assembler constants *)
  50. Ln PC PCOffset
  51. (* number types *)
  52. Shortint Integer Longint Hugeint Real Longreal
  53. Comment EndOfText Escape
  54. ~
  55. *)
    (* symbol numbers; the numeric order defines the ranges named in the group comments below, so it must stay in sync with the enumeration listed in the comment above *)
    None*= 0;
    (* RelationOps: Equal ... Is *)
    Equal*= 1; DotEqual*= 2; Unequal*= 3; DotUnequal*= 4; Less*= 5; DotLess*= 6;
    LessEqual*= 7; DotLessEqual*= 8; Greater*= 9; DotGreater*= 10; GreaterEqual*= 11; DotGreaterEqual*= 12;
    LessLessQ*= 13; GreaterGreaterQ*= 14; Questionmarks*= 15; ExclamationMarks*= 16; In*= 17; Is*= 18;
    (* MulOps: Times ... And *)
    Times*= 19; TimesTimes*= 20; DotTimes*= 21; PlusTimes*= 22; Slash*= 23; Backslash*= 24;
    DotSlash*= 25; Div*= 26; Mod*= 27; And*= 28;
    (* AddOps: Or ... Minus *)
    Or*= 29; Plus*= 30; Minus*= 31;
    (* Prefix Unary Operators Plus ... Not *)
    Not*= 32;
    (* expressions may start with Plus ... Identifier *)
    LeftParenthesis*= 33; LeftBracket*= 34; LeftBrace*= 35; Number*= 36; Character*= 37; String*= 38;
    Nil*= 39; Imag*= 40; True*= 41; False*= 42; Self*= 43; Result*= 44;
    New*= 45; Identifier*= 46;
    (* statements may start with Self ... Begin *)
    If*= 47; Case*= 48; While*= 49; Repeat*= 50; For*= 51; Loop*= 52;
    With*= 53; Exit*= 54; Await*= 55; Return*= 56; Ignore*= 57; Begin*= 58;
    (* symbols, expressions and statements cannot start with *)
    Semicolon*= 59; Transpose*= 60; RightBrace*= 61; RightBracket*= 62; RightParenthesis*= 63; Questionmark*= 64;
    ExclamationMark*= 65; LessLess*= 66; GreaterGreater*= 67; Upto*= 68; Arrow*= 69; Period*= 70;
    Comma*= 71; Colon*= 72; Of*= 73; Then*= 74; Do*= 75; To*= 76;
    By*= 77; Becomes*= 78; Bar*= 79; End*= 80; Else*= 81; Elsif*= 82;
    Until*= 83; Finally*= 84;
    (* declaration elements *)
    Code*= 85; Const*= 86; Type*= 87; Var*= 88; Out*= 89; Procedure*= 90;
    Operator*= 91; Import*= 92; Definition*= 93; Module*= 94; Cell*= 95; CellNet*= 96;
    Extern*= 97;
    (* composite type symbols *)
    Array*= 98; Object*= 99; Record*= 100; Pointer*= 101; Enum*= 102; Port*= 103;
    Address*= 104; Size*= 105; Alias*= 106;
    (* assembler constants *)
    Ln*= 107; PC*= 108; PCOffset*= 109;
    (* number types; also used as values of Token.numberType *)
    Shortint*= 110; Integer*= 111; Longint*= 112; Hugeint*= 113; Real*= 114; Longreal*= 115;
    Comment*= 116; EndOfText*= 117; Escape*= 118;

    (* string delimiters *)
    SingleQuote = 27X; DoubleQuote* = 22X;
    Ellipsis = 7FX; (* used in Scanner.GetNumber to return with ".." when reading an interval like 3..5 *)

    (* values of Scanner.case: which keyword spelling the scanned source uses *)
    Uppercase*=0;
    Lowercase*=1;
    Unknown*=2;
  TYPE
    (* keywords book keeping *)
    Keyword* = ARRAY 32 OF CHAR;

    (** two-way keyword map: the inherited hash table maps a string pool index to a keyword number,
      the array "table" maps a keyword number back to its string pool index (-1 = unused slot) *)
    KeywordTable* = OBJECT(Basic.HashTableInt); (* string -> index *)
    VAR table: POINTER TO ARRAY OF LONGINT;

      (** create a table for "size" keywords, all reverse entries marked unused *)
      PROCEDURE &InitTable*(size: LONGINT);
      VAR slot: LONGINT;
      BEGIN
        Init(size); NEW(table,size);
        slot := 0;
        WHILE slot < size DO table[slot] := -1; INC(slot) END;
      END InitTable;

      (** keyword number registered for the identifier, -1 if there is none *)
      PROCEDURE IndexByIdentifier*(identifier: IdentifierType): LONGINT;
      BEGIN
        IF ~Has(identifier) THEN RETURN -1 END; (* unknown identifier *)
        RETURN GetInt(identifier)
      END IndexByIdentifier;

      (** keyword number registered for the given name, -1 if there is none *)
      PROCEDURE IndexByString*(CONST name: ARRAY OF CHAR): LONGINT;
      VAR poolIndex: LONGINT;
      BEGIN
        StringPool.GetIndex(name,poolIndex);
        IF ~Has(poolIndex) THEN RETURN -1 END; (* unknown name *)
        RETURN GetInt(poolIndex)
      END IndexByString;

      (** reverse lookup: string pool index stored for keyword number "index" (-1 for an unused slot) *)
      PROCEDURE IdentifierByIndex*(index: LONGINT; VAR identifier: IdentifierType);
      BEGIN
        identifier := table[index]
      END IdentifierByIndex;

      (** reverse lookup: text stored for keyword number "index", empty string for an unused slot *)
      PROCEDURE StringByIndex*(index: LONGINT; VAR name: ARRAY OF CHAR);
      VAR poolIndex: LONGINT;
      BEGIN
        poolIndex := table[index];
        IF poolIndex >= 0 THEN
          StringPool.GetString(poolIndex,name);
        ELSE
          name := ""
        END;
      END StringByIndex;

      (** register "name" under keyword number "index" in both directions *)
      PROCEDURE PutString*(CONST name: ARRAY OF CHAR; index: LONGINT);
      VAR poolIndex: LONGINT;
      BEGIN
        StringPool.GetIndex(name,poolIndex);
        table[index] := poolIndex;
        PutInt(poolIndex,index);
      END PutString;

    END KeywordTable;
  TYPE
    Symbol*=LONGINT; (* one of the symbol constants None .. Escape *)
    Position*= Basic.Position;
    (**
      token: data structure for the data transfer of the last read input from the scanner to the parser
    **)
    Token*= RECORD
      position*: Position; (* source position of the token *)
      symbol*: Symbol; (* symbol of token *)
      identifier*: IdentifierType; (* identifier *)
      identifierString*: IdentifierString; (* cache of identifier's string; GetNumber also stores the text of a decimal / 'H' / 'X' / real literal here *)
      string*: StringType; (* string or identifier *)
      stringLength*: LONGINT; (* length of string, if stringLength = 2 then this may be interpreted as character and integer = ORD(ch) *)
      numberType*: LONGINT; (* Integer, Hugeint, Real or Longreal *)
      integer*: LONGINT; (* value of an integer literal (truncated if numberType = Hugeint) *)
      hugeint*: HUGEINT; (*! unify longint and hugeint *)
      character*: CHAR; (* value of a character literal such as 0DX *)
      real*: LONGREAL; (* value of a Real or Longreal literal *)
    END;
  167. StringMaker* = OBJECT (* taken from TF's scanner *)
  168. VAR length : LONGINT;
  169. data : StringType;
  170. PROCEDURE &Init*(initialSize : LONGINT);
  171. BEGIN
  172. IF initialSize < 256 THEN initialSize := 256 END;
  173. NEW(data, initialSize); length := 0;
  174. END Init;
  175. PROCEDURE Add*(CONST buf: ARRAY OF CHAR; ofs, len: LONGINT; propagate: BOOLEAN; VAR res: WORD);
  176. VAR i : LONGINT; n: StringType;
  177. BEGIN
  178. IF length + len + 1 >= LEN(data) THEN
  179. NEW(n, LEN(data) + len + 1); FOR i := 0 TO length - 1 DO n[i] := data[i] END;
  180. data := n
  181. END;
  182. WHILE len > 0 DO
  183. data[length] := buf[ofs];
  184. INC(ofs); INC(length); DEC(len)
  185. END;
  186. data[length] := 0X;
  187. END Add;
  188. (* remove last n characters *)
  189. PROCEDURE Shorten*(n : LONGINT);
  190. BEGIN
  191. DEC(length, n);
  192. IF length < 0 THEN length := 0 END;
  193. IF length > 0 THEN data[length - 1] := 0X ELSE data[length] := 0X END
  194. END Shorten;
  195. PROCEDURE Clear*;
  196. BEGIN
  197. data[0] := 0X;
  198. length := 0
  199. END Clear;
  200. PROCEDURE GetWriter*() : Streams.Writer;
  201. VAR w : Streams.Writer;
  202. BEGIN
  203. NEW(w, SELF.Add, 256);
  204. RETURN w
  205. END GetWriter;
  206. PROCEDURE GetReader*(): Streams.Reader;
  207. VAR r: Streams.StringReader;
  208. BEGIN
  209. NEW(r, 256);
  210. r.Set(data^);
  211. RETURN r
  212. END GetReader;
  213. PROCEDURE GetString*(VAR len: LONGINT) : StringType;
  214. BEGIN
  215. len := length;
  216. RETURN data
  217. END GetString;
  218. PROCEDURE GetStringCopy*(VAR len: LONGINT): StringType;
  219. VAR new: StringType;
  220. BEGIN
  221. len := length;
  222. NEW(new,len+1);
  223. COPY(data^,new^);
  224. RETURN new
  225. END GetStringCopy;
  226. END StringMaker;
  227. (** scanner reflects the following EBNF
  228. Token = String | Symbol | Number | Keyword | Identifier.
  229. Symbol = | '#' | '&' | '(' ['*' any '*' ')'] | ')' | '*'['*'] | '+'['*'] | ',' | '-' | '.' [ '.' | '*' | '/' | '=' | '#' | '>'['='] | '<' ['='] ]
  230. | '/' | ':' ['='] | ';' | '<' ['=' | '<' ['?'] ] | '=' | '>' [ '=' | '>' ['?']]
  231. | '[' | ']' | '^' | '{' | '|' | '}' | '~' | '\' | '`' | '?' ['?'] | '!' ['!']
  232. Identifier = Letter {Letter | Digit | '_'}.
  233. Letter = 'A' | 'B' | .. | 'Z' | 'a' | 'b' | .. | 'z'.
  234. Digit = '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' .
  235. String = '"' {Character} '"' | "'" {Character} "'".
  236. Character = Digit [HexDigit] 'X'.
  237. Number = Integer | Real.
  238. Integer = Digit {Digit} | Digit {HexDigit} 'H' | '0x' {HexDigit}.
  239. Real = Digit {Digit} '.' {Digit} [ScaleFactor].
  240. ScaleFactor = ('E' | 'D') ['+' | '-'] digit {digit}.
  241. HexDigit = Digit | 'A' | 'B' | 'C' | 'D' | 'E' | 'F'.
  242. **)
  243. Scanner* = OBJECT
  244. VAR
  245. (* helper state information *)
  246. source-: StringType;
  247. reader-: Streams.Reader; (* source *)
  248. diagnostics: Diagnostics.Diagnostics; (* error logging *)
  249. ch-: CHAR; (* look-ahead character *)
  250. position-: Position; (* current position *)
  251. error-: BOOLEAN; (* if error occured during scanning *)
  252. firstIdentifier: BOOLEAN; (* support of lower vs. upper case keywords *)
  253. case-: LONGINT;
  254. stringWriter: Streams.Writer;
  255. stringMaker: StringMaker;
  256. useLineNumbers*: BOOLEAN;
  (**
    set up a scanner
      source: name of the source code for reference in error outputs
      reader: input stream, NIL yields an immediate end of text
      pos: reference position (offset) of the input stream, for error output
      diagnostics: error output object
  **)
  PROCEDURE & InitializeScanner*( CONST source: ARRAY OF CHAR; reader: Streams.Reader; pos: Position; diagnostics: Diagnostics.Diagnostics );
  BEGIN
    (* plain state *)
    error := FALSE;
    firstIdentifier := TRUE;
    case := Unknown;
    ch := " ";
    SELF.diagnostics := diagnostics;
    SELF.reader := reader;

    (* string building helpers *)
    NEW(stringMaker,1024);
    stringWriter := stringMaker.GetWriter();

    (* private copy of the source name *)
    NEW(SELF.source, Strings.Length(source)+1);
    COPY (source, SELF.source^);

    (* start one character before pos: reading the look-ahead character advances the position again *)
    SELF.position := pos;
    position.start := position.start - 1;
    IF reader # NIL THEN GetNextCharacter ELSE ch := EOT END;

    IF Trace THEN D.Str( "New scanner " ); D.Ln; END;
    IF source = "" THEN SELF.position.reader := reader END;
    useLineNumbers := FALSE;
  END InitializeScanner;
  (** forget the detected keyword case and treat the next identifier as the first one again *)
  PROCEDURE ResetCase*; (*! needs a better naming ! *)
  BEGIN
    case := Unknown;
    firstIdentifier := TRUE;
  END ResetCase;
  (** set the keyword case explicitly: Uppercase, Lowercase or Unknown *)
  PROCEDURE SetCase*(c: LONGINT);
  BEGIN
    SELF.case := c
  END SetCase;
  (** report an error with a free text message at the current position and mark the scanner as erroneous **)
  PROCEDURE ErrorS(CONST msg: ARRAY OF CHAR);
  BEGIN
    Basic.Error(SELF.diagnostics, SELF.source^, SELF.position, msg);
    SELF.error := TRUE
  END ErrorS;
  (** report an error given by its error code at the current position and mark the scanner as erroneous **)
  PROCEDURE Error( code: INTEGER );
  BEGIN
    Basic.ErrorC(SELF.diagnostics, SELF.source^, SELF.position, code, "");
    SELF.error := TRUE
  END Error;
  (** read the next look-ahead character into ch and advance the position; end of text results in ch = EOT **)
  PROCEDURE GetNextCharacter*;
  BEGIN
    reader.Char(ch);
    position.start := position.start + 1;
    IF ch = LF THEN
      (* a new line begins behind the line feed *)
      position.line := position.line + 1;
      position.linepos := position.start + 1
    END;
    (* no check of reader.res here: Streams returns 0X if reading failed. Should Streams.Reader.Char ever
      change in that respect, "IF reader.res # Streams.Ok THEN ch := EOT END" has to be added *)
  END GetNextCharacter;
  312. (*
  313. The following is an implementation of the KMP algorithm used in order to traverse strings until some pattern occurs.
  314. It is not necessary for our implementation of string escape sequences, because the first character of the pattern does not occur in the pattern elsewhere
  315. I found the code useful and keep it here for the time being....
  316. (* generate a table to be able to quickly search for string containing overlaps - KMP algorithm *)
  317. PROCEDURE MakeOverlapTable*(CONST pattern: ARRAY OF CHAR; VAR table: ARRAY OF LONGINT);
  318. VAR i, cnd: LONGINT;
  319. BEGIN
  320. ASSERT(pattern[0] # 0X);
  321. (* if first character did not match: reset search *)
  322. table[0] := -1;
  323. (* if second character did not match: compare to first *)
  324. IF pattern[1] # 0X THEN
  325. table[1] := 0;
  326. END;
  327. (* for all other characters: switch back to previous overlay in pattern *)
  328. i := 2; cnd := 0;
  329. WHILE(pattern[i] # 0X) DO
  330. (* do patterns [i-cnd, i-1] match with pattern[0.. cnd] ? *)
  331. IF pattern[i-1] = pattern[cnd] THEN
  332. INC(cnd); table[i] := cnd; INC(i);
  333. (* no, switch back to last overlap, if possible *)
  334. ELSIF cnd > 0 THEN cnd := table[cnd]
  335. (* not possible: restart at beginning *)
  336. ELSE table[i] := 0; INC(i)
  337. END;
  338. END;
  339. END MakeOverlapTable;
  340. (* using KMP substring search algorithm consume and reproduce all characters of a string until endString *)
  341. PROCEDURE GetString(CONST endString: ARRAY OF CHAR);
  342. VAR escapePos: LONGINT; ech: CHAR; i: LONGINT; table: ARRAY 16 OF LONGINT;
  343. next: LONGINT;
  344. PROCEDURE Append(ch :CHAR);
  345. BEGIN
  346. IF ch = 0X THEN
  347. ErrorS("Unexpected end of text in string"); error := TRUE
  348. ELSE
  349. stringWriter.Char(ch)
  350. END;
  351. END Append;
  352. BEGIN
  353. MakeOverlapTable(endString, table);
  354. (* traverse *)
  355. escapePos := 0; ech := endString[0];
  356. GetNextCharacter;
  357. REPEAT
  358. IF ch = ech THEN
  359. INC(escapePos); ech := endString[escapePos];
  360. GetNextCharacter;
  361. ELSIF escapePos = 0 THEN (* frequent case *)
  362. Append(ch); GetNextCharacter;
  363. ELSE
  364. (* overlaps ? *)
  365. next := table[escapePos];
  366. IF next < 0 THEN next := 0 END;
  367. (* account for "forgotten" characters *)
  368. FOR i := 0 TO escapePos-1-next DO
  369. Append(endString[i]);
  370. END;
  371. (* to next overlapping ? *)
  372. escapePos := table[escapePos];
  373. (* no overlapping *)
  374. IF escapePos < 0 THEN
  375. Append(ch);
  376. escapePos := 0;
  377. GetNextCharacter;
  378. END;
  379. ech := endString[escapePos];
  380. END;
  381. UNTIL (ch = EOT) OR (ech = 0X);
  382. END GetString;
  383. *)
  (* simple case can be utilized when endString does not contain first character, which is the case for our string convention *)
  (** consume an escaped string: skip the opening delimiter in ch, write all characters to stringWriter until
    the complete sequence endString has been read, and consume that sequence as well.
    useControl: interpret the control sequences \\ \" \n (written as CR LF) and \t.
    Scanning of this string stops after an unknown control sequence. Reaching end of text before the
    terminating sequence is reported once, at the position where the string started.
    A local flag ends the loop, not the scanner-wide "error" field: that field is never reset and would
    otherwise cut short every string following any earlier error *)
  PROCEDURE ConsumeStringUntil(CONST endString: ARRAY OF CHAR; useControl: BOOLEAN);
  VAR escapePos: LONGINT; ech: CHAR; i: LONGINT; startPosition: LONGINT; failed: BOOLEAN;
  CONST
    Control = '\';
    Delimiter = '"';
  BEGIN
    (* traverse; ech is the next character of endString that is expected, 0X when endString is complete *)
    escapePos := 0; ech := endString[0]; startPosition := position.start;
    failed := FALSE;
    GetNextCharacter; (* skip opening delimiter *)
    WHILE (ch # EOT) & (ech # 0X) & ~failed DO
      IF ch = ech THEN
        INC(escapePos); ech := endString[escapePos];
        GetNextCharacter;
      ELSIF useControl & (ch = Control) THEN
        GetNextCharacter;
        IF (ch = Control) OR (ch = Delimiter) THEN
          stringWriter.Char(ch); GetNextCharacter
        ELSIF ch = 'n' THEN
          stringWriter.Char(CR); stringWriter.Char(LF); GetNextCharacter
        ELSIF ch = 't' THEN
          stringWriter.Char(TAB); GetNextCharacter
        ELSIF ch # EOT THEN
          ErrorS("Unknown control sequence"); failed := TRUE; GetNextCharacter
        END; (* ch = EOT: leave the loop, reported below *)
      ELSIF escapePos = 0 THEN (* frequent case *)
        stringWriter.Char(ch); GetNextCharacter;
      ELSE
        (* account for "forgotten" characters: the partial match was ordinary string content *)
        FOR i := 0 TO escapePos-1 DO
          stringWriter.Char(endString[i]);
        END;
        (* restart; ch is examined again in the next iteration *)
        ech := endString[0]; escapePos := 0;
      END;
    END;
    (* a string that was terminated properly is fine even if the text ends right behind it *)
    IF (ch = EOT) & (ech # 0X) & ~failed THEN
      position.start := startPosition; ErrorS("Unexpected end of text in string")
    END;
  END ConsumeStringUntil;
  (** read an escaped string of the form \"...."\ or \x"...."x\ with an arbitrary marker character x.
    The leading backslash has been consumed by the caller. Control sequences are only interpreted
    in the form without marker. The result is stored in token.string / token.stringLength *)
  PROCEDURE GetEscapedString(VAR token: Token);
  VAR terminator: ARRAY 4 OF CHAR; marker: CHAR; n: LONGINT;
  BEGIN
    (* backslash already consumed *)
    stringMaker.Clear;
    marker := 0X;
    IF ch # '"' THEN marker := ch; GetNextCharacter END;
    ASSERT((ch = '"') OR (ch = "'"));

    (* terminating sequence = quote [marker] backslash *)
    terminator[0] := ch; n := 1;
    IF marker # 0X THEN terminator[n] := marker; INC(n) END;
    terminator[n] := '\';
    terminator[n+1] := 0X;

    ConsumeStringUntil(terminator, marker = 0X);
    stringWriter.Char(0X);
    stringWriter.Update;
    token.string := stringMaker.GetStringCopy(token.stringLength);
  END GetEscapedString;
  (** get a string starting at current position
    string = {'"' {Character} '"'} | {"'" {Character} "'"}.
    multiLine: characters below " " (such as line breaks) are permitted within the string
    multiString: several strings in a row, separated by blanks only, are concatenated to one string
    useControl: the control sequences \\, \<quote>, \n (written as CR LF) and \t are interpreted
    The result (including a terminating 0X) is stored in token.string / token.stringLength
  **)
  PROCEDURE GetString(VAR token: Token; multiLine, multiString, useControl: BOOLEAN);
  VAR quote: CHAR; aborted, closed: BOOLEAN;
  CONST control = '\';

    (* write one character of the string, 0X is not permitted and aborts reading *)
    PROCEDURE Append(ch :CHAR);
    BEGIN
      IF ch # 0X THEN
        stringWriter.Char(ch)
      ELSE
        ErrorS("Unexpected end of text in string"); aborted := TRUE
      END;
    END Append;

  BEGIN
    stringMaker.Clear;
    quote := ch; aborted := FALSE;
    REPEAT
      (* one string: from the opening quote in ch to the closing quote, end of text or an error *)
      closed := FALSE;
      WHILE ~closed & ~aborted DO
        GetNextCharacter;
        IF (ch = quote) OR (ch = EOT) THEN
          closed := TRUE
        ELSIF useControl & (ch = control) THEN
          GetNextCharacter;
          IF (ch = control) OR (ch = quote) THEN
            Append(ch)
          ELSIF ch = 'n' THEN
            Append(CR); Append(LF);
          ELSIF ch = 't' THEN
            Append(TAB)
          ELSE
            ErrorS("Unknown control sequence")
          END;
        ELSIF ~multiLine & (ch < " ") THEN
          Error( Basic.StringIllegalCharacter ); closed := TRUE
        ELSE
          Append(ch)
        END;
      END;
      IF ch = EOT THEN
        ErrorS("Unexpected end of text in string")
      ELSE
        GetNextCharacter; (* skip closing quote *)
        IF multiString THEN SkipBlanks END;
      END;
    UNTIL ~multiString OR (ch # quote);
    stringWriter.Char(0X);
    stringWriter.Update;
    token.string := stringMaker.GetStringCopy(token.stringLength);
  END GetString;
  (**
    Identifier = Letter {Letter | Digit | '_'} .
    Letter = 'A' | 'B' | .. | 'Z' | 'a' | 'b' | .. | 'z' .
    Digit = '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'.

    '_' is the underscore character

    Reads the identifier starting with the current character into token.identifierString and stores its
    string pool index in token.identifier. Up to MaxIdentifierLength characters are accepted. A longer
    identifier is reported as error, cut to MaxIdentifierLength characters and consumed completely, such
    that its tail is not scanned as a further identifier.
  **)
  PROCEDURE GetIdentifier( VAR token: Token );
  VAR i: LONGINT;
  BEGIN
    i := 0;
    REPEAT token.identifierString[i] := ch; INC( i ); GetNextCharacter UNTIL reservedCharacter[ORD( ch )] OR (i = MaxIdentifierLength);
    IF ~reservedCharacter[ORD( ch )] & (ch # EOT) THEN
      (* stopped because the buffer is full, not because the identifier ended *)
      Error( Basic.IdentifierTooLong );
      REPEAT GetNextCharacter UNTIL reservedCharacter[ORD( ch )] OR (ch = EOT)
    END;
    token.identifierString[i] := 0X; (* i <= MaxIdentifierLength, the array provides one more character *)
    StringPool.GetIndex(token.identifierString, token.identifier);
  END GetIdentifier;
  (**
    Number = Integer | Real.
    Integer = Digit {Digit} | Digit {HexDigit} 'H' | '0x' {HexDigit}.
    Real = Digit {Digit} '.' {Digit} [ScaleFactor].
    ScaleFactor = ('E' | 'D') ['+' | '-'] digit {digit}.
    HexDigit = Digit | 'A' | 'B' | 'C' | 'D' | 'E' | 'F'.
    Digit = '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9' .

    Additionally accepted: binary numbers '0b' {'0' | '1'} and the digit separator "'".
    Reads the literal starting with the digit in ch and returns Number or, for literals of the form 0DX, Character.
    token.numberType, token.integer, token.hugeint, token.real and token.character are set according to the
    kind of literal. For decimal, 'H', 'X' and real literals the text of the literal is stored in
    token.identifierString (cut if longer than the buffer).
    If the number is followed by "..", ch = Ellipsis on return.
  **)
  PROCEDURE GetNumber(VAR token: Token): Symbol;
  VAR i, nextInt, m, n, d, e, si: LONGINT;
    dig: ARRAY 24 OF CHAR;
    f: LONGREAL; expCh: CHAR; neg, long: BOOLEAN;
    result: Symbol;
    hugeint, tenh, number: HUGEINT;
    digits: LONGINT;

    (** 10^e **)
    PROCEDURE Ten( e: LONGINT ): LONGREAL;
    VAR x, p: LONGREAL;
    BEGIN
      x := 1; p := 10;
      WHILE e > 0 DO
        IF ODD( e ) THEN x := x * p END;
        e := e DIV 2;
        IF e > 0 THEN p := p * p END (* prevent overflow *)
      END;
      RETURN x
    END Ten;

    (** return decimal number associated to character ch , error if none **)
    PROCEDURE Decimal( ch: CHAR ): LONGINT;
    BEGIN (* ("0" <= ch) & (ch <= "9") OR ("A" <= ch) & (ch <= "F") *)
      IF ch <= "9" THEN RETURN ORD( ch ) - ORD( "0" ) ELSE Error( Basic.NumberIllegalCharacter ); RETURN 0 END
    END Decimal;

    (** return hexadecimal number associated to character ch, error if none **)
    PROCEDURE Hexadecimal( ch: CHAR ): LONGINT;
    BEGIN
      IF ch <= "9" THEN RETURN ORD( ch ) - ORD( "0" )
      ELSIF ch <= "F" THEN RETURN ORD( ch ) - ORD( "A" ) + 10
      ELSIF ch <= "f" THEN RETURN ORD( ch ) - ORD( "a" ) + 10
      ELSE Error( Basic.NumberIllegalCharacter ); RETURN 0
      END
    END Hexadecimal;

    PROCEDURE IsHexDigit(ch: CHAR): BOOLEAN;
    BEGIN
      RETURN (ch >= "0") & (ch <= "9") OR (ch >= "a") & (ch <="f") OR (ch >= "A") & (ch <= "F")
    END IsHexDigit;

    PROCEDURE IsBinaryDigit(ch: CHAR): BOOLEAN;
    BEGIN
      RETURN (ch >= "0") & (ch <= "1")
    END IsBinaryDigit;

    (** append c to the text of the literal in token.identifierString; characters that do not fit are dropped,
      one character always remains free for the terminating 0X **)
    PROCEDURE Note( c: CHAR );
    BEGIN
      IF si < MaxIdentifierLength THEN token.identifierString[si] := c; INC( si ) END
    END Note;

  BEGIN (* ("0" <= ch) & (ch <= "9") *)
    result := Number;
    i := 0; m := 0; n := 0; d := 0; si := 0; long := FALSE;
    IF (ch = "0") THEN
      IF (reader.Peek() = "x") THEN (* hex number *)
        digits := 0; number := 0;
        GetNextCharacter; GetNextCharacter; (* skip "0x" *)
        IF (ch = "'")& IsHexDigit(reader.Peek()) THEN GetNextCharacter END;
        WHILE IsHexDigit(ch) DO
          number := number * 10H + Hexadecimal(ch);
          INC(digits);
          GetNextCharacter;
          IF (ch = "'") & IsHexDigit(reader.Peek()) THEN GetNextCharacter END;
        END;
        token.hugeint := number;
        token.integer := SHORT(number);
        IF (digits > MaxHexDigits) OR (digits = MaxHexDigits) & (number > MAX(LONGINT)) THEN
          token.numberType := Hugeint
        ELSE
          token.numberType := Integer
        END;
        RETURN result;
      ELSIF reader.Peek() = "b" THEN (* binary number *)
        digits := 0; number := 0;
        GetNextCharacter; GetNextCharacter; (* skip "0b" *)
        IF (ch = "'") & IsBinaryDigit(reader.Peek()) THEN GetNextCharacter END;
        WHILE IsBinaryDigit(ch) DO
          number := number * 2;
          INC(digits);
          IF ch = "1" THEN INC(number) END;
          GetNextCharacter;
          IF (ch = "'") & IsBinaryDigit(reader.Peek()) THEN GetNextCharacter END;
        END;
        token.hugeint := number;
        token.integer := SHORT(number);
        IF digits > 32 THEN
          token.numberType := Hugeint
        ELSE
          token.numberType := Integer
        END;
        RETURN result;
      END;
    END;

    (* i: number of digits read, m: number of digits without leading zeros, n: number of digits stored in dig,
      d: number of digits in front of the decimal point (0 = no decimal point read) *)
    LOOP (* read mantissa *)
      IF ("0" <= ch) & (ch <= "9") OR (d = 0) & ("A" <= ch) & (ch <= "F") THEN
        IF (m > 0) OR (ch # "0") THEN (* ignore leading zeros *)
          IF n < LEN( dig ) THEN dig[n] := ch; INC( n ) END;
          INC( m )
        END;
        Note( ch ); GetNextCharacter; INC( i )
      ELSIF ch = "." THEN
        Note( ch ); GetNextCharacter;
        IF ch = "." THEN ch := Ellipsis; EXIT
        ELSIF d = 0 THEN (* i > 0 *) d := i
        ELSE Error( Basic.NumberIllegalCharacter )
        END
      ELSIF ch = "'" THEN GetNextCharacter; (* ignore *)
      ELSE EXIT
      END
    END; (* 0 <= n <= m <= i, 0 <= d <= i *)

    IF d = 0 THEN (* integer *)
      IF n = m THEN (* all significant digits fit into dig *)
        token.integer := 0; i := 0; token.hugeint := 0;
        IF ch = "X" THEN (* character *)
          Note( ch ); GetNextCharacter; result := Character;
          IF (n <= 2) THEN
            WHILE i < n DO token.integer := token.integer * 10H + Hexadecimal( dig[i] ); INC( i ) END;
            token.character := CHR(token.integer);
          ELSE Error( Basic.NumberTooLarge )
          END
        ELSIF ch = "H" THEN (* hexadecimal *)
          Note( ch ); GetNextCharacter;
          IF (n < MaxHexDigits) OR (n=MaxHexDigits) & (dig[0] <= "7") THEN (* otherwise the positive (!) number is not in the range of longints *)
            token.numberType := Integer;
            (* IF (n = MaxHexDigits) & (dig[0] > "7") THEN (* prevent overflow *) token.integer := -1 END; *)
            WHILE i < n DO token.integer := token.integer * 10H + Hexadecimal( dig[i] ); INC( i ) END;
            token.hugeint := token.integer;
          ELSIF n <= MaxHugeHexDigits THEN
            token.numberType := Hugeint;
            IF (n = MaxHugeHexDigits) & (dig[0] > "7") THEN (* prevent overflow *) token.hugeint := -1 END;
            WHILE i < n DO token.hugeint := Hexadecimal( dig[i] ) + token.hugeint * 10H; INC( i ) END;
            token.integer :=SHORT(token.hugeint);
          ELSE
            token.numberType := Hugeint; (* to make parser able to go on *)
            Error( Basic.NumberTooLarge )
          END
        ELSE (* decimal *)
          token.numberType := Integer;
          WHILE (i < n) & ~long DO
            d := Decimal( dig[i] ); INC( i );
            IF token.integer >= MAX(LONGINT) DIV 10 THEN (* multiplication overflow *)long := TRUE END;
            nextInt := token.integer*10+d;
            IF nextInt >=0 THEN token.integer := nextInt ELSE (* overflow *) long := TRUE END;
          END;
          IF long THEN
            i := 0; (* restart computation , artificial limit because of compiler problems with hugeint *)
            hugeint := 0;
            tenh := 10; (* compiler does not like constants here ! *)
            token.numberType := Hugeint;
            WHILE i < n DO
              d := Decimal( dig[i] ); INC( i );
              IF hugeint > MAX(HUGEINT) DIV 10 THEN Error( Basic.NumberTooLarge) END;
              hugeint := hugeint * tenh + d;
              IF hugeint < 0 THEN Error( Basic.NumberTooLarge ) END
            END;
            token.hugeint := hugeint;
            token.integer := SHORT(token.hugeint);
          ELSE
            token.hugeint := token.integer;
          END
        END
      ELSE (* more significant digits than dig can hold *)
        token.numberType := Hugeint;
        Error( Basic.NumberTooLarge )
      END
    ELSE (* fraction *)
      f := 0; e := 0; expCh := "E";
      WHILE n > 0 DO (* 0 <= f < 1 *) DEC( n ); f := (Decimal( dig[n] ) + f) / 10 END;
      IF (ch = "E") OR (ch = "D") THEN
        expCh := ch; Note( ch ); GetNextCharacter; neg := FALSE;
        IF ch = "-" THEN neg := TRUE; Note( ch ); GetNextCharacter
        ELSIF ch = "+" THEN Note( ch ); GetNextCharacter
        END;
        IF ("0" <= ch) & (ch <= "9") THEN
          REPEAT
            n := Decimal( ch ); Note( ch ); GetNextCharacter;
            IF e <= (MAX( INTEGER ) - n) DIV 10 THEN e := e * 10 + n ELSE Error( Basic.NumberTooLarge ) END
          UNTIL (ch < "0") OR ("9" < ch);
          IF neg THEN e := -e END
        ELSE Error( Basic.NumberIllegalCharacter )
        END
      END;
      DEC( e, i - d - m ); (* decimal point shift *)
      IF expCh = "E" THEN
        token.numberType := Real;
        IF (1 - MaxRealExponent < e) & (e <= MaxRealExponent) THEN
          IF e < 0 THEN token.real := f / Ten( -e ) ELSE token.real := f * Ten( e ) END
        ELSE Error( Basic.NumberTooLarge )
        END
      ELSE
        token.numberType := Longreal;
        IF (1 - MaxLongrealExponent < e) & (e <= MaxLongrealExponent) THEN
          IF e < 0 THEN token.real := f / Ten( -e ) ELSE token.real := f * Ten( e ) END
        ELSE Error( Basic.NumberTooLarge )
        END
      END
    END;
    token.identifierString[si] := 0X; (* si <= MaxIdentifierLength is ensured by Note *)
    RETURN result;
  END GetNumber;
  (** read / skip a comment; the opening "(" "*" has been consumed by the caller. Comments may be nested.
    token.symbol := Comment and token.string := text of the comment without the closing "*" ")".
    token.string is a copy of its own and thus remains valid when the scanner proceeds.
    An unclosed comment is reported as error, its text is kept completely **)
  PROCEDURE ReadComment(VAR token: Token);
  VAR level: LONGINT;
  BEGIN
    stringMaker.Clear;
    level := 1; (* nesting depth *)
    WHILE (level > 0) & (ch # EOT) DO
      IF ch = "(" THEN
        stringWriter.Char(ch);
        GetNextCharacter;
        IF ch = "*" THEN INC(level); stringWriter.Char(ch); GetNextCharacter; END;
      ELSIF ch = "*" THEN
        stringWriter.Char(ch);
        GetNextCharacter;
        IF ch =")" THEN DEC(level); stringWriter.Char(ch); GetNextCharacter; END;
      ELSE
        stringWriter.Char(ch);
        GetNextCharacter;
      END;
    END;
    stringWriter.Char(0X);
    stringWriter.Update;
    IF level > 0 THEN
      (* end of text within the comment: there is no comment closing that could be removed *)
      Error(Basic.CommentNotClosed)
    ELSE
      stringMaker.Shorten(2); (* remove comment closing *)
    END;
    token.symbol := Comment;
    (* copy: the buffer of stringMaker is cleared and reused for the next token *)
    token.string := stringMaker.GetStringCopy(token.stringLength);
  END ReadComment;
  (** skip over the body of a code section without tokenizing it: all text is collected in token.string until
    the keyword END or WITH (in the case of the scanned source), a '#' that is the first non-blank character
    of a line, or the end of the text is reached.
    startPos := end position of the token passed in, endPos := start position of the terminating item.
    Returns (also in token.symbol) End, With, Unequal or EndOfText.
    The keyword case must be known (case # Unknown) **)
  PROCEDURE SkipToEndOfCode*(VAR startPos,endPos: LONGINT; VAR token: Token): Symbol;
  VAR s: LONGINT; newline: BOOLEAN;
  BEGIN
    ASSERT(case # Unknown);
    stringMaker.Clear;
    startPos := token.position.end;
    s := Code; newline := FALSE; (* newline: only blanks have been read since the last line break *)
    WHILE (s # EndOfText) & (s # End) & (s # With) & (s # Unequal) DO
      token.position := position;
      endPos := position.start;
      IF (ch >= 'A') & (ch <= 'Z') OR (ch >= 'a') & (ch <= 'z') THEN
        newline := FALSE;
        GetIdentifier(token);
        IF (case=Uppercase) & (token.identifierString = "END") OR (case=Lowercase) & (token.identifierString = "end") THEN
          s := End
        ELSIF (case = Uppercase) & (token.identifierString = "WITH") OR (case = Lowercase) & (token.identifierString = "with") THEN
          s := With
        ELSE
          stringWriter.String(token.identifierString);
        END;
      ELSIF (ch = '#') & newline THEN
        s := Unequal;
        GetNextCharacter;
      ELSIF ch = EOT THEN
        (* unterminated code section: stop here, otherwise EOT would be appended forever *)
        s := EndOfText
      ELSE
        IF ch > ' ' THEN newline := FALSE;
        ELSIF (ch = CR) OR (ch = LF) THEN newline := TRUE;
        END;
        stringWriter.Char(ch);
        GetNextCharacter;
      END;
      token.position.end := position.start;
    END;
    stringWriter.Update;
    token.string := stringMaker.GetStringCopy(token.stringLength);
    token.symbol := s;
    IF Trace THEN
      D.String("skip to end: "); D.Int(startPos,1); D.String(","); D.Int(endPos,1); D.Ln;
      PrintToken(D.Log,token); D.Ln;
    END;
    RETURN s
  END SkipToEndOfCode;
  (** skip blanks and control characters; stops at the first character above " ", at ESC or at end of text **)
  PROCEDURE SkipBlanks;
  BEGIN
    LOOP
      IF (ch > " ") OR (ch = ESC) THEN EXIT END; (* something to scan *)
      IF ch = EOT THEN
        IF Trace THEN D.String("EOT"); D.Ln; END;
        EXIT
      END;
      GetNextCharacter (* ignore control characters *)
    END;
  END SkipBlanks;
  (** Read the next token from the input into token.
    Leading blanks are skipped, token.position is set to the span of the token and
    token.symbol to its symbol. Returns FALSE once the scanner is in the error state. **)
  PROCEDURE GetNextToken*(VAR token: Token ): BOOLEAN;
  VAR s, keyword: LONGINT;
  BEGIN
    SkipBlanks;
    token.position := position;
    stringMaker.Clear;
    CASE ch OF
    | EOT: s := EndOfText (* end of text is not consumed *)
    | ESC: GetNextCharacter; s := Escape
    | DoubleQuote:
      GetString(token, TRUE, TRUE, FALSE); s := String
    | SingleQuote:
      (* the original source notes that this is to become a Character token with a check stringLength = 2 *)
      GetString(token, FALSE, FALSE, FALSE); s := String
    | '#': GetNextCharacter; s := Unequal
    | '&': GetNextCharacter; s := And
    | '(': GetNextCharacter;
      IF ch # '*' THEN s := LeftParenthesis
      ELSE GetNextCharacter; ReadComment(token); s := Comment
      END
    | ')': GetNextCharacter; s := RightParenthesis
    | '*': GetNextCharacter;
      IF ch = '*' THEN GetNextCharacter; s := TimesTimes ELSE s := Times END
    | '+': GetNextCharacter;
      IF ch = '*' THEN GetNextCharacter; s := PlusTimes ELSE s := Plus END
    | ',': GetNextCharacter; s := Comma
    | '-': GetNextCharacter; s := Minus
    | '.': GetNextCharacter;
      (* the character after the period selects the dotted operator *)
      CASE ch OF
      | '.': GetNextCharacter; s := Upto
      | '*': GetNextCharacter; s := DotTimes
      | '/': GetNextCharacter; s := DotSlash
      | '=': GetNextCharacter; s := DotEqual
      | '#': GetNextCharacter; s := DotUnequal
      | '>': GetNextCharacter;
        IF ch = '=' THEN GetNextCharacter; s := DotGreaterEqual ELSE s := DotGreater END
      | '<': GetNextCharacter;
        IF ch = '=' THEN GetNextCharacter; s := DotLessEqual ELSE s := DotLess END
      ELSE s := Period
      END
    | '/': GetNextCharacter; s := Slash
    | '0'..'9': s := GetNumber(token)
    | ':': GetNextCharacter;
      IF ch = '=' THEN GetNextCharacter; s := Becomes ELSE s := Colon END
    | ';': GetNextCharacter; s := Semicolon
    | '<': GetNextCharacter;
      CASE ch OF
      | '=': GetNextCharacter; s := LessEqual
      | '<': GetNextCharacter;
        IF ch = '?' THEN GetNextCharacter; s := LessLessQ ELSE s := LessLess END
      ELSE s := Less
      END
    | '=': GetNextCharacter; s := Equal
    | '>': GetNextCharacter;
      CASE ch OF
      | '=': GetNextCharacter; s := GreaterEqual
      | '>': GetNextCharacter;
        IF ch = '?' THEN GetNextCharacter; s := GreaterGreaterQ ELSE s := GreaterGreater END
      ELSE s := Greater
      END
    | '[': GetNextCharacter; s := LeftBracket
    | ']': GetNextCharacter; s := RightBracket
    | '^': GetNextCharacter; s := Arrow
    | '{': GetNextCharacter; s := LeftBrace
    | '|': GetNextCharacter; s := Bar
    | '}': GetNextCharacter; s := RightBrace
    | '~': GetNextCharacter; s := Not
    | '\': GetNextCharacter;
      (* an escaped string starts with a quote either directly or after one visible character *)
      IF (ch = DoubleQuote) OR (ch > " ") & (reader.Peek() = DoubleQuote) THEN
        s := String;
        GetEscapedString(token);
      ELSE
        s := Backslash
      END
    | '`': GetNextCharacter; s := Transpose
    | '?': GetNextCharacter;
      IF ch = '?' THEN GetNextCharacter; s := Questionmarks ELSE s := Questionmark END
    | '!': GetNextCharacter;
      IF ch = '!' THEN GetNextCharacter; s := ExclamationMarks ELSE s := ExclamationMark END
    | Ellipsis: GetNextCharacter; s := Upto
    | 'A'..'Z':
      GetIdentifier(token); s := Identifier;
      IF (case = Unknown) OR (case = Uppercase) THEN
        keyword := keywordsUpper.IndexByIdentifier(token.identifier);
        IF keyword >= 0 THEN s := keyword END;
        (* the spelling of the MODULE / CELLNET keyword fixes the keyword case *)
        IF (s = Module) OR (s = CellNet) THEN case := Uppercase END;
      END
    | 'a'..'z':
      GetIdentifier(token); s := Identifier;
      IF (case = Unknown) OR (case = Lowercase) THEN
        keyword := keywordsLower.IndexByIdentifier(token.identifier);
        IF keyword >= 0 THEN s := keyword END;
        IF (s = Module) OR (s = CellNet) THEN case := Lowercase END;
      END;
      (* a first identifier that does not open a module selects uppercase keywords and stays an identifier *)
      IF firstIdentifier & (case = Unknown) & (s # Module) & (s # CellNet) THEN
        case := Uppercase; s := Identifier
      END
    ELSE
      GetIdentifier(token); s := Identifier
    END;
    firstIdentifier := FALSE;
    token.symbol := s;
    token.position.end := position.start;
    IF Trace THEN PrintToken(D.Log,token); D.Ln; END;
    RETURN ~error
  END GetNextToken;
  (** Clear the error flag of the scanner. **)
  PROCEDURE ResetError*();
  BEGIN
    error := FALSE;
  END ResetError;
  (** Exchange the diagnostics object of the scanner and reset its error state.
    The scanner takes over the diagnostics passed in (NIL ==> no report); the parameter
    receives the diagnostics that were installed before.
    Intended for silent token peeking after the end of a module. **)
  PROCEDURE ResetErrorDiagnostics*(VAR diagnostics: Diagnostics.Diagnostics);
  VAR previous: Diagnostics.Diagnostics;
  BEGIN
    previous := SELF.diagnostics;
    SELF.diagnostics := diagnostics;
    diagnostics := previous;
    error := FALSE;
  END ResetErrorDiagnostics;
  924. END Scanner;
  925. Context*=RECORD (* snapshot of scanner state; filled by AssemblerScanner.GetContext and re-applied by AssemblerScanner.SetContext *)
  926. position: Position; (* scanner position at the time of the snapshot *)
  927. readerPosition : LONGINT; (* value of reader.Pos() at the time of the snapshot *)
  928. ch: CHAR; (* current scanner character at the time of the snapshot *)
  929. END;
  930. (** assembler scanner reflects the following EBNF
  931. Token = String | Symbol | Number | Identifier.
  932. Symbol = '\' | '#' | '(' ['*' any '*' ')'] | ')' | CR [LF] | LF | '*' | '+' | ',' | '-' | '~' | '.' | '/' | '%' | ':' | ';' | '=' | '[' | ']' | '{' | '}' | '!' | '^' | '$'['$'].
  933. String = '"' {Character} '"' | "'" {Character} "'".
  934. Identifier = '@' | Letter {'@' | '.' | Letter | Digit | '_'} .
  935. Letter = 'A' | 'B' | .. | 'Z' | 'a' | 'b' | .. | 'z' .
  936. Digit = '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'.
  937. BinaryDigit = '0' | '1' .
  938. Number = Integer | Real.
  939. Character = Digit [HexDigit] 'X'.
  940. Integer = Digit {Digit} | Digit {HexDigit} 'H' | '0x' {HexDigit} | '0b' {BinaryDigit}.
  941. Real = Digit {Digit} '.' {Digit} [ScaleFactor].
  942. ScaleFactor = ('E' | 'D') ['+' | '-'] digit {digit}.
  943. HexDigit = Digit | 'A' | 'B' | 'C' | 'D' | 'E' | 'F'.
  944. **)
  AssemblerScanner* = OBJECT (Scanner) (*! move to different module? unify with compiler scanner? *)
  VAR
    startContext-: Context; (* scanner state captured at the end of initialization *)

    (** Initialize the underlying scanner and remember the initial state in startContext. **)
    PROCEDURE &InitAssemblerScanner*( CONST source: ARRAY OF CHAR; reader: Streams.Reader; position: Position; diagnostics: Diagnostics.Diagnostics );
    BEGIN
      InitializeScanner(source,reader,position,diagnostics);
      GetContext(startContext);
    END InitAssemblerScanner;

    (** Store the current character, scanner position and reader position in context. **)
    PROCEDURE GetContext*(VAR context: Context);
    BEGIN
      context.readerPosition := reader.Pos();
      context.position := position;
      context.ch := ch;
    END GetContext;

    (** Re-apply a state stored by GetContext: reposition the reader and restore character and position. **)
    PROCEDURE SetContext*(CONST context: Context);
    BEGIN
      reader.SetPos(context.readerPosition);
      position := context.position;
      ch := context.ch;
    END SetContext;

    (** Advance until the current character is CR, LF or EOT; that character is not consumed. **)
    PROCEDURE SkipToEndOfLine*;
    BEGIN
      WHILE ~((ch = EOT) OR (ch = CR) OR (ch = LF)) DO
        GetNextCharacter
      END;
    END SkipToEndOfLine;

    (**
      note: in contrast to a regular identifier, an assembler scanner identifier may also contain periods and the '@'-token
      Identifier = '@' | Letter {'@' | '.' | Letter | Digit | '_'} .
      Letter = 'A' | 'B' | .. | 'Z' | 'a' | 'b' | .. | 'z' .
      Digit = '0' | '1' | '2' | '3' | '4' | '5' | '6' | '7' | '8' | '9'.
      '_' is the underscore character

      The characters are stored in token.identifierString; reaching MaxIdentifierLength
      characters is reported as Basic.IdentifierTooLong and the last character is dropped.
    **)
    PROCEDURE GetIdentifier( VAR token: Token );
    VAR
      length: LONGINT;

      (* TRUE for every character that may continue an assembler identifier *)
      PROCEDURE CharacterIsAllowed(character: CHAR): BOOLEAN;
      BEGIN
        RETURN ('a' <= character) & (character <= 'z')
          OR ('A' <= character) & (character <= 'Z')
          OR ('0' <= character) & (character <= '9')
          OR (character = '@') OR (character = '.') OR (character = '_')
      END CharacterIsAllowed;

    BEGIN
      length := 0;
      REPEAT
        (* the first character is taken unchecked: the caller has already classified it *)
        token.identifierString[length] := ch;
        INC( length );
        GetNextCharacter
      UNTIL (length = MaxIdentifierLength) OR ~CharacterIsAllowed(ch);
      IF length = MaxIdentifierLength THEN
        Error( Basic.IdentifierTooLong );
        DEC( length )
      END;
      token.identifierString[length] := 0X;
    END GetIdentifier;

    (** Read the next assembler token into token.
      Comments, both the bracketed form and ';' up to the end of the line, are skipped.
      Line ends are reported as Ln tokens. Returns FALSE once the scanner is in the error state. **)
    PROCEDURE GetNextToken*(VAR token: Token ): BOOLEAN;
    VAR s: LONGINT;

      (* skip blanks and control characters, but stop at line ends and at the end of the text *)
      PROCEDURE SkipBlanks;
      BEGIN
        WHILE (ch <= ' ') & ~((ch = CR) OR (ch = LF) OR (ch = EOT)) DO
          GetNextCharacter
        END;
      END SkipBlanks;

    BEGIN
      REPEAT
        SkipBlanks;
        token.position := position;
        CASE ch OF
        | EOT: s := EndOfText
        | DoubleQuote:
          GetString(token, TRUE, FALSE, TRUE); s := String
        | SingleQuote:
          s := Character;
          GetString(token, FALSE, FALSE, FALSE);
          token.character := token.string[0];
          (* stringLength is 1 for the empty string '', so exactly one character means 2 *)
          IF token.stringLength # 2 THEN
            Error(Basic.IllegalCharacterValue)
          END
        | '\': GetNextCharacter;
          IF ch = DoubleQuote THEN
            GetString(token, FALSE, FALSE, TRUE); s := String
          ELSE
            s := Backslash
          END
        | '#': GetNextCharacter; s := Unequal (* for the ARM assembler *)
        | '(': GetNextCharacter;
          IF ch # '*' THEN s := LeftParenthesis
          ELSE GetNextCharacter; ReadComment(token); s := Comment
          END
        | ')': GetNextCharacter; s := RightParenthesis
        | CR: GetNextCharacter; IF ch = LF THEN GetNextCharacter END; s := Ln
        | LF: GetNextCharacter; IF ch = CR THEN GetNextCharacter END; s := Ln
        | '*': GetNextCharacter; s := Times
        | '+': GetNextCharacter; s := Plus
        | ',': GetNextCharacter; s := Comma
        | '-': GetNextCharacter; s := Minus
        | '~': GetNextCharacter; s := Not
        | '.': GetNextCharacter; s := Period
        | '/': GetNextCharacter; s := Div
        | '%': GetNextCharacter; s := Mod
        | '0'..'9': s := GetNumber(token)
        | ':': GetNextCharacter; s := Colon
        | ';': SkipToEndOfLine; s := Comment (* line comment *)
        | '=': GetNextCharacter; s := Equal
        | '[': GetNextCharacter; s := LeftBracket
        | ']': GetNextCharacter; s := RightBracket
        | '{': GetNextCharacter; s := LeftBrace
        | '}': GetNextCharacter; s := RightBrace
        | '!': GetNextCharacter; s := ExclamationMark
        | '^': GetNextCharacter; s := Arrow
        | 'A'..'Z', 'a'..'z': GetIdentifier( token ); s := Identifier
        | '@': GetIdentifier( token ); s := Identifier (* the '@'-token initiates an assembly scanner identifier *)
        | '$': GetNextCharacter;
          IF ch = '$' THEN GetNextCharacter; s := PCOffset ELSE s := PC END
        ELSE
          GetNextCharacter; s := None
        END;
        token.position.end := position.start;
      UNTIL s # Comment;
      token.symbol := s;
      IF Trace THEN D.Ln; D.Str( "Scan at " ); D.Int( token.position.start,1 ); D.Str( ": " ); PrintToken(D.Log,token); D.Update; END;
      RETURN ~error
    END GetNextToken;

  END AssemblerScanner;
  1058. VAR
  1059. reservedCharacter: ARRAY 256 OF BOOLEAN; (* indexed by character code; TRUE for characters that may not occur within an identifier, filled by InitReservedCharacters *)
  1060. symbols-: ARRAY EndOfText+1 OF Keyword; (* printable name of each symbol, filled by InitSymbols *)
  1061. keywordsLower, keywordsUpper: KeywordTable; (* lowercase and uppercase keyword spellings per symbol, filled by InitKeywords *)
  (** Create a scanner reading from reader; errors are reported via diagnostics.
    The scanner position starts at the given offset, in line 1 at line position 0. **)
  PROCEDURE NewScanner*( CONST source: ARRAY OF CHAR; reader: Streams.Reader; position: LONGINT; diagnostics: Diagnostics.Diagnostics ): Scanner;
  VAR scanner: Scanner; origin: Position;
  BEGIN
    origin.line := 1;
    origin.linepos := 0;
    origin.start := position;
    origin.end := position;
    NEW( scanner, source, reader, origin, diagnostics );
    RETURN scanner;
  END NewScanner;
  (** Create an assembler scanner reading from reader; errors are reported via diagnostics.
    The scanner position starts at the given offset, in line 1 at line position 0. **)
  PROCEDURE NewAssemblerScanner*( CONST source: ARRAY OF CHAR; reader: Streams.Reader; position: LONGINT; diagnostics: Diagnostics.Diagnostics ): AssemblerScanner;
  VAR scanner: AssemblerScanner; origin: Position;
  BEGIN
    origin.line := 1;
    origin.linepos := 0;
    origin.start := position;
    origin.end := position;
    NEW( scanner, source, reader, origin, diagnostics );
    RETURN scanner;
  END NewAssemblerScanner;
  (** Write the text of a token to str.
    Identifiers and numbers yield their identifier string, strings and comments their content
    (str must be at least as long as the content); every other symbol yields the keyword
    registered for it in the given case, or the empty string when there is none. **)
  PROCEDURE TokenToString*(CONST token: Token; case: LONGINT; VAR str: ARRAY OF CHAR);
  VAR keyword: StringPool.Index;
  BEGIN
    IF (token.symbol = Identifier) OR (token.symbol = Number) THEN
      COPY(token.identifierString, str)
    ELSIF (token.symbol = String) OR (token.symbol = Comment) THEN
      ASSERT(LEN(str) >= LEN(token.string^));
      COPY(token.string^, str);
    ELSE
      GetKeyword(case, token.symbol, keyword);
      IF keyword >= 0 THEN StringPool.GetString(keyword, str) ELSE str := "" END;
    END;
  END TokenToString;
  (** Debugging output: write position range, symbol name and content of a token to w. **)
  PROCEDURE PrintToken*(w: Streams.Writer; CONST token: Token);
  VAR text: ARRAY 256 OF CHAR;
  BEGIN
    w.Int(token.position.start,1); w.String("-"); w.Int(token.position.end,1); w.String(":");
    w.String(symbols[token.symbol]);
    IF token.symbol = Number THEN
      (* no ELSE branch: only these four number types are expected here *)
      CASE token.numberType OF
      | Integer: w.String("(integer)")
      | Hugeint: w.String("(hugeint)")
      | Real: w.String("(real)")
      | Longreal: w.String("(longreal)")
      END;
    END;
    CASE token.symbol OF
    | String:
      w.String(":"); w.Char('"'); w.String(token.string^); w.Char('"');
    | Comment:
      w.String("(*"); w.String(token.string^); w.String("*)");
    ELSE
      TokenToString(token, Uppercase, text);
      w.String(": "); w.String(text);
    END
  END PrintToken;
  (** Reserved characters are the characters that may not occur within an identifier:
    everything except letters, digits and the underscore. **)
  PROCEDURE InitReservedCharacters;
  VAR code: LONGINT; c: CHAR;
  BEGIN
    FOR code := 0 TO LEN( reservedCharacter ) - 1 DO
      c := CHR(code);
      reservedCharacter[code] :=
        ~((c >= 'a') & (c <= 'z') OR (c >= 'A') & (c <= 'Z') OR (c >= '0') & (c <= '9') OR (c = '_'));
    END;
  END InitReservedCharacters;
  (* Look up the keyword identifier of a symbol in the table of the given case.
    case must be Uppercase or Lowercase. *)
  PROCEDURE GetKeyword*(case:LONGINT; symbol: LONGINT; VAR identifier: IdentifierType);
  BEGIN
    IF case # Uppercase THEN
      ASSERT(case=Lowercase);
      keywordsLower.IdentifierByIndex(symbol,identifier);
    ELSE
      keywordsUpper.IdentifierByIndex(symbol,identifier);
    END;
  END GetKeyword;
  (* Fill the table of printable symbol names and check that every symbol
    from 0 to EndOfText has received a non-empty name. *)
  PROCEDURE InitSymbols;
  VAR symbol: LONGINT;

    (* store the printable name of one symbol *)
    PROCEDURE Name(which: LONGINT; CONST text: ARRAY OF CHAR);
    BEGIN
      COPY(text, symbols[which]);
    END Name;

  BEGIN
    Name(None, "None");
    (* relations *)
    Name(Equal, "Equal"); Name(DotEqual, "DotEqual");
    Name(Unequal, "Unequal"); Name(DotUnequal, "DotUnequal");
    Name(Less, "Less"); Name(DotLess, "DotLess");
    Name(LessEqual, "LessEqual"); Name(DotLessEqual, "DotLessEqual");
    Name(Greater, "Greater"); Name(DotGreater, "DotGreater");
    Name(GreaterEqual, "GreaterEqual"); Name(DotGreaterEqual, "DotGreaterEqual");
    Name(LessLessQ, "LessLessQ"); Name(GreaterGreaterQ, "GreaterGreaterQ");
    Name(In, "In"); Name(Is, "Is");
    (* multiplication operators *)
    Name(Times, "Times"); Name(TimesTimes, "TimesTimes");
    Name(DotTimes, "DotTimes"); Name(PlusTimes, "PlusTimes");
    Name(Slash, "Slash"); Name(Backslash, "Backslash"); Name(DotSlash, "DotSlash");
    Name(Div, "Div"); Name(Mod, "Mod"); Name(And, "And");
    (* addition operators and negation *)
    Name(Or, "Or"); Name(Plus, "Plus"); Name(Minus, "Minus");
    Name(Not, "Not");
    (* expression starts *)
    Name(LeftParenthesis, "LeftParenthesis"); Name(LeftBracket, "LeftBracket"); Name(LeftBrace, "LeftBrace");
    Name(Number, "Number"); Name(Character, "Character"); Name(String, "String");
    Name(Nil, "Nil"); Name(Imag, "Imag"); Name(True, "True"); Name(False, "False");
    Name(Self, "Self"); Name(New, "New"); Name(Result, "Result");
    Name(Identifier, "Identifier");
    (* statement starts *)
    Name(If, "If"); Name(Case, "Case"); Name(While, "While"); Name(Repeat, "Repeat");
    Name(For, "For"); Name(Loop, "Loop"); Name(With, "With"); Name(Exit, "Exit");
    Name(Await, "Await"); Name(Return, "Return"); Name(Ignore, "Ignore"); Name(Begin, "Begin");
    (* separators and closing symbols *)
    Name(Semicolon, "Semicolon"); Name(Transpose, "Transpose");
    Name(RightBrace, "RightBrace"); Name(RightBracket, "RightBracket"); Name(RightParenthesis, "RightParenthesis");
    Name(Questionmark, "Questionmark"); Name(ExclamationMark, "ExclamationMark");
    Name(Questionmarks, "Questionmarks"); Name(ExclamationMarks, "ExclamationMarks");
    Name(LessLess, "LessLess"); Name(GreaterGreater, "GreaterGreater");
    Name(Upto, "Upto"); Name(Arrow, "Arrow"); Name(Period, "Period");
    Name(Comma, "Comma"); Name(Colon, "Colon");
    Name(Of, "Of"); Name(Then, "Then"); Name(Do, "Do"); Name(To, "To"); Name(By, "By");
    Name(Becomes, "Becomes"); Name(Bar, "Bar");
    Name(End, "End"); Name(Else, "Else"); Name(Elsif, "Elsif"); Name(Extern, "Extern");
    Name(Until, "Until"); Name(Finally, "Finally");
    (* declarations *)
    Name(Code, "Code"); Name(Const, "Const"); Name(Type, "Type"); Name(Var, "Var"); Name(Out, "Out");
    Name(Procedure, "Procedure"); Name(Operator, "Operator"); Name(Import, "Import");
    Name(Definition, "Definition"); Name(Module, "Module"); Name(Cell, "Cell"); Name(CellNet, "CellNet");
    (* type constructors *)
    Name(Array, "Array"); Name(Object, "Object"); Name(Record, "Record"); Name(Pointer, "Pointer");
    Name(Enum, "Enum"); Name(Port, "Port"); Name(Address, "Address"); Name(Alias, "Alias"); Name(Size, "Size");
    (* assembler symbols *)
    Name(Ln, "Ln"); Name(PC, "PC"); Name(PCOffset, "PCOffset");
    (* number types *)
    Name(Shortint, "Shortint"); Name(Integer, "Integer"); Name(Longint, "Longint"); Name(Hugeint, "Hugeint");
    Name(Real, "Real"); Name(Longreal, "Longreal");
    Name(Comment, "Comment");
    Name(EndOfText, "EndOfText");
    (* every symbol must have a name *)
    FOR symbol := 0 TO EndOfText DO ASSERT(symbols[symbol] # "") END;
  END InitSymbols;
  (** Create the keyword tables and enter all keywords (both upper- and lowercase) and
    operator tokens; also registers the corresponding error texts with Basic. **)
  PROCEDURE InitKeywords;

    (* copy source to dest, mapping 'a'..'z' to 'A'..'Z'; the terminating 0X is copied as well *)
    PROCEDURE Upper(CONST source: ARRAY OF CHAR; VAR dest: ARRAY OF CHAR);
    CONST Shift = ORD('a') - ORD('A');
    VAR index: LONGINT;
    BEGIN
      index := 0;
      WHILE source[index] # 0X DO
        IF ('a' <= source[index]) & (source[index] <= 'z') THEN
          dest[index] := CHR(ORD(source[index]) - Shift)
        ELSE
          dest[index] := source[index]
        END;
        INC(index)
      END;
      dest[index] := 0X;
    END Upper;

    (* put name into the tables selected by case and register it as the expected text for symbol *)
    PROCEDURE Register(CONST name: ARRAY OF CHAR; symbol: LONGINT; case: SET);
    BEGIN
      IF Lowercase IN case THEN keywordsLower.PutString(name,symbol) END;
      IF Uppercase IN case THEN keywordsUpper.PutString(name,symbol) END;
      Basic.SetErrorExpected(symbol,name);
    END Register;

    (* enter a keyword: the given spelling in the lowercase table, its capitalized form in the uppercase table *)
    PROCEDURE Enter(CONST name: ARRAY OF CHAR; symbol: LONGINT);
    VAR capitalized: Keyword;
    BEGIN
      Register(name,symbol,{Lowercase});
      Upper(name,capitalized);
      Register(capitalized,symbol,{Uppercase});
    END Enter;

    (* enter a token that is spelled identically in both tables *)
    PROCEDURE EnterSymbol(CONST name: ARRAY OF CHAR; symbol: LONGINT);
    BEGIN
      Register(name,symbol,{Lowercase,Uppercase});
    END EnterSymbol;

  BEGIN
    NEW(keywordsUpper,EndOfText+1);
    NEW(keywordsLower,EndOfText+1);

    (* constructs and statements *)
    Enter("cell", Cell);
    Enter("cellnet", CellNet);
    Enter("await", Await);
    Enter("begin", Begin);
    Enter("by", By);
    Enter("const", Const);
    Enter("case", Case);
    Enter("code", Code);
    Enter("definition", Definition);
    Enter("do", Do);
    Enter("div", Div);
    Enter("end", End);
    Enter("enum", Enum);
    Enter("else", Else);
    Enter("elsif", Elsif);
    Enter("exit", Exit);
    Enter("extern", Extern);
    Enter("false", False);
    Enter("for", For);
    Enter("finally", Finally);
    Enter("if", If);
    Enter("ignore", Ignore);
    Enter("imag", Imag);
    Enter("in", In);
    Enter("is", Is);
    Enter("import", Import);
    Enter("loop", Loop);
    Enter("module", Module);
    Enter("mod", Mod);
    Enter("nil", Nil);
    Enter("of", Of);
    Enter("or", Or);
    Enter("out", Out);
    Enter("operator", Operator);
    Enter("procedure", Procedure);
    Enter("port", Port);
    Enter("repeat", Repeat);
    Enter("return", Return);
    Enter("self", Self);
    Enter("new", New);
    Enter("result", Result);
    Enter("then", Then);
    Enter("true", True);
    Enter("to", To);
    Enter("type", Type);
    Enter("until", Until);
    Enter("var", Var);
    Enter("while", While);
    Enter("with", With);

    (* types *)
    Enter("array", Array);
    Enter("object", Object);
    Enter("pointer", Pointer);
    Enter("record", Record);
    Enter("address", Address);
    Enter("size", Size);
    Enter("alias", Alias);

    (* tokens *)
    EnterSymbol("#", Unequal);
    EnterSymbol("&", And);
    EnterSymbol("(", LeftParenthesis);
    EnterSymbol(")", RightParenthesis);
    EnterSymbol("*", Times);
    EnterSymbol("**", TimesTimes);
    EnterSymbol("+", Plus);
    EnterSymbol("+*", PlusTimes);
    EnterSymbol(",", Comma);
    EnterSymbol("-", Minus);
    EnterSymbol(".", Period);
    EnterSymbol("..", Upto);
    EnterSymbol(".*", DotTimes);
    EnterSymbol("./", DotSlash);
    EnterSymbol(".=", DotEqual);
    EnterSymbol(".#", DotUnequal);
    EnterSymbol(".>", DotGreater);
    EnterSymbol(".>=", DotGreaterEqual);
    EnterSymbol(".<", DotLess);
    EnterSymbol(".<=", DotLessEqual);
    EnterSymbol("/", Slash);
    EnterSymbol(":", Colon);
    EnterSymbol(":=", Becomes);
    EnterSymbol(";", Semicolon);
    EnterSymbol("<", Less);
    EnterSymbol("<=", LessEqual);
    EnterSymbol("=", Equal);
    EnterSymbol(">", Greater);
    EnterSymbol(">=", GreaterEqual);
    EnterSymbol("[", LeftBracket);
    EnterSymbol("]", RightBracket);
    EnterSymbol("^", Arrow);
    EnterSymbol("{", LeftBrace);
    EnterSymbol("|", Bar);
    EnterSymbol("}", RightBrace);
    EnterSymbol("~", Not);
    EnterSymbol("\", Backslash);
    EnterSymbol("`", Transpose);
    EnterSymbol("?", Questionmark);
    EnterSymbol("??", Questionmarks);
    EnterSymbol("!", ExclamationMark);
    EnterSymbol("!!", ExclamationMarks);
    EnterSymbol("<<", LessLess);
    EnterSymbol("<<?", LessLessQ);
    EnterSymbol(">>", GreaterGreater);
    EnterSymbol(">>?", GreaterGreaterQ);

    (* error texts for symbols without a fixed spelling *)
    Basic.SetErrorMessage(Number,"missing number");
    Basic.SetErrorMessage(String,"missing string");
    Basic.SetErrorMessage(Character,"missing character");
    Basic.SetErrorMessage(Identifier,"missing identifier");
    Basic.SetErrorMessage(EndOfText,"unexpected token before end");
  END InitKeywords;
  (** Debugging / reporting: for every symbol from 0 to EndOfText write one line with the
    symbol number and its quoted lowercase and uppercase keyword spellings to context.out. **)
  PROCEDURE ReportKeywords*(context: Commands.Context);
  VAR symbol: LONGINT; spelling: Keyword;

    (* write text enclosed in double quotes *)
    PROCEDURE Quoted(CONST text: ARRAY OF CHAR);
    BEGIN
      context.out.Char('"');
      context.out.String(text);
      context.out.Char('"');
    END Quoted;

  BEGIN
    FOR symbol := 0 TO EndOfText DO
      context.out.Int(symbol,1); context.out.String(": ");
      keywordsLower.StringByIndex(symbol,spelling);
      Quoted(spelling);
      context.out.String(", ");
      keywordsUpper.StringByIndex(symbol,spelling);
      Quoted(spelling);
      context.out.Ln;
    END;
  END ReportKeywords;
  1421. (*
  1422. PROCEDURE TestScanner*(context: Commands.Context);
  1423. VAR filename: ARRAY 256 OF CHAR; reader: Streams.Reader; scanner: Scanner;token: Token;
  1424. BEGIN
  1425. context.arg.SkipWhitespace; context.arg.String(filename);
  1426. reader := TextUtilities.GetTextReader(filename);
  1427. scanner := NewScanner(filename,reader,0,NIL);
  1428. REPEAT
  1429. IF scanner.GetNextToken(token) THEN
  1430. PrintToken(context.out,token);context.out.Ln;
  1431. END;
  1432. UNTIL scanner.error OR (token.symbol=EndOfText)
  1433. END TestScanner;
  1434. *)
  1435. BEGIN
  1436. InitReservedCharacters; InitSymbols; InitKeywords
  1437. END FoxScanner.
  1438. FoxScanner.ReportKeywords
  1439. FoxScanner.TestScanner Test.Mod ~