UnicodeProperties.Mod 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576
  1. MODULE UnicodeProperties; (** AUTHOR "gubsermi"; PURPOSE "Reading the Unicode.txt file and interpreting the properties"; *)
  2. IMPORT
  3. Texts, Codecs, Files, Streams, KernelLog, Strings;
  4. CONST
  5. NUL* = 00H; (* code 0; GetProperty treats it, like EOT, as "end of file reached" *)
  6. EOT* = 04H; (* end-of-text code; stops the scanning loops of the readers below *)
  7. LF* = 0AH; (* line feed; one of the two line terminators recognised by NextLine *)
  8. CR* = 0DH; (* carriage return; the other line terminator recognised by NextLine *)
  9. SP* = 20H; (* blank; terminates the target code in BidiMirroringTxtReader.GetTargetChar *)
  10. CacheDebugging = FALSE; (* when TRUE, CharacterPropertyCache traces lookups and inserts to KernelLog *)
  11. VAR
  12. error- : BOOLEAN; (* read-only export; reset and set by TxtReader.LoadTxtFile when a property file cannot be loaded *)
  13. TYPE
  (* Caches one result from the property files. An element carries either a string value or a
     character value, never both: string entries have cValue = -1, character entries have an
     empty sValue. *)
  CacheElement = OBJECT
  VAR
    next : CacheElement;          (* next element of the same hash bucket (see CharacterPropertyCache) *)
    key : Texts.Char32;           (* character code this entry belongs to *)
    sValue : ARRAY 256 OF CHAR;   (* string payload; empty for character entries *)
    cValue : Texts.Char32;        (* character payload; -1 for string entries *)

    (* Initializes a CacheElement with a key and a value.
       key    - character code the value is stored under
       sValue - string value; if it is non-empty the element becomes a string entry and cValue is ignored
       cValue - character value; only used when sValue is empty *)
    PROCEDURE &Init*(key : Texts.Char32; CONST sValue : ARRAY OF CHAR; cValue : Texts.Char32);
    BEGIN
      SELF.key := key;
      IF sValue[0] # 0X THEN
        (* COPY stops at the terminator and is bounded by the destination, so a source array
           larger than the 256 character field cannot overrun it. The former call passed the
           capacity of the source array as the number of characters to copy. *)
        COPY(sValue, SELF.sValue);
        SELF.cValue := -1;
      ELSE
        SELF.sValue[0] := 0X;
        SELF.cValue := cValue;
      END;
    END Init;
  END CacheElement;
  (* The property cache is a hash map with a fixed number of buckets that stores either string or
     character properties per character code. Collisions are resolved by chaining CacheElements.
     An empty string and the character value -1 are the "not found" results of the lookups and can
     therefore not be stored as values. *)
  CharacterPropertyCache = OBJECT
  VAR
    internalCache : POINTER TO ARRAY OF CacheElement;  (* bucket heads *)
    cacheSize : LONGINT;                               (* number of buckets, always >= 1 *)

    (* Initializes the hash map with a specific number of buckets. Sizes below 1 are raised to 1,
       because the bucket index is computed with MOD cacheSize. *)
    PROCEDURE &Init*(size : LONGINT);
    BEGIN
      IF size < 1 THEN size := 1 END;
      cacheSize := size;
      NEW(internalCache,cacheSize);
    END Init;

    (* Returns the first element stored under the given key, or NIL if there is none. *)
    PROCEDURE Find(char : Texts.Char32) : CacheElement;
    VAR
      element : CacheElement;
    BEGIN
      element := internalCache[char MOD cacheSize];
      WHILE (element # NIL) & (element.key # char) DO
        element := element.next;
      END;
      RETURN element;
    END Find;

    (* Puts an element at the first position of the bucket that belongs to its key. *)
    PROCEDURE Link(element : CacheElement);
    VAR
      bucket : LONGINT;
    BEGIN
      bucket := element.key MOD cacheSize;
      element.next := internalCache[bucket];
      internalCache[bucket] := element;
    END Link;

    (* Searches the cache for a specific key and returns the corresponding string entry.
       res is set to the empty string if nothing was found. *)
    PROCEDURE SLookup(char : Texts.Char32; VAR res : ARRAY OF CHAR);
    VAR
      element : CacheElement;
    BEGIN
      element := Find(char);
      IF element # NIL THEN
        IF CacheDebugging THEN
          KernelLog.String("found: "); KernelLog.Hex(element.key,4);
          KernelLog.String(" ("); KernelLog.String(element.sValue);
          KernelLog.String(")"); KernelLog.Ln;
        END;
        (* COPY truncates to the size of res and always terminates it; the former call asked
           for LEN(res) characters, one more than res can hold together with the terminator. *)
        COPY(element.sValue,res);
      ELSE
        (* clear the result if nothing was found. *)
        res[0] := 0X;
      END;
    END SLookup;

    (* Searches the cache for a specific key and returns the corresponding character entry.
       Returns the 'fault code' -1 if nothing was found. *)
    PROCEDURE CLookup(char : Texts.Char32) : Texts.Char32;
    VAR
      element : CacheElement;
    BEGIN
      element := Find(char);
      IF element = NIL THEN RETURN -1 END;
      IF CacheDebugging THEN
        KernelLog.String("found: "); KernelLog.Hex(element.key,4);
        KernelLog.String(" ("); KernelLog.Hex(element.cValue,4);
        KernelLog.String(")"); KernelLog.Ln;
      END;
      RETURN element.cValue;
    END CLookup;

    (* Inserts a new string entry for a given key. Empty values are ignored: SLookup could not
       tell them apart from a miss, so they would only lengthen the chain on every repeated miss. *)
    PROCEDURE SInsert(char : Texts.Char32; CONST value : ARRAY OF CHAR);
    VAR
      newElement : CacheElement;
    BEGIN
      IF value[0] = 0X THEN RETURN END;
      NEW(newElement,char,value,-1);
      Link(newElement);
      IF CacheDebugging THEN
        KernelLog.String("inserted: "); KernelLog.Hex(char,4);
        KernelLog.String(" (");
        KernelLog.String(value);
        KernelLog.String(")"); KernelLog.Ln;
      END;
    END SInsert;

    (* Inserts a new character entry for a given key. The value -1 is ignored because it is the
       'fault code' of CLookup and could never be retrieved as a hit. *)
    PROCEDURE CInsert(char : Texts.Char32; value : Texts.Char32);
    VAR
      newElement : CacheElement;
      noString : ARRAY 1 OF CHAR;
    BEGIN
      IF value = -1 THEN RETURN END;
      (* an empty string tells CacheElement.Init to create a character entry *)
      noString[0] := 0X;
      NEW(newElement,char,noString,value);
      Link(newElement);
      IF CacheDebugging THEN
        KernelLog.String("inserted: "); KernelLog.Hex(char,4);
        KernelLog.String(" ("); KernelLog.Hex(value,4);
        KernelLog.String(")"); KernelLog.Ln;
      END;
    END CInsert;

    (* Prints the whole cache to the kernel log, one line per bucket. String entries show their
       string, character entries show their character value in hexadecimal notation. *)
    PROCEDURE Print;
    VAR
      i : LONGINT;
      thisElement : CacheElement;
    BEGIN
      FOR i := 0 TO cacheSize - 1 DO
        thisElement := internalCache[i];
        KernelLog.Int(i,3); KernelLog.String(": ");
        WHILE thisElement # NIL DO
          KernelLog.Int(thisElement.key,4); KernelLog.String(" (");
          IF thisElement.cValue = -1 THEN
            KernelLog.String(thisElement.sValue);
          ELSE
            KernelLog.Hex(thisElement.cValue,4);
          END;
          KernelLog.String(") -> ");
          thisElement := thisElement.next;
        END;
        KernelLog.Ln;
      END;
    END Print;
  END CharacterPropertyCache;
  (* Base class for reading and analysing text files; only basic functionality is provided.
     Readers that handle a specific text layout inherit from this class and (re-)implement the
     necessary procedures, at least FindStartPos. *)
  TxtReader = OBJECT
  VAR
    filename : ARRAY 256 OF CHAR;    (* name of the file to load; set by the subclass before LoadTxtFile *)
    text : Texts.Text;               (* decoded file content; NIL if loading failed *)
    textReader : Texts.TextReader;   (* reader on text; NIL if loading failed *)
    startPos : LONGINT;              (* position of the first relevant line, set by FindStartPos *)
    decoder : Codecs.TextDecoder;
    msg : ARRAY 512 OF CHAR;         (* buffer for the error message *)
    fullname : ARRAY 256 OF CHAR;    (* file name as reported by the file system *)
    file : Files.File;
    in : Streams.Reader;
    decoderRes : WORD;               (* result of decoder.Open; 0 means success *)

    (* Writes msg to the kernel log and raises the module's error flag. *)
    PROCEDURE Fail;
    BEGIN
      KernelLog.String(msg); KernelLog.Ln;
      error := TRUE;
    END Fail;

    (* Loads the file into a local Text and creates an associated TextReader. On any failure a
       message is logged, error is set, text and textReader stay NIL and FindStartPos is not
       called, because the implementations of FindStartPos read from the text. *)
    PROCEDURE LoadTxtFile;
    BEGIN
      error := FALSE;
      text := NIL; textReader := NIL; startPos := 0;
      COPY(filename, fullname);
      (* Check whether file exists and get its canonical name *)
      file := Files.Old(filename);
      IF (file # NIL) THEN
        file.GetName(fullname);
      ELSE
        file := Files.New(filename); (* only to get the path for the error message *)
        IF (file # NIL) THEN
          file.GetName(fullname);
          file := NIL;
        END;
      END;
      IF (file = NIL) THEN
        msg := "file '"; Strings.Append(msg, fullname); Strings.Append(msg,"' not found.");
        Fail;
      ELSE
        decoder := Codecs.GetTextDecoder("ISO8859-1");
        IF (decoder = NIL) THEN
          msg := "No decoder for file "; Strings.Append(msg, fullname);
          Strings.Append(msg, " (Format: "); Strings.Append(msg, "ISO8859-1"); Strings.Append(msg, ")");
          Fail;
        ELSE
          in := Codecs.OpenInputStream(fullname);
          IF (in = NIL) THEN
            msg := "Can't open input stream on file "; Strings.Append(msg, fullname);
            Fail;
          ELSE
            decoder.Open(in, decoderRes);
            IF decoderRes = 0 THEN
              text := decoder.GetText();
            END;
            IF text # NIL THEN
              NEW(textReader,text);
            ELSE
              (* formerly a decoding failure went unnoticed and left text = NIL behind *)
              msg := "Can't decode file "; Strings.Append(msg, fullname);
              Fail;
            END;
          END;
        END;
      END;
      IF ~error & (textReader # NIL) THEN
        FindStartPos;
      END;
    END LoadTxtFile;

    (* Abstract procedure to be overwritten by the children of TxtReader *)
    PROCEDURE FindStartPos;
    BEGIN
      HALT (999);
    END FindStartPos;

    (* Skips the rest of the current line. Besides CR and LF the loop also stops on NUL and EOT,
       the codes GetProperty of UnicodeTxtReader treats as end of file, so that a last line
       without a line break cannot keep the loop running forever. *)
    PROCEDURE NextLine;
    VAR
      thisChar : Texts.Char32;
    BEGIN
      IF textReader = NIL THEN RETURN END;
      REPEAT
        textReader.ReadCh(thisChar);
      UNTIL (thisChar = LF) OR (thisChar = CR) OR (thisChar = NUL) OR (thisChar = EOT);
    END NextLine;
  END TxtReader;
  227. TYPE
  (* TxtReader to read the UnicodeData.txt file. So far there's direct support for the bidi character
     type and the 'mirrored' property. More explicit lookups can easily be added later on. *)
  UnicodeTxtReader*=OBJECT(TxtReader)
  VAR
    (* For each property that is explicitly needed, a cache is used. Whenever a new property is
       needed often, feel free to add another cache. *)
    charTypeCache, mirrorPropCache : CharacterPropertyCache;

    (* Loads the UnicodeData.txt into memory and creates the caches. *)
    PROCEDURE &Init*;
    BEGIN
      filename := "UnicodeData.txt";
      LoadTxtFile;
      NEW(charTypeCache,256);
      NEW(mirrorPropCache,256);
    END Init;

    (* The property file has no leading comments. Therefore there are no lines to be skipped *)
    PROCEDURE FindStartPos;
    BEGIN
      startPos := 0;
    END FindStartPos;

    (* Delivers the bidi character type (property 4) of a character, from the cache if possible,
       otherwise from the file. Characters without an entry get the type 'L'. The result is stored
       in the cache, so GetBidiCharacterType and IsWhiteSpaceChar, which share the cache, always
       see the same value for a character. *)
    PROCEDURE LookupCharType(char : Texts.Char32; VAR res : ARRAY OF CHAR);
    BEGIN
      (* firstly, the appropriate cache is searched for an entry of this character *)
      charTypeCache.SLookup(char,res);
      (* if nothing was found the file is read and the result is added to the cache. *)
      IF res = "" THEN
        GetProperty(char,4,res);
        IF res = "" THEN
          COPY("L",res);
          KernelLog.String("no character type has been found. Using 'L'"); KernelLog.Ln;
        END;
        charTypeCache.SInsert(char,res);
      END;
    END LookupCharType;

    (* Returns the bidirectional character type for a specific character.
       res - receives the type; if it is NIL a string is allocated, otherwise the given string is
             filled (and the type truncated should the string be too short). *)
    PROCEDURE GetBidiCharacterType*(char : Texts.Char32; VAR res : Strings.String);
    VAR
      charType : ARRAY 16 OF CHAR;
    BEGIN
      LookupCharType(char,charType);
      IF res = NIL THEN NEW(res,LEN(charType)) END;
      COPY(charType,res^);
    END GetBidiCharacterType;

    (* Checks if a specific character has its 'mirrored' property set to 'yes' *)
    PROCEDURE IsMirroredChar*(char : Texts.Char32) : BOOLEAN;
    VAR
      res : ARRAY 16 OF CHAR;
    BEGIN
      (* firstly, the appropriate cache is searched for an entry of this character *)
      mirrorPropCache.SLookup(char,res);
      (* if nothing was found the file is read and the result is added to the cache. *)
      IF res = "" THEN
        GetProperty(char,9,res);
        (* remember characters without an entry as 'not mirrored'; an empty string cannot be
           told apart from a cache miss and would cause a new file scan on every call *)
        IF res = "" THEN res := "N" END;
        mirrorPropCache.SInsert(char,res);
      END;
      RETURN res = "Y";
    END IsMirroredChar;

    (* Checks if the character type of a specific character is 'WS' *)
    PROCEDURE IsWhiteSpaceChar*(char : Texts.Char32) : BOOLEAN;
    VAR
      res : ARRAY 16 OF CHAR;
    BEGIN
      LookupCharType(char,res);
      RETURN res = "WS";
    END IsWhiteSpaceChar;

    (* Gets the character's property at a certain position (0 being the character's code itself).
       char - character whose line is searched; the scan assumes lines in ascending code order,
              as it gives up as soon as a larger code is read
       pos  - index of the ';'-separated property within the line
       res  - receives the property, truncated to the size of res; empty if the text is not
              loaded, pos is negative, the character has no line or the line has no such property *)
    PROCEDURE GetProperty*(char : Texts.Char32; pos : LONGINT; VAR res : ARRAY OF CHAR);
    VAR
      thisChar, thisInt : Texts.Char32;
      code : ARRAY 16 OF CHAR;        (* first property of the current line: the character's code *)
      field, read, codeLen, resLen : LONGINT;
      endOfText, endOfLine : BOOLEAN;
      dummyVal : WORD;
    BEGIN
      res[0] := 0X;
      IF (text = NIL) OR (textReader = NIL) OR (pos < 0) THEN RETURN END;
      text.AcquireRead;
      textReader.SetPosition(startPos);
      (* iterate through the lines, i.e. the characters *)
      LOOP
        field := 0;
        (* iterate through the properties of the line *)
        LOOP
          read := 0; codeLen := 0; resLen := 0;
          (* read the current property; only the code and the wanted property are stored,
             and never more characters than the respective buffer can hold *)
          REPEAT
            textReader.ReadCh(thisChar);
            endOfText := (thisChar = EOT) OR (thisChar = NUL);
            endOfLine := endOfText OR (thisChar = CR) OR (thisChar = LF);
            IF ~endOfLine & (thisChar # ORD(';')) THEN
              IF field = 0 THEN
                IF codeLen < LEN(code) - 1 THEN code[codeLen] := CHR(thisChar); INC(codeLen) END;
              ELSIF field = pos THEN
                IF resLen < LEN(res) - 1 THEN res[resLen] := CHR(thisChar); INC(resLen) END;
              END;
              INC(read);
            END;
          UNTIL endOfLine OR (thisChar = ORD(';'));
          (* end of file reached at the beginning of a property: the character has no entry *)
          IF endOfText & (read = 0) THEN
            res[0] := 0X;
            text.ReleaseRead;
            RETURN;
          END;
          IF field = 0 THEN
            (* skip empty lines *)
            IF endOfLine & (read = 0) THEN EXIT END;
            code[codeLen] := 0X;
            Strings.HexStrToInt(code,thisInt,dummyVal);
            IF (thisInt < char) THEN
              (* carry on if this was not the wanted character yet *)
              EXIT;
            ELSIF (thisInt > char) THEN
              (* return if the wanted character has already been passed *)
              res[0] := 0X;
              text.ReleaseRead;
              RETURN;
            END;
            (* this is the wanted character's line; position 0 asks for the code itself *)
            IF pos = 0 THEN
              COPY(code,res);
              text.ReleaseRead;
              RETURN;
            END;
          ELSIF field = pos THEN
            (* the property has been found *)
            res[resLen] := 0X;
            text.ReleaseRead;
            RETURN;
          END;
          (* carry on with the next line if this was the last property of the line *)
          IF endOfLine THEN EXIT END;
          INC(field);
        END;
        (* skip the rest of the line, unless its end has been consumed already; otherwise the
           following line would be skipped unread *)
        IF ~endOfLine THEN NextLine END;
      END;
    END GetProperty;

    (* Exported procedure to print the character type cache *)
    PROCEDURE PrintCharTypeCache*;
    BEGIN
      charTypeCache.Print;
    END PrintCharTypeCache;
  END UnicodeTxtReader;
  365. TYPE
  (* TxtReader to read the BidiMirroring.txt file. Each relevant line starts with the code of a
     source character, followed by ';', a blank and the code of its mirrored counterpart. *)
  BidiMirroringTxtReader*=OBJECT(TxtReader)
  VAR
    mirrorCache : CharacterPropertyCache;   (* source character -> mirrored character; 0 = no counterpart *)

    (* Loads the BidiMirroring.txt into memory *)
    PROCEDURE &Init*;
    BEGIN
      filename := "BidiMirroring.txt";
      LoadTxtFile;
      NEW(mirrorCache,256);
    END Init;

    (* Finds the start position of the relevant data. The mirroring file has a large comment at the
       beginning, so the scanner needs to be set to the first line that does not start with '#'.
       The position is taken before the line's first character is read, so the first data line is
       kept complete. Does nothing but reset startPos if no text is loaded. *)
    PROCEDURE FindStartPos;
    VAR
      thisChar : Texts.Char32;
      lineStart : LONGINT;
    BEGIN
      startPos := 0;
      IF (text = NIL) OR (textReader = NIL) THEN RETURN END;
      thisChar := 0;
      text.AcquireRead;
      (* read the line's first character and skip the line if it's a '#' *)
      lineStart := textReader.GetPosition();
      textReader.ReadCh(thisChar);
      WHILE (thisChar = ORD('#')) DO
        NextLine;
        lineStart := textReader.GetPosition();
        textReader.ReadCh(thisChar);
      END;
      (* store the start position *)
      startPos := lineStart;
      text.ReleaseRead;
    END FindStartPos;

    (* Reads the next source character. The procedure assumes the scanner to be at the beginning
       of a line and the text to be locked. Line breaks and blanks in front of the code are
       skipped. Returns -1 if no valid code terminated by ';' follows, i.e. at a '#', at the end
       of the text or if the code does not fit into the buffer. *)
    PROCEDURE GetSourceChar() : Texts.Char32;
    VAR
      sourceString : ARRAY 9 OF CHAR;
      sourceInt, tempChar : Texts.Char32;
      len : LONGINT;
      tooLong : BOOLEAN;
      res : WORD;
    BEGIN
      sourceInt := -1;
      len := 0; tooLong := FALSE;
      (* read the characters that form the code for the source character *)
      LOOP
        textReader.ReadCh(tempChar);
        IF (tempChar = EOT) OR (tempChar = NUL) OR (tempChar = ORD('#')) OR (tempChar = ORD(';')) THEN
          EXIT;
        ELSIF (tempChar = CR) OR (tempChar = LF) THEN
          (* a line break in front of the ';' means that the characters read so far were no code *)
          len := 0; tooLong := FALSE;
        ELSIF (tempChar = SP) & (len = 0) THEN
          (* skip leading blanks *)
        ELSIF len < LEN(sourceString) - 1 THEN
          sourceString[len] := CHR(tempChar); INC(len);
        ELSE
          tooLong := TRUE;
        END;
      END;
      (* if the code was terminated by a ';' it is assumed to be valid and is converted into an integer *)
      IF (tempChar = ORD(';')) & ~tooLong THEN
        sourceString[len] := 0X;
        Strings.HexStrToInt(sourceString,sourceInt,res);
      END;
      RETURN sourceInt;
    END GetSourceChar;

    (* Reads the next target character. The procedure assumes the scanner to have already read the
       source character including its ';' and the text to be locked. Blanks in front of the code
       are skipped; the code ends at a blank, a '#', a line break or the end of the text.
       Characters that do not fit into the buffer are dropped. *)
    PROCEDURE GetTargetChar() : Texts.Char32;
    VAR
      targetString : ARRAY 9 OF CHAR;
      targetInt, tempChar : Texts.Char32;
      len : LONGINT;
      res : WORD;
    BEGIN
      targetInt := -1;
      len := 0;
      (* read the characters that form the code for the target character *)
      LOOP
        textReader.ReadCh(tempChar);
        IF (tempChar = EOT) OR (tempChar = NUL) OR (tempChar = ORD('#')) OR (tempChar = CR) OR (tempChar = LF) THEN
          EXIT;
        ELSIF tempChar = SP THEN
          (* a blank ends the code, unless the code has not started yet *)
          IF len > 0 THEN EXIT END;
        ELSIF len < LEN(targetString) - 1 THEN
          targetString[len] := CHR(tempChar); INC(len);
        END;
      END;
      (* terminate the result string and convert it into an integer *)
      targetString[len] := 0X;
      Strings.HexStrToInt(targetString,targetInt,res);
      RETURN targetInt;
    END GetTargetChar;

    (* Searches the mirror file for a given character and returns its counterpart if found,
       otherwise 0. Both outcomes are cached, so the file is scanned at most once per character.
       Returns 0 without caching if the file could not be loaded. *)
    PROCEDURE GetMirroredChar*(char : Texts.Char32) : Texts.Char32;
    VAR
      sChar, mirrored : Texts.Char32;
    BEGIN
      (* look in the cache first *)
      mirrored := mirrorCache.CLookup(char);
      IF mirrored # -1 THEN RETURN mirrored END;
      IF (text = NIL) OR (textReader = NIL) THEN RETURN 0 END;
      text.AcquireRead;
      (* search the right source character *)
      textReader.SetPosition(startPos);
      REPEAT
        sChar := GetSourceChar();
        IF (sChar # char) & (sChar # -1) THEN
          NextLine;
        END;
      UNTIL (sChar = char) OR (sChar = -1); (* found, or the end of the source characters is reached *)
      IF (sChar = -1) THEN
        (* Null if the source character could not be found *)
        mirrored := 0;
      ELSE
        mirrored := GetTargetChar();
      END;
      text.ReleaseRead;
      mirrorCache.CInsert(char,mirrored);
      RETURN mirrored;
    END GetMirroredChar;
  END BidiMirroringTxtReader;
  475. END UnicodeProperties.
  476. System.Free UnicodeProperties ~
  477. UnicodeProperties.TestIsMirroredChar 00000028H ~
  478. PC0.Compile UnicodeProperties.Mod ~