123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576 |
- MODULE UnicodeProperties; (** AUTHOR "gubsermi"; PURPOSE "Reading the Unicode.txt file and interpreting the properties"; *)
- IMPORT
- Texts, Codecs, Files, Streams, KernelLog, Strings;
CONST
	(* Character codes that the readers in this module compare against Texts.Char32 values. *)
	NUL* = 0H;	(* null character; GetProperty treats it as an end-of-file marker *)
	EOT* = 4H;	(* end of transmission; treated as an end-of-file marker *)
	LF* = 0AH;	(* line feed *)
	CR* = 0DH;	(* carriage return *)
	SP* = 20H;	(* space *)

	(* When TRUE, every cache hit and every cache insertion is written to the kernel log. *)
	CacheDebugging = FALSE;

VAR
	(* Read-only for clients. TxtReader.LoadTxtFile clears it on entry and sets it when loading fails. *)
	error- : BOOLEAN;
TYPE
	(* One entry in a bucket chain of a CharacterPropertyCache.
		An entry carries EITHER a string value (sValue, with cValue = -1) OR a character value
		(cValue, with sValue = ""), never both. *)
	CacheElement = OBJECT
	VAR
		next : CacheElement;			(* next entry in the same bucket, NIL at the end of the chain *)
		key : Texts.Char32;			(* the character this entry belongs to *)
		sValue : ARRAY 256 OF CHAR;	(* string value; empty for character entries *)
		cValue : Texts.Char32;		(* character value; -1 for string entries *)

		(* Initializes the entry with a key and a value.
			key		- the character the value is cached for
			sValue	- string value; if it is non-empty, the entry becomes a string entry and cValue is ignored
			cValue	- character value, used only if sValue is empty
			The 'next' link is deliberately left untouched. *)
		PROCEDURE &Init*(key : Texts.Char32; CONST sValue : ARRAY OF CHAR; cValue : Texts.Char32);
		BEGIN
			SELF.key := key;
			IF sValue[0] # 0X THEN
				(* COPY stops at the terminator and truncates to the destination size. The former
					Strings.Copy(sValue,0,LEN(sValue),...) moved the caller's complete buffer, which is
					only safe while that buffer is not larger than SELF.sValue. *)
				COPY(sValue, SELF.sValue);
				SELF.cValue := -1;
			ELSE
				SELF.sValue[0] := 0X;
				SELF.cValue := cValue;
			END;
		END Init;
	END CacheElement;
- (* The Property Cache uses a Hashmap of a specific size to cache either string or character properties. *)
- CharacterPropertyCache = OBJECT
- VAR
- internalCache : POINTER TO ARRAY OF CacheElement;
- cacheSize : LONGINT;
- (* Initializes the hashmap with a specific size *)
- PROCEDURE &Init*(size : LONGINT);
- BEGIN
- cacheSize := size;
- NEW(internalCache,cacheSize);
- END Init;
- (* Searches the cache for a specific key and returns the corresponding string entry *)
- PROCEDURE SLookup(char : Texts.Char32; VAR res : ARRAY OF CHAR);
- VAR
- bucket : LONGINT;
- currentElement : CacheElement;
- BEGIN
- (* get the bucket where the element resides if available *)
- bucket := char MOD cacheSize;
- currentElement := internalCache[bucket];
- (* search the linked list for the entry *)
- WHILE currentElement # NIL DO
- IF currentElement.key = char THEN
- IF CacheDebugging THEN
- KernelLog.String("found: "); KernelLog.Hex(currentElement.key,4);
- KernelLog.String(" ("); KernelLog.String(currentElement.sValue);
- KernelLog.String(")"); KernelLog.Ln;
- END;
- Strings.Copy(currentElement.sValue,0,LEN(res),res);
- RETURN;
- ELSE
- currentElement := currentElement.next;
- END;
- END;
- (* clear the result if nothing was found. *)
- res := "";
- END SLookup;
- (* Searches the cache for a specific key and returns the corresponding character entry *)
- PROCEDURE CLookup(char : Texts.Char32) : Texts.Char32;
- VAR
- bucket : LONGINT;
- currentElement : CacheElement;
- BEGIN
- (* get the bucket where the element resides if available *)
- bucket := char MOD cacheSize;
- currentElement := internalCache[bucket];
- (* search the linked list for the entry *)
- WHILE currentElement # NIL DO
- IF currentElement.key = char THEN
- IF CacheDebugging THEN
- KernelLog.String("found: "); KernelLog.Hex(currentElement.key,4);
- KernelLog.String(" ("); KernelLog.Hex(currentElement.cValue,4);
- KernelLog.String(")"); KernelLog.Ln;
- END;
- RETURN currentElement.cValue;
- ELSE
- currentElement := currentElement.next;
- END;
- END;
- (* return a 'fault code' if nothing was found *)
- RETURN -1
- END CLookup;
- (* Inserts a new string entry for a given key. *)
- PROCEDURE SInsert(char : Texts.Char32; CONST value : ARRAY OF CHAR);
- VAR
- newElement : CacheElement;
- bucket : LONGINT;
- BEGIN
- NEW(newElement,char,value,-1);
- (* insert the new entry at the first position of the correct bucket *)
- bucket := char MOD cacheSize;
- newElement.next := internalCache[bucket];
- internalCache[bucket] := newElement;
- IF CacheDebugging THEN
- KernelLog.String("inserted: "); KernelLog.Hex(char,4);
- KernelLog.String(" (");
- KernelLog.String(value);
- KernelLog.String(")"); KernelLog.Ln;
- END;
- END SInsert;
- (* Inserts a new character entry for a given key. *)
- PROCEDURE CInsert(char : Texts.Char32; value : Texts.Char32);
- VAR
- newElement: CacheElement;
- bucket : LONGINT;
- dummy : ARRAY 1 OF CHAR;
- BEGIN
- dummy[0] := CHR(0H);
- NEW(newElement,char,dummy,value);
- (* insert the new entry at the first position of the correct bucket *)
- bucket := char MOD cacheSize;
- newElement.next := internalCache[bucket];
- internalCache[bucket] := newElement;
- IF CacheDebugging THEN
- KernelLog.String("inserted: "); KernelLog.Hex(char,4);
- KernelLog.String(" ("); KernelLog.Hex(value,4);
- KernelLog.String(")"); KernelLog.Ln;
- END;
- END CInsert;
- (* Prints the whole cache to the console *)
- PROCEDURE Print;
- VAR
- i : LONGINT;
- thisElement : CacheElement;
- BEGIN
- FOR i := 0 TO cacheSize - 1 DO
- thisElement := internalCache[i];
- KernelLog.Int(i,3); KernelLog.String(": ");
- WHILE thisElement # NIL DO
- KernelLog.Int(thisElement.key,4); KernelLog.String(" (");
- IF thisElement.cValue = -1 THEN KernelLog.String(thisElement.sValue) END;
- KernelLog.String(") -> ");
- thisElement := thisElement.next;
- END;
- KernelLog.Ln;
- END;
- END Print;
- END CharacterPropertyCache;
- (* A handy implementation for text file reading and analyzation. Basic functionality is provided. TxtReaders that
- handle a specific text layout, should inherit this class and (re-)implement necessary procedures. *)
- TxtReader = OBJECT
- VAR
- filename : ARRAY 256 OF CHAR;
- text : Texts.Text;
- textReader : Texts.TextReader;
- startPos : LONGINT;
- decoder : Codecs.TextDecoder;
- msg : ARRAY 512 OF CHAR;
- fullname : ARRAY 256 OF CHAR;
- file : Files.File;
- in: Streams.Reader;
- decoderRes : WORD;
- (* loads a file into a local Text and creates an associated TextReader *)
- PROCEDURE LoadTxtFile;
- BEGIN
- error := FALSE;
- COPY(filename, fullname);
- (* Check whether file exists and get its canonical name *)
- file := Files.Old(filename);
- IF (file # NIL) THEN
- file.GetName(fullname);
- ELSE
- file := Files.New(filename); (* to get path *)
- IF (file # NIL) THEN
- file.GetName(fullname);
- file := NIL;
- END;
- END;
- IF (file # NIL) THEN
- decoder := Codecs.GetTextDecoder("ISO8859-1");
- IF (decoder # NIL) THEN
- in := Codecs.OpenInputStream(fullname);
- IF in # NIL THEN
- decoder.Open(in, decoderRes);
- IF decoderRes = 0 THEN
- text := decoder.GetText();
- NEW(textReader,text);
- END;
- ELSE
- msg := "Can't open input stream on file "; Strings.Append(msg, fullname);
- KernelLog.String(msg);
- error := TRUE;
- END;
- ELSE
- msg := "No decoder for file "; Strings.Append(msg, fullname);
- Strings.Append(msg, " (Format: "); Strings.Append(msg, "ISO8859-1"); Strings.Append(msg, ")");
- KernelLog.String(msg);
- error := TRUE;
- END;
- ELSE
- msg := "file '"; Strings.Append(msg, fullname); Strings.Append(msg,"' not found.");
- KernelLog.String(msg);
- error := TRUE;
- END;
- FindStartPos;
- END LoadTxtFile;
- (* Abstract procedure to be overwritten by the children of TxtReader *)
- PROCEDURE FindStartPos;
- BEGIN
- HALT (999);
- END FindStartPos;
- (* Skips a whole line of the file *)
- PROCEDURE NextLine;
- VAR
- thisChar : Texts.Char32;
- BEGIN
- IF textReader = NIL THEN RETURN END;
- (* read the characters until the end of the line is reached *)
- REPEAT
- textReader.ReadCh(thisChar);
- UNTIL ((thisChar = LF) OR (thisChar = CR));
- END NextLine;
- END TxtReader;
TYPE
	(* TxtReader for the UnicodeData.txt file, where every line describes one character as a list of
		properties separated by ';', the first one being the character's hexadecimal code.
		So far there's direct support for the bidi character type (property 4) and the 'mirrored'
		property (property 9). More explicit lookups can easily be added later on. *)
	UnicodeTxtReader*=OBJECT(TxtReader)
	VAR
		(* For each property that is explicitly needed, a cache is used. Whenever a new property is
			needed often, feel free to add another cache. *)
		charTypeCache, mirrorPropCache : CharacterPropertyCache;

		(* Loads the UnicodeData.txt into memory and creates the caches. *)
		PROCEDURE &Init*;
		BEGIN
			filename := "UnicodeData.txt";
			LoadTxtFile;
			NEW(charTypeCache,256);
			NEW(mirrorPropCache,256);
		END Init;

		(* The property file has no leading comments. Therefore there are no lines to be skipped. *)
		PROCEDURE FindStartPos;
		BEGIN
			startPos := 0;
		END FindStartPos;

		(* Delivers the bidi character type of 'char', from the cache if possible and from the file
			otherwise. Characters without a type in the file get the type 'L'. The result is always
			non-empty and is cached, so that no character causes the file to be scanned twice. *)
		PROCEDURE LookupCharType(char : Texts.Char32; VAR res : ARRAY OF CHAR);
		BEGIN
			charTypeCache.SLookup(char,res);
			IF res[0] = 0X THEN
				GetProperty(char,4,res);
				IF res[0] = 0X THEN
					COPY("L", res);
					KernelLog.String("no character type has been found. Using 'L'"); KernelLog.Ln;
				END;
				charTypeCache.SInsert(char,res);
			END;
		END LookupCharType;

		(** Returns the bidirectional character type for a specific character.
			res - receives the type, truncated if the string is too short. If res is NIL, a string is
			allocated; formerly NIL caused a trap. *)
		PROCEDURE GetBidiCharacterType*(char : Texts.Char32; VAR res : Strings.String);
		VAR
			charType : ARRAY 16 OF CHAR;
		BEGIN
			LookupCharType(char,charType);
			IF res = NIL THEN
				NEW(res,LEN(charType));
			END;
			(* COPY is bounded by LEN(res^); the former Strings.Copy call always moved 16 bytes *)
			COPY(charType,res^);
		END GetBidiCharacterType;

		(** Checks if a specific character has its 'mirrored' property set to 'yes'. *)
		PROCEDURE IsMirroredChar*(char : Texts.Char32) : BOOLEAN;
		VAR
			res : ARRAY 16 OF CHAR;
		BEGIN
			(* firstly, the appropriate cache is searched for an entry of this character *)
			mirrorPropCache.SLookup(char,res);
			(* if nothing was found the file is read and the result is added to the cache. *)
			IF res = "" THEN
				GetProperty(char,9,res);
				(* An empty string in the cache is indistinguishable from a miss. Characters that are
					not in the file are therefore cached as 'N', otherwise each call for such a
					character scans the file again. *)
				IF res = "" THEN
					res := "N";
				END;
				mirrorPropCache.SInsert(char,res);
			END;
			RETURN res = "Y";
		END IsMirroredChar;

		(** Checks if the character type of a specific character is 'WS'. Uses the same lookup as
			GetBidiCharacterType, so both agree on what is cached for characters without a type. *)
		PROCEDURE IsWhiteSpaceChar*(char : Texts.Char32) : BOOLEAN;
		VAR
			res : ARRAY 16 OF CHAR;
		BEGIN
			LookupCharType(char,res);
			RETURN res = "WS";
		END IsWhiteSpaceChar;

		(* Reads one property, i.e. all characters up to the next ';', CR, LF, NUL or EOT.
			keep	- if TRUE the property is stored in 'field' as a terminated string, truncated to the
					  size of 'field'; if FALSE 'field' is not touched
			last	- receives the character that ended the property
			The text must be locked by the caller. NUL and EOT are taken as end-of-file markers
			(assumed to be what ReadCh delivers at the end of the text - confirm against Texts.TextReader). *)
		PROCEDURE ReadField(keep : BOOLEAN; VAR field : ARRAY OF CHAR; VAR last : Texts.Char32);
		VAR
			n : LONGINT;
		BEGIN
			n := 0;
			LOOP
				textReader.ReadCh(last);
				IF (last = ORD(';')) OR (last = CR) OR (last = LF) OR (last = NUL) OR (last = EOT) THEN
					EXIT;
				END;
				IF keep & (n < LEN(field) - 1) THEN
					field[n] := CHR(last);
					INC(n);
				END;
			END;
			IF keep THEN
				field[n] := 0X;
			END;
		END ReadField;

		(** Gets the character's property at a certain position (0 being the character's code itself).
			char	- the character to look for; the lines of the file are expected in ascending order
			pos		- index of the wanted property within the line
			res		- receives the property, truncated to the size of res. It is "" if the character is
					  not in the file, if its line has no property 'pos', if pos is negative or if the
					  file could not be loaded.
			Changes compared to the previous version: no write beyond the end of res, no trap if the
			file is not loaded, no endless loop at the end of a text without final line break,
			pos = 0 delivers the code of 'char' instead of the code in the first line, and empty
			lines (e.g. the LF of a CR-LF pair) no longer cause the following line to be skipped. *)
		PROCEDURE GetProperty*(char : Texts.Char32; pos : LONGINT; VAR res : ARRAY OF CHAR);
		VAR
			ch, code : Texts.Char32;
			field : LONGINT;
			found : BOOLEAN;
			hexRes : WORD;
		BEGIN
			res[0] := 0X;
			IF (text = NIL) OR (textReader = NIL) OR (pos < 0) THEN RETURN END;

			text.AcquireRead;
			textReader.SetPosition(startPos);

			(* find the line of the character by comparing the first property of each line *)
			found := FALSE;
			LOOP
				ReadField(TRUE,res,ch);
				IF (ch = NUL) OR (ch = EOT) THEN
					(* end of file reached *)
					EXIT;
				ELSIF res[0] # 0X THEN
					Strings.HexStrToInt(res,code,hexRes);
					IF code = char THEN
						found := TRUE;
						EXIT;
					ELSIF code > char THEN
						(* the wanted character has already been passed *)
						EXIT;
					END;
				END;
				(* not the wanted character yet: skip what is left of this line *)
				IF ch = ORD(';') THEN
					NextLine;
				END;
			END;

			IF ~found THEN
				res[0] := 0X;
			ELSIF pos > 0 THEN
				(* step through the properties of the line, only the wanted one is stored *)
				res[0] := 0X;
				field := 0;
				WHILE (ch = ORD(';')) & (field < pos) DO
					INC(field);
					ReadField(field = pos,res,ch);
				END;
				(* the line ended before the wanted property *)
				IF field # pos THEN
					res[0] := 0X;
				END;
			END;
			text.ReleaseRead;
		END GetProperty;

		(** Exported procedure to print the character type cache. *)
		PROCEDURE PrintCharTypeCache*;
		BEGIN
			charTypeCache.Print;
		END PrintCharTypeCache;
	END UnicodeTxtReader;
TYPE
	(* TxtReader for the BidiMirroring.txt file. After a leading block of comment lines ('#'), each data
		line has the form "<source code>; <target code> # <comment>" with hexadecimal codes. *)
	BidiMirroringTxtReader*=OBJECT(TxtReader)
	VAR
		(* maps a character to its mirrored counterpart, 0 standing for 'has no counterpart' *)
		mirrorCache : CharacterPropertyCache;

		(* Loads the BidiMirroring.txt into memory and creates the cache. *)
		PROCEDURE &Init*;
		BEGIN
			filename := "BidiMirroring.txt";
			LoadTxtFile;
			NEW(mirrorCache,256);
		END Init;

		(* Finds the start position of the relevant data. The mirroring file has a large comment at the
			beginning, so startPos is set to the beginning of the first line that is neither a comment
			nor empty. The position is taken BEFORE the first character of a line is inspected;
			formerly it was taken afterwards, which dropped the first character of that line, and a
			CR-LF line break ended the search after the first comment line. If no text is loaded,
			startPos is 0. *)
		PROCEDURE FindStartPos;
		VAR
			ch : Texts.Char32;
			lineStart : LONGINT;
		BEGIN
			startPos := 0;
			IF (text = NIL) OR (textReader = NIL) THEN RETURN END;

			text.AcquireRead;
			textReader.SetPosition(0);
			LOOP
				lineStart := textReader.GetPosition();
				textReader.ReadCh(ch);
				IF ch = ORD('#') THEN
					(* comment line *)
					NextLine;
				ELSIF (ch # CR) & (ch # LF) THEN
					(* first character of a data line, or the end of the text *)
					EXIT;
				END;
			END;
			startPos := lineStart;
			text.ReleaseRead;
		END FindStartPos;

		(* Reads the next source character. The procedure assumes the scanner to be at the beginning
			of the line and the text to be locked. Returns the code in front of the ';', or -1 if a
			'#' or the end of the text (NUL or EOT) comes first. Line breaks and blanks in front of
			the code are skipped. At most LEN(digits) - 1 characters are stored, more than a valid
			code needs; further characters are dropped instead of being written past the buffer. *)
		PROCEDURE GetSourceChar() : Texts.Char32;
		VAR
			digits : ARRAY 7 OF CHAR;
			code, ch : Texts.Char32;
			n : LONGINT;
			res : WORD;
		BEGIN
			code := -1;
			n := 0;
			LOOP
				textReader.ReadCh(ch);
				IF (ch = NUL) OR (ch = EOT) OR (ch = ORD('#')) OR (ch = ORD(';')) THEN
					EXIT;
				END;
				IF (ch > SP) & (n < LEN(digits) - 1) THEN
					digits[n] := CHR(ch);
					INC(n);
				END;
			END;
			(* only a code terminated by a ';' is assumed to be valid *)
			IF ch = ORD(';') THEN
				digits[n] := 0X;
				Strings.HexStrToInt(digits,code,res);
			END;
			RETURN code;
		END GetSourceChar;

		(* Reads the next target character. The procedure assumes the scanner to have already read
			the source character including the ';' and the text to be locked. Blanks in front of the
			code are skipped; the code ends at a blank, a '#', a line break or the end of the text.
			Formerly exactly one character was dropped in front of the code and a line break did not
			end it. Writing to the buffer is bounded as in GetSourceChar. *)
		PROCEDURE GetTargetChar() : Texts.Char32;
		VAR
			digits : ARRAY 7 OF CHAR;
			code, ch : Texts.Char32;
			n : LONGINT;
			res : WORD;
		BEGIN
			code := -1;
			n := 0;
			LOOP
				textReader.ReadCh(ch);
				IF (ch = NUL) OR (ch = EOT) OR (ch = ORD('#')) OR (ch = CR) OR (ch = LF) THEN
					EXIT;
				ELSIF ch = SP THEN
					(* a blank after the code ends it, blanks in front of it are skipped *)
					IF n > 0 THEN EXIT END;
				ELSIF n < LEN(digits) - 1 THEN
					digits[n] := CHR(ch);
					INC(n);
				END;
			END;
			(* terminate the result string and convert it into an integer *)
			digits[n] := 0X;
			Strings.HexStrToInt(digits,code,res);
			RETURN code;
		END GetTargetChar;

		(** Searches the mirror file for a given character and returns its counterpart if found.
			Returns 0 if the character has no counterpart in the file or if the file could not be
			loaded. Results read from the file are cached, including 'no counterpart', so the file is
			scanned at most once per character. *)
		PROCEDURE GetMirroredChar*(char : Texts.Char32) : Texts.Char32;
		VAR
			source, mirrored : Texts.Char32;
		BEGIN
			(* look in the cache first *)
			mirrored := mirrorCache.CLookup(char);
			IF mirrored # -1 THEN
				RETURN mirrored;
			END;
			IF (text = NIL) OR (textReader = NIL) THEN
				RETURN 0;
			END;

			text.AcquireRead;
			(* search the right source character *)
			textReader.SetPosition(startPos);
			REPEAT
				source := GetSourceChar();
				(* after -1 the search is over, there is no need to skip a line any more *)
				IF (source # char) & (source # -1) THEN
					NextLine;
				END;
			UNTIL (source = char) OR (source = -1);

			IF source = -1 THEN
				(* the source character could not be found *)
				mirrored := 0;
			ELSE
				mirrored := GetTargetChar();
			END;
			text.ReleaseRead;

			mirrorCache.CInsert(char,mirrored);
			RETURN mirrored;
		END GetMirroredChar;
	END BidiMirroringTxtReader;
- END UnicodeProperties.
- System.Free UnicodeProperties ~
- UnicodeProperties.TestIsMirroredChar 00000028H ~
- PC0.Compile UnicodeProperties.Mod ~
|