Difference between revisions of "UTF-8"

From Second Life Wiki
Jump to: navigation, search
m (<lsl> tag to <source>)
 
(27 intermediate revisions by 7 users not shown)
Line 1: Line 1:
SL uses UTF-8 for storing and transmitting strings and with these functions you can work with Unicode characters.
+
{{LSL Header}}{{LSLC|User-Defined_Functions}}
 +
Second Life uses {{Wikipedia|UTF-8|UTF-8}} for storing and transmitting [[String|strings]] and with these functions you can work with {{Wikipedia|Unicode|Unicode}} characters. See: [[Unicode In 5 Minutes]] for a brief introduction to {{Wikipedia|Unicode|Unicode}}.
  
These functions are part of the Combined Library written by {{User|Strife Onizuka}}.
 
  
== Mono Unsafe==
+
These functions are part of the [[Combined Library]] written by {{User|Strife Onizuka}}.
These versions won't work in Mono but will in LSO.
+
 
<pre>
+
== Limits under Mono ==
//===================================================//
+
 
 +
One of the side effects of using the Mono VM is that the string encoding used is not the archaic version of UTF-8 that the LSO VM uses, but instead is a much later version of UTF-16. The important difference is that some ranges of values that could be encoded in the old UTF-8 are not well supported in Mono's UTF-16 implementation. This means some values that worked with these functions in LSO, will not work under Mono.
 +
 
 +
Unless you are using these functions to pack data in strings you are unlikely to ever need to know or worry about this.
 +
 
 +
Some background: Unicode predominantly contains simple characters. However, some Unicode values do not represent characters but have special meaning within the Unicode Specification. Under Mono these values result in one or more question marks ("?"). UnicodeIntegerToUTF8 and UTF8ToUnicodeInteger work perfectly for the simple characters, but not so well for the other values. More on these ranges can be found in the [http://unicode.org Unicode] specification and [http://en.wikipedia.org/wiki/Specials_%28Unicode_block%29 Specials (Unicode block)]. While LSO doesn't care about these limitations, Mono does.
 +
 
 +
The following integer ranges are where UTF8ToUnicodeInteger() and UnicodeIntegerToUTF8() are NOT inverse functions of each other under Mono.
 +
Converting these integers to UTF-8 and back to an integer is likely to result in the integer 63 (the Unicode code point for "?"), so uniqueness is no longer guaranteed:
 +
 
 +
* [55296..57343] (2048 values). This corresponds to the surrogate area U+D800-U+DFFF, which is a reserved range because it's used by UTF-16 to represent the code points above U+FFFF.
 +
* [65534] U+FFFE is in fact THE intentionally "INVALID CHARACTER" Unicode code point. It is invalid by design.
 +
* [1114112 and above] can't be encoded because the Unicode range stops at U+10FFFF.
 +
 
 +
== Standard ==
 +
 
 +
This version of UnicodeIntegerToUTF8 complies with the latest standard. [[LSO]], on the other hand, complies with an earlier standard. The newer standard includes only a subset of the older standard. The extended range of the old standard went unused so this incompleteness is moot.
 +
 
 +
<source lang="lsl2">string UnicodeIntegerToUTF8(integer input)//Mono Safe, LSLEditor Safe, LSO Incomplete
 +
{//LSO allows for the older UTF-8 range, this function only supports the new UTF-16 range.
 +
    if(input > 0)
 +
    {
 +
        if(input <= 0x7FF)
 +
        {//instead of a flat if else chain, this redistributes the fork load so that only the 4 byte characters result in 3 forks, all the other paths are 2 forks.
 +
            if(input <= 0x7F){
 +
                input = input << 24;
 +
                jump quick_return;//saves us from the implicit double jump that using an else would cause.
 +
            }
 +
            input = 0xC0800000 | ((input << 18) & 0x1F000000) | ((input << 16) & 0x3F0000);
 +
        }
 +
        else if(input <= 0xFFFF)
 +
            input = 0xE0808000 | ((input << 12) & 0x0F000000) | ((input << 10) & 0x3F0000) | ((input << 8) & 0x3F00);
 +
        else if(input <= 0x10FFFF)
 +
            input = 0xF0808080 | ((input << 06) & 0x07000000) | ((input << 04) & 0x3F0000) | ((input << 2) & 0x3F00) | (input & 0x3F);
 +
        else
 +
            jump error;//not in our range
 +
        @quick_return;
 +
        return llBase64ToString(llIntegerToBase64(input));
 +
    }
 +
    @error;
 +
    return "";
 +
}</source>
 +
 
 +
== General Use ==
 +
 
 +
This version will work fine in [[LSO]] and [[Mono]] but not in LSLEditor.
 +
 
 +
<source lang="lsl2">//===================================================//
 
//                Combined Library                  //
 
//                Combined Library                  //
//            "Nov 3 2007", "00:51:16"            //
+
//            "Feb 4 2008", "08:35:00"            //
//  Copyright (C) 2004-2007, Strife Onizuka (cc-by)  //
+
//  Copyright (C) 2004-2008, Strife Onizuka (cc-by)  //
 
//    http://creativecommons.org/licenses/by/3.0/    //
 
//    http://creativecommons.org/licenses/by/3.0/    //
 
//===================================================//
 
//===================================================//
 
//{
 
//{
  
integer UTF8ToUnicodeInteger(string input)//Mono Unsafe, LSO Safe
+
integer UTF8ToUnicodeInteger(string input)//LSLEditor Unsafe, LSO Safe
 
{
 
{
 
     integer result = llBase64ToInteger(llStringToBase64(input = llGetSubString(input,0,0)));
 
     integer result = llBase64ToInteger(llStringToBase64(input = llGetSubString(input,0,0)));
Line 30: Line 77:
 
}
 
}
  
string UnicodeIntegerToUTF8(integer input)//Mono Unsafe, LSO Safe
+
string UnicodeIntegerToUTF8(integer input)//LSLEditor Unsafe, LSO Safe
 
{
 
{
     integer bytes = llCeil(llLog(input) / 0.69314718055994530941723212145818);
+
     integer bytes = llCeil((llLog(input) / 0.69314718055994530941723212145818));
 
     string result = "%" + byte2hex((input >> (6 * bytes)) | ((0x3F80 >> bytes) << !(bytes = ((input >= 0x80) * (bytes + ~(((1 << bytes) - input) > 0)) / 5))));
 
     string result = "%" + byte2hex((input >> (6 * bytes)) | ((0x3F80 >> bytes) << !(bytes = ((input >= 0x80) * (bytes + ~(((1 << bytes) - input) > 0)) / 5))));
 
     while (bytes)
 
     while (bytes)
Line 39: Line 86:
 
}
 
}
  
string byte2hex(integer x)//Mono Unsafe, LSO Safe
+
string byte2hex(integer x)//LSLEditor Safe, LSO Safe
 
{//Helper function for use with unicode characters.
 
{//Helper function for use with unicode characters.
     return llGetSubString(hexc, x = ((x >> 4) & 0xF), x) + llGetSubString(hexc, x & 0xF, x & 0xF);
+
     integer y = (x >> 4) & 0xF;
}//This function would benifit greatly from the DUP opcode, it would remove 19 bytes.
+
    return llGetSubString(hexc, y, y) + llGetSubString(hexc, x & 0xF, x & 0xF);
 +
}//This function would benefit greatly from the DUP opcode, it would remove 19 bytes.
  
 
string hexc="0123456789ABCDEF";
 
string hexc="0123456789ABCDEF";
  
//} Combined Library
+
//} Combined Library</source>
</pre>
+
  
==Mono Safe==
+
==LSLEditor Safe==
These versions are Mono and LSO safe but will not execute as quickly in LSO as the Mono Unsafe versions
+
 
<pre>
+
This version will work in [[Mono]], [[LSO]] & LSLEditor. There will be a slight performance hit in [[LSO]] as compared to the LSLEditor Unsafe version.
//===================================================//
+
 
 +
<source lang="lsl2">//===================================================//
 
//                Combined Library                  //
 
//                Combined Library                  //
//            "Nov 3 2007", "00:46:15"            //
+
//            "Feb 4 2008", "08:38:13"            //
//  Copyright (C) 2004-2007, Strife Onizuka (cc-by)  //
+
//  Copyright (C) 2004-2008, Strife Onizuka (cc-by)  //
 
//    http://creativecommons.org/licenses/by/3.0/    //
 
//    http://creativecommons.org/licenses/by/3.0/    //
 
//===================================================//
 
//===================================================//
 
//{
 
//{
  
integer UTF8ToUnicodeInteger(string input)//Mono Safe, LSO Safe
+
integer UTF8ToUnicodeInteger(string input)//LSLEditor Safe, LSO Safe
 
{
 
{
 
     integer result = llBase64ToInteger(llStringToBase64(input = llGetSubString(input,0,0)));
 
     integer result = llBase64ToInteger(llStringToBase64(input = llGetSubString(input,0,0)));
Line 71: Line 119:
 
                     (( 0x3f000000 &  end) >> 6 ) |
 
                     (( 0x3f000000 &  end) >> 6 ) |
 
                     (( 0x0000003f &  begin) << 24) |
 
                     (( 0x0000003f &  begin) << 24) |
                     (( 0x00000100 &  begin) << 22) ) &  
+
                     (( 0x00000100 &  begin) << 22)
                    (0x7FFFFFFF >> (5 * ((integer)(llLog(~result) / 0.69314718055994530941723212145818) - 25)));
+
                ) & (0x7FFFFFFF >> (5 * ((integer)(llLog(~result) / 0.69314718055994530941723212145818) - 25)));
 
     }
 
     }
 
     return result >> 24;
 
     return result >> 24;
 
}
 
}
  
string UnicodeIntegerToUTF8(integer input)//Mono Safe, LSO Safe
+
string UnicodeIntegerToUTF8(integer input)//LSLEditor Safe, LSO Safe
 
{
 
{
     integer bytes = llCeil(llLog(input) / 0.69314718055994530941723212145818);
+
     integer bytes = llCeil((llLog(input) / 0.69314718055994530941723212145818));
 
     bytes = (input >= 0x80) * (bytes + ~(((1 << bytes) - input) > 0)) / 5;//adjust
 
     bytes = (input >= 0x80) * (bytes + ~(((1 << bytes) - input) > 0)) / 5;//adjust
 
     string result = "%" + byte2hex((input >> (6 * bytes)) | ((0x3F80 >> bytes) << !bytes));
 
     string result = "%" + byte2hex((input >> (6 * bytes)) | ((0x3F80 >> bytes) << !bytes));
Line 87: Line 135:
 
}
 
}
  
string byte2hex(integer x)//Mono Safe, LSO Safe
+
string byte2hex(integer x)//LSLEditor Safe, LSO Safe
 
{//Helper function for use with unicode characters.
 
{//Helper function for use with unicode characters.
 
     integer y = (x >> 4) & 0xF;
 
     integer y = (x >> 4) & 0xF;
 
     return llGetSubString(hexc, y, y) + llGetSubString(hexc, x & 0xF, x & 0xF);
 
     return llGetSubString(hexc, y, y) + llGetSubString(hexc, x & 0xF, x & 0xF);
}//This function would benifit greatly from the DUP opcode, it would remove 19 bytes.
+
}//This function would benefit greatly from the DUP opcode, it would remove 19 bytes.
  
 
string hexc="0123456789ABCDEF";
 
string hexc="0123456789ABCDEF";
  
//} Combined Library
+
//} Combined Library</source>
</pre>
+

Latest revision as of 14:48, 22 January 2015

Second Life uses UTF-8 for storing and transmitting strings and with these functions you can work with Unicode characters. See: Unicode In 5 Minutes for a brief introduction to Unicode.


These functions are part of the Combined Library written by Strife Onizuka.

Limits under Mono

One of the side effects of using the Mono VM is that the string encoding used is not the archaic version of UTF-8 that the LSO VM uses, but instead is a much later version of UTF-16. The important difference is that some ranges of values that could be encoded in the old UTF-8 are not well supported in Mono's UTF-16 implementation. This means some values that worked with these functions in LSO, will not work under Mono.

Unless you are using these functions to pack data in strings you are unlikely to ever need to know or worry about this.

Some background: Unicode predominantly contains simple characters. However, some Unicode values do not represent characters but have special meaning within the Unicode Specification. Under Mono these values result in one or more question marks ("?"). UnicodeIntegerToUTF8 and UTF8ToUnicodeInteger work perfectly for the simple characters, but not so well for the other values. More on these ranges can be found in the Unicode specification and Specials (Unicode block). While LSO doesn't care about these limitations, Mono does.

The following integer ranges are where UTF8ToUnicodeInteger() and UnicodeIntegerToUTF8() are NOT inverse functions of each other under Mono. Converting these integers to UTF-8 and back to an integer is likely to result in the integer 63 (the Unicode code point for "?"), so uniqueness is no longer guaranteed:

  • [55296..57343] (2048 values). This corresponds to the surrogate area U+D800-U+DFFF, which is a reserved range because it's used by UTF-16 to represent the code points above U+FFFF.
  • [65534] U+FFFE is in fact THE intentionally "INVALID CHARACTER" Unicode code point. It is invalid by design.
  • [1114112 and above] can't be encoded because the Unicode range stops at U+10FFFF.

Standard

This version of UnicodeIntegerToUTF8 complies with the latest standard. LSO, on the other hand, complies with an earlier standard. The newer standard includes only a subset of the older standard. The extended range of the old standard went unused so this incompleteness is moot.

string UnicodeIntegerToUTF8(integer input)//Mono Safe, LSLEditor Safe, LSO Incomplete
{//Encodes a code point in the range 1..0x10FFFF as a one-character UTF-8 string; anything else yields "".
 //LSO itself accepts the older, wider UTF-8 range; this function deliberately stops at 0x10FFFF.
    if(input <= 0 || input > 0x10FFFF)
        return "";//zero, negative, or beyond the last code point

    //The UTF-8 bytes are assembled in one integer, first byte in the top 8 bits, unused low bytes left at zero.
    integer packed;
    if(input <= 0x7F)//1 byte: 0xxxxxxx
        packed = input << 24;
    else if(input <= 0x7FF)//2 bytes: 110xxxxx 10xxxxxx
        packed = 0xC0800000 | ((input << 18) & 0x1F000000) | ((input << 16) & 0x3F0000);
    else if(input <= 0xFFFF)//3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
        packed = 0xE0808000 | ((input << 12) & 0x0F000000) | ((input << 10) & 0x3F0000) | ((input << 8) & 0x3F00);
    else//4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        packed = 0xF0808080 | ((input << 6) & 0x07000000) | ((input << 4) & 0x3F0000) | ((input << 2) & 0x3F00) | (input & 0x3F);

    //Round-trip through Base64 to turn the packed bytes into a string.
    return llBase64ToString(llIntegerToBase64(packed));
}

General Use

This version will work fine in LSO and Mono but not in LSLEditor.

//===================================================//
//                 Combined Library                  //
//             "Feb  4 2008", "08:35:00"             //
//  Copyright (C) 2004-2008, Strife Onizuka (cc-by)  //
//    http://creativecommons.org/licenses/by/3.0/    //
//===================================================//
//{
 
integer UTF8ToUnicodeInteger(string input)//LSLEditor Unsafe, LSO Safe
{//Returns the Unicode code point of the first character of input.
    //Only the first character is examined. Its leading UTF-8 bytes are read as an integer via Base64,
    //with the first byte landing in the top 8 bits (the single-byte path below returns result >> 24).
    integer result = llBase64ToInteger(llStringToBase64(input = llGetSubString(input,0,0)));
    //A set top bit means the lead byte is >= 0x80, i.e. the character occupies more than one byte.
    //NOTE(review): the expression below reassigns result and input while other operands read them. It only
    //produces the right answer if the operands written further right are evaluated first:
    //  1. the mask on the last line reads result while it still holds the Base64-derived value;
    //  2. the "<< 22" term turns input into the character's hex byte string (llEscapeURL with the "%" removed)
    //     and sets result to the hex digits in front of the last 8 (bytes beyond the last four);
    //  3. the "<< 24" term reads that value;
    //  4. the ">> 6" term sets result to the last 8 hex digits (the last four bytes);
    //  5. the three terms above it read that value.
    //The LSLEditor Safe variant of this function does the same work with separate begin/end variables.
    if(result & 0x80000000)//multibyte, continuing to use base64 is impractical because it requires smart shifting.
        return  (   (  0x0000003f &  result       ) |//6 payload bits of the last byte
                    (( 0x00003f00 &  result) >> 2 ) | 
                    (( 0x003f0000 &  result) >> 4 ) | 
                    (( 0x3f000000 & (result = (integer)("0x"+llGetSubString(input,-8,-1)))) >> 6 ) | 
                    (( 0x0000003f &  result) << 24) | 
                    (( 0x00000100 & (result = (integer)("0x"+llDeleteSubString(input = (string)llParseString2List(llEscapeURL(input),(list)"%",[]),-8,-1)))) << 22)
                ) & (  0x7FFFFFFF >> (5 * ((integer)(llLog(~result) / 0.69314718055994530941723212145818) - 25)));//the constant is ln(2): log2(~result) locates the first 0 bit of the lead byte, which sizes the mask that strips the length-prefix bits
//Alternative form of the last two lines, kept for reference; it sizes the mask from the length of the hex string instead:
//                    (( 0x00000100 & (result = (integer)("0x"+llDeleteSubString(input,-8,-1)))) << 22)
//                ) & (  0x7FFFFFFF >> (30 - (5 * (llStringLength(input = (string)llParseString2List(llEscapeURL(input),(list)"%",[])) >> 1))));
    return result >> 24;//single byte: the code point is the byte itself
}
 
string UnicodeIntegerToUTF8(integer input)//LSLEditor Unsafe, LSO Safe
{//Builds one "%XX" escape per UTF-8 byte of the code point and lets llUnescapeURL turn them into a string.
    integer bytes = llCeil((llLog(input) / 0.69314718055994530941723212145818));//ceil(log2(input)); the constant is ln(2)
    //The nested assignment replaces bytes with the number of continuation bytes (0 when input < 0x80).
    //NOTE(review): the operands to its left (6 * bytes and 0x3F80 >> bytes) need that updated value, so this line
    //depends on the right-hand operand being evaluated first; the LSLEditor Safe variant does the assignment
    //as a separate statement instead.
    //(0x3F80 >> bytes) supplies the lead byte's length prefix; << !bytes pushes it out of the low 8 bits
    //(the only bits byte2hex looks at) when there are no continuation bytes.
    string result = "%" + byte2hex((input >> (6 * bytes)) | ((0x3F80 >> bytes) << !(bytes = ((input >= 0x80) * (bytes + ~(((1 << bytes) - input) > 0)) / 5))));
    while (bytes)//one continuation byte per pass, most significant 6-bit group first
        result += "%" + byte2hex((((input >> (6 * (bytes = ~-bytes))) | 0x80) & 0xBF));//~-bytes is bytes - 1; "| 0x80 & 0xBF" forces the 10xxxxxx form
    return llUnescapeURL(result);
}
 
string byte2hex(integer x)//LSLEditor Safe, LSO Safe
{//Helper for the UTF-8 functions: formats the low 8 bits of x as two hex digits taken from hexc.
    integer high = (x >> 4) & 0xF;//upper nibble
    integer low = x & 0xF;//lower nibble
    return llGetSubString(hexc, high, high) + llGetSubString(hexc, low, low);
}
 
string hexc="0123456789ABCDEF";//hex digit lookup string, indexed by nibble value (used by byte2hex)
 
//} Combined Library

LSLEditor Safe

This version will work in Mono, LSO & LSLEditor. There will be a slight performance hit in LSO as compared to the LSLEditor Unsafe version.

//===================================================//
//                 Combined Library                  //
//             "Feb  4 2008", "08:38:13"             //
//  Copyright (C) 2004-2008, Strife Onizuka (cc-by)  //
//    http://creativecommons.org/licenses/by/3.0/    //
//===================================================//
//{
 
integer UTF8ToUnicodeInteger(string input)//LSLEditor Safe, LSO Safe
{//Returns the Unicode code point of the first character of input.
    input = llGetSubString(input, 0, 0);//only the first character is decoded
    //Leading UTF-8 bytes of that character read as an integer, first byte in the top 8 bits.
    integer result = llBase64ToInteger(llStringToBase64(input));
    if(!(result & 0x80000000))
        return result >> 24;//lead byte below 0x80: single byte, the code point is the byte itself

    //Multibyte: continuing to use base64 is impractical because it requires smart shifting,
    //so the bytes are re-read from the URL-escaped form with the "%" separators removed.
    string hex = (string)llParseString2List(llEscapeURL(input), (list)"%", []);
    integer end = (integer)("0x" + llGetSubString(hex, -8, -1));//last 8 hex digits (last four bytes)
    integer begin = (integer)("0x" + llDeleteSubString(hex, -8, -1));//whatever precedes them

    //Pack the 6-bit groups of every byte next to each other.
    integer payload = (0x0000003f & end)
                    | ((0x00003f00 & end) >> 2)
                    | ((0x003f0000 & end) >> 4)
                    | ((0x3f000000 & end) >> 6)
                    | ((0x0000003f & begin) << 24)
                    | ((0x00000100 & begin) << 22);

    //The constant is ln(2): log2(~result) locates the first 0 bit of the lead byte, which
    //sizes the mask that strips the length-prefix bits picked up from the lead byte.
    integer firstZero = (integer)(llLog(~result) / 0.69314718055994530941723212145818);
    return payload & (0x7FFFFFFF >> (5 * (firstZero - 25)));
}
 
string UnicodeIntegerToUTF8(integer input)//LSLEditor Safe, LSO Safe
{//Builds one "%XX" escape per UTF-8 byte of the code point and lets llUnescapeURL turn them into a string.
    integer log2up = llCeil((llLog(input) / 0.69314718055994530941723212145818));//ceil(log2(input)); the constant is ln(2)
    integer below = ((1 << log2up) - input) > 0;//1 when input is under 2^log2up, otherwise 0
    //Number of continuation bytes; 0 for anything under 0x80. ~below is -1 or -2.
    integer tail = (input >= 0x80) * (log2up + ~below) / 5;
    integer shift = 6 * tail;//bit offset of the payload bits that belong in the lead byte

    //(0x3F80 >> tail) supplies the lead byte's length prefix; << !tail pushes it out of the
    //low 8 bits (all that byte2hex looks at) when there are no continuation bytes.
    string result = "%" + byte2hex((input >> shift) | ((0x3F80 >> tail) << !tail));
    while (shift)
    {//one continuation byte (10xxxxxx) per 6-bit group, most significant group first
        shift -= 6;
        result += "%" + byte2hex(((input >> shift) | 0x80) & 0xBF);
    }
    return llUnescapeURL(result);
}
 
string byte2hex(integer x)//LSLEditor Safe, LSO Safe
{//Helper for the UTF-8 functions: formats the low 8 bits of x as two hex digits taken from hexc.
    integer low = x & 0xF;//lower nibble
    integer high = (x >> 4) & 0xF;//upper nibble
    return llGetSubString(hexc, high, high) + llGetSubString(hexc, low, low);
}
 
string hexc="0123456789ABCDEF";//hex digit lookup string, indexed by nibble value (used by byte2hex)
 
//} Combined Library