Difference between revisions of "UTF-8"

From Second Life Wiki
Jump to navigation Jump to search
m
Line 1: Line 1:
{{LSL Header}}{{LSLC|User-Defined_Functions}}
{{LSL Header}}{{LSLC|User-Defined_Functions}}
SL uses UTF-8 for storing and transmitting strings and with these functions you can work with Unicode characters. See: [[Unicode In 5 Minutes]] for a brief introduction to Unicode.
Second Life uses {{Wikipedia|UTF-8|UTF-8}} for storing and transmitting [[String|strings]] and with these functions you can work with {{Wikipedia|Unicode|Unicode}} characters. See: [[Unicode In 5 Minutes]] for a brief introduction to {{Wikipedia|Unicode|Unicode}}.




Line 6: Line 6:


== Standard ==
== Standard ==
This version of UnicodeIntegerToUTF8 complies to the latest standard. LSO on the other hand complies to an earlier standard. The newer standard includes only a subset of the older standard. The extended range of the old standard went unused so this incompleteness is moot.
 
This version of UnicodeIntegerToUTF8 complies to the latest standard. [[LSO]] on the other hand complies to an earlier standard. The newer standard includes only a subset of the older standard. The extended range of the old standard went unused so this incompleteness is moot.


<lsl>string UnicodeIntegerToUTF8(integer input)//Mono Safe, LSLEditor Safe, LSO Incomplete
<lsl>string UnicodeIntegerToUTF8(integer input)//Mono Safe, LSLEditor Safe, LSO Incomplete
Line 34: Line 35:


== General Use==
== General Use==
This version will work fine in LSO and Mono but not in LSLEditor.
 
This version will work fine in [[LSO]] and [[Mono]] but not in LSLEditor.
 
<lsl>//===================================================//
<lsl>//===================================================//
//                Combined Library                  //
//                Combined Library                  //
Line 79: Line 82:


==LSLEditor Safe==
==LSLEditor Safe==
This version will work in Mono, LSO & LSLEditor. There will be a slight performance hit in LSO as compared to the LSLEditor Unsafe version.
 
This version will work in [[Mono]], [[LSO]] & LSLEditor. There will be a slight performance hit in [[LSO]] as compared to the LSLEditor Unsafe version.
 
<lsl>//===================================================//
<lsl>//===================================================//
//                Combined Library                  //
//                Combined Library                  //

Revision as of 11:17, 27 January 2014

Second Life uses UTF-8 for storing and transmitting strings, and with these functions you can work with Unicode characters. See Unicode In 5 Minutes for a brief introduction to Unicode.


These functions are part of the Combined Library written by Strife Onizuka.

Standard

This version of UnicodeIntegerToUTF8 complies with the latest standard, which limits code points to U+10FFFF. LSO, on the other hand, complies with an earlier standard that allowed a larger range. The newer standard covers only a subset of the older one, but the extended range of the old standard went unused, so this incompleteness is moot.

<lsl>// Encodes one Unicode code point as a UTF-8 string.
// input:   the code point; values from 0x1 through 0x10FFFF are encoded.
// returns: the encoded character, or "" when input is zero, negative or above 0x10FFFF.
string UnicodeIntegerToUTF8(integer input)//Mono Safe, LSLEditor Safe, LSO Incomplete
{//LSO allows for the older UTF-8 range, this function only supports the new UTF-16 range.
    if(input > 0)
    {
        if(input <= 0x7FF)
        {//instead of a flat if else chain, this redistributes the fork load so that only the 4 byte characters result in 3 forks, all the other paths are 2 forks.
            if(input <= 0x7F){
                // 1 byte: the code point itself becomes the most significant byte.
                input = input << 24;
                jump quick_return;//saves us from the implicit double jump that using an else would cause.
            }
            // 2 bytes: 110xxxxx 10xxxxxx packed into the two most significant bytes.
            input = 0xC0800000 | ((input << 18) & 0x1F000000) | ((input << 16) & 0x3F0000);
        }
        else if(input <= 0xFFFF)// 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
            input = 0xE0808000 | ((input << 12) & 0x0F000000) | ((input << 10) & 0x3F0000) | ((input << 8) & 0x3F00);
        else if(input <= 0x10FFFF)// 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            input = 0xF0808080 | ((input << 06) & 0x07000000) | ((input << 04) & 0x3F0000) | ((input << 2) & 0x3F00) | (input & 0x3F);
        else
            jump error;//not in our range
        @quick_return;
        // input now holds the encoded bytes, lead byte in the top 8 bits;
        // the base64 round trip turns those bytes into a string.
        return llBase64ToString(llIntegerToBase64(input));
    }
    @error;
    return "";
}</lsl>

General Use

This version will work fine in LSO and Mono but not in LSLEditor.

<lsl>//===================================================// // Combined Library // // "Feb 4 2008", "08:35:00" // // Copyright (C) 2004-2008, Strife Onizuka (cc-by) // // http://creativecommons.org/licenses/by/3.0/ // //===================================================// //{

// Decodes the first character of a string into its Unicode code point.
// input:   string to read; only its first character is used.
// returns: the code point as an integer.
// NOTE: the multibyte expression reassigns `result` and `input` inside its own
// operands, so it only yields the right value when the operands are evaluated
// last-to-first (right to left). Presumably this is why it is marked LSLEditor Unsafe.
integer UTF8ToUnicodeInteger(string input)//LSLEditor Unsafe, LSO Safe
{
    // Leading UTF-8 bytes of the first character as an integer, lead byte in the top 8 bits.
    integer result = llBase64ToInteger(llStringToBase64(input = llGetSubString(input,0,0)));
    if(result & 0x80000000)//multibyte, continuing to use base64 is impractical because it requires smart shifting.
        // Evaluated from the bottom operand upwards:
        //  - the final mask is built from the lead byte while `result` still holds it;
        //    the count of leading 1 bits selects how many payload bits to keep.
        //  - `input` becomes the hex digits of the escaped character, `result` the bytes before the last four.
        //  - `result` is then replaced with the last four bytes, whose 6-bit payloads are packed together.
        return  (   (  0x0000003f &  result       ) |
                    (( 0x00003f00 &  result) >> 2 ) | 
                    (( 0x003f0000 &  result) >> 4 ) | 
                    (( 0x3f000000 & (result = (integer)("0x"+llGetSubString(input,-8,-1)))) >> 6 ) | 
                    (( 0x0000003f &  result) << 24) | 
                    (( 0x00000100 & (result = (integer)("0x"+llDeleteSubString(input = (string)llParseString2List(llEscapeURL(input),(list)"%",[]),-8,-1)))) << 22)
                ) & (  0x7FFFFFFF >> (5 * ((integer)(llLog(~result) / 0.69314718055994530941723212145818) - 25)));

// (( 0x00000100 & (result = (integer)("0x"+llDeleteSubString(input,-8,-1)))) << 22) // ) & ( 0x7FFFFFFF >> (30 - (5 * (llStringLength(input = (string)llParseString2List(llEscapeURL(input),(list)"%",[])) >> 1))));

    // Single byte character: the code point is the top byte.
    return result >> 24;
}

// Encodes one Unicode code point as a UTF-8 string by building a %XX escape
// sequence and unescaping it.
// input:   the code point to encode.
// returns: the string produced by llUnescapeURL for the generated escape sequence.
// NOTE: the lead byte expression reassigns `bytes` inside its last operand and
// reads the new value in the operands to its left, so it depends on right-to-left
// operand evaluation. Presumably this is why it is marked LSLEditor Unsafe.
string UnicodeIntegerToUTF8(integer input)//LSLEditor Unsafe, LSO Safe
{
    // Rounded-up base 2 logarithm of input (the constant is ln 2).
    integer bytes = llCeil((llLog(input) / 0.69314718055994530941723212145818));
    // The embedded assignment turns `bytes` into the number of continuation bytes
    // (0 when input is below 0x80). 0x3F80 >> bytes supplies the lead byte's
    // length prefix in its low 8 bits; byte2hex only uses the low 8 bits.
    string result = "%" + byte2hex((input >> (6 * bytes)) | ((0x3F80 >> bytes) << !(bytes = ((input >= 0x80) * (bytes + ~(((1 << bytes) - input) > 0)) / 5))));
    while (bytes)
        // ~-bytes is bytes - 1; each pass emits one 10xxxxxx continuation byte.
        result += "%" + byte2hex((((input >> (6 * (bytes = ~-bytes))) | 0x80) & 0xBF));
    return llUnescapeURL(result);
}

// Formats the low 8 bits of x as two uppercase hexadecimal digits.
// x:       value to format; bits above the low byte are ignored.
// returns: a two character string, high nibble first.
string byte2hex(integer x)//LSLEditor Safe, LSO Safe
{//Helper function for use with unicode characters.
    integer y = (x >> 4) & 0xF;// high nibble
    return llGetSubString(hexc, y, y) + llGetSubString(hexc, x & 0xF, x & 0xF);
}//This function would benefit greatly from the DUP opcode, it would remove 19 bytes.

// Hexadecimal digit lookup table, indexed by nibble value; read by byte2hex.
string hexc="0123456789ABCDEF";

//} Combined Library</lsl>

LSLEditor Safe

This version will work in Mono, LSO & LSLEditor. There will be a slight performance hit in LSO as compared to the LSLEditor Unsafe version.

<lsl>//===================================================// // Combined Library // // "Feb 4 2008", "08:38:13" // // Copyright (C) 2004-2008, Strife Onizuka (cc-by) // // http://creativecommons.org/licenses/by/3.0/ // //===================================================// //{

// Decodes the first character of a string into its Unicode code point.
// input:   string to read; only its first character is used.
// returns: the code point as an integer.
integer UTF8ToUnicodeInteger(string input)//LSLEditor Safe, LSO Safe
{
    // Leading UTF-8 bytes of the first character as an integer, lead byte in the top 8 bits.
    integer result = llBase64ToInteger(llStringToBase64(input = llGetSubString(input,0,0)));
    if(result & 0x80000000){//multibyte, continuing to use base64 is impractical because it requires smart shifting.
        // `input` becomes the hex digits of the URL-escaped character with the % signs removed.
        // end   = the last four bytes of the sequence,
        // begin = whatever bytes come before those.
        integer end = (integer)("0x"+llGetSubString(input = (string)llParseString2List(llEscapeURL(input),(list)"%",[]),-8,-1));
        integer begin = (integer)("0x"+llDeleteSubString(input,-8,-1));
        // Pack the 6-bit payload of each byte together, then mask off the lead
        // byte's length prefix: the number of leading 1 bits in the lead byte
        // (found through log2 of ~result) selects how many payload bits to keep.
        return  (   (  0x0000003f &  end       ) |
                    (( 0x00003f00 &  end) >> 2 ) | 
                    (( 0x003f0000 &  end) >> 4 ) | 
                    (( 0x3f000000 &  end) >> 6 ) |
                    (( 0x0000003f &  begin) << 24) |
                    (( 0x00000100 &  begin) << 22)
                ) & (0x7FFFFFFF >> (5 * ((integer)(llLog(~result) / 0.69314718055994530941723212145818) - 25)));
    }
    // Single byte character: the code point is the top byte.
    return result >> 24;
}

// Encodes one Unicode code point as a UTF-8 string by building a %XX escape
// sequence and unescaping it.
// input:   the code point to encode.
// returns: the string produced by llUnescapeURL for the generated escape sequence.
string UnicodeIntegerToUTF8(integer input)//LSLEditor Safe, LSO Safe
{
    // Rounded-up base 2 logarithm of input (the constant is ln 2).
    integer bytes = llCeil((llLog(input) / 0.69314718055994530941723212145818));
    // Convert that into the number of continuation bytes; 0 when input is below 0x80.
    bytes = (input >= 0x80) * (bytes + ~(((1 << bytes) - input) > 0)) / 5;//adjust
    // Lead byte: the top payload bits plus the length prefix taken from the low
    // 8 bits of 0x3F80 >> bytes. With bytes == 0 the extra shift moves the prefix
    // out of the low byte entirely; byte2hex only uses the low 8 bits.
    string result = "%" + byte2hex((input >> (6 * bytes)) | ((0x3F80 >> bytes) << !bytes));
    while (bytes)
        // ~-bytes is bytes - 1; each pass emits one 10xxxxxx continuation byte.
        result += "%" + byte2hex((((input >> (6 * (bytes = ~-bytes))) | 0x80) & 0xBF));
    return llUnescapeURL(result);
}

// Formats the low 8 bits of x as two uppercase hexadecimal digits.
// x:       value to format; bits above the low byte are ignored.
// returns: a two character string, high nibble first.
string byte2hex(integer x)//LSLEditor Safe, LSO Safe
{//Helper function for use with unicode characters.
    integer y = (x >> 4) & 0xF;// high nibble
    return llGetSubString(hexc, y, y) + llGetSubString(hexc, x & 0xF, x & 0xF);
}//This function would benefit greatly from the DUP opcode, it would remove 19 bytes.

// Hexadecimal digit lookup table, indexed by nibble value; read by byte2hex.
string hexc="0123456789ABCDEF";

//} Combined Library</lsl>