Difference between revisions of "Talk:LlUnescapeURL"
Omei Qunhua (talk | contribs) m (→ASCII7) |
(→ASCII7: It was foosball or seafood. http://nedbatchelder.com/text/hexwords.html) |
||
Line 14: | Line 14: | ||
:It does seem a bit wordy. These functions (both directions) only use UTF-8 byte encoding. That means you can't use UCS-2 (UTF-16) byte pairs or UTF-32 byte quads, or to that matter any other encoding scheme (UTF-EBCDIC, CESU-8). So llEscapeURL will always encode characters via their byte representation in UTF-8 and llUnescapeURL will always decode bytes as if they were UTF-8 (that is all it's trying to say). While this doesn't rule out the possibility of the "%u####" syntax, LL never implemented it. I'm thinking I should maybe write a spec, notes or deepnotes section for this one, as it's easier to describe how it could work then it is to describe what it does (convert input string to byte array and {{Wikipedia|Type_punning|type pun}} the array into a utf-8 backed string, anything that doesn't pun correctly gets converted to a "?"). -- '''[[User:Strife_Onizuka|Strife]]''' <sup><small>([[User talk:Strife_Onizuka|talk]]|[[Special:Contributions/Strife_Onizuka|contribs]])</small></sup> 15:59, 26 January 2014 (PST) | :It does seem a bit wordy. These functions (both directions) only use UTF-8 byte encoding. That means you can't use UCS-2 (UTF-16) byte pairs or UTF-32 byte quads, or to that matter any other encoding scheme (UTF-EBCDIC, CESU-8). So llEscapeURL will always encode characters via their byte representation in UTF-8 and llUnescapeURL will always decode bytes as if they were UTF-8 (that is all it's trying to say). While this doesn't rule out the possibility of the "%u####" syntax, LL never implemented it. I'm thinking I should maybe write a spec, notes or deepnotes section for this one, as it's easier to describe how it could work then it is to describe what it does (convert input string to byte array and {{Wikipedia|Type_punning|type pun}} the array into a utf-8 backed string, anything that doesn't pun correctly gets converted to a "?"). 
-- '''[[User:Strife_Onizuka|Strife]]''' <sup><small>([[User talk:Strife_Onizuka|talk]]|[[Special:Contributions/Strife_Onizuka|contribs]])</small></sup> 15:59, 26 January 2014 (PST) | ||
::Well if I wasn't already confused, I am now! :) [[User:Omei Qunhua|Omei Qunhua]] 01:55, 27 January 2014 (PST) | ::Well if I wasn't already confused, I am now! :) [[User:Omei Qunhua|Omei Qunhua]] 01:55, 27 January 2014 (PST) | ||
Let's see if this helps (meant to be read, I've not had time to compile it, I must get up in the morning ~_~): | |||
<lsl>string UnescapeURL(string str) {
    // Illustrative sketch of a percent-decoder: literal characters are copied
    // through, runs of %XX escapes are collected as byte values and handed to
    // stackFlush() for decoding. Returns the decoded string.
    string result;  // decoded text accumulated so far
    list pending;   // byte values from consecutive %XX escapes, not yet decoded
    // Walk the string with negative indexes: start at -length, finish when the
    // cursor reaches 0 (one past the last character, index -1).
    integer cursor = -llStringLength(str);
    while (cursor != 0) {
        string ch = llGetSubString(str, cursor, cursor);
        if (ch == "%") {
            if (cursor == -1) {
                // "%" is the final character: truncated escape, emit one "?" and stop.
                pending += 0x3f;
                cursor = 0;
            } else if (cursor == -2) {
                // "%" plus a single trailing character: truncated escape, emit "??" and stop.
                pending += [0x3f, 0x3f];
                cursor = 0;
            } else {
                // Sandwich the two candidate digits between "0xF" and "5BA11".
                // If both are hex digits the cast yields 0xFhh5BA11 and every
                // bit of the mask survives; otherwise parsing stops early and
                // the mask comparison fails.
                string digits = llGetSubString(str, cursor + 1, cursor + 2);
                integer probe = (integer)("0xF" + digits + "5BA11");
                integer value = 0x3f; // "?" for a malformed escape (may not be the correct thing to do)
                if ((probe & 0xF005BA11) == 0xF005BA11)
                    value = (probe >> 20) & 0xFF; // the two hex digits sit in bits 20..27
                pending += value;
                cursor += 3;
            }
        } else {
            // A literal character ends the current run of escapes: decode the
            // collected bytes first, then append the character itself.
            result += stackFlush(pending);
            pending = [];
            result += ch;
            ++cursor;
        }
    }
    return result + stackFlush(pending);
}
string stackFlush(list stack) {
    // Decodes a list of byte values (integers 0..255) as UTF-8 and returns the
    // resulting string. Bytes that do not form a valid sequence become "?".
    // Relies on UnicodeIntegerToUTF8(integer), which is not defined in this
    // snippet -- presumably a helper that turns a code point into a string.
    string out;
    integer g; //bytes found so far for the current character (lead byte included)
    integer h; //extracted code point bits
    integer i = ~llGetListLength(stack);//position: -(length + 1), so ++i walks -length .. -1
    integer k; //bytes per character (0 while not inside a multi-byte sequence)
    while( ++i ) {//flush the stack.
        integer j = llList2Integer(stack, i);
        if(k > 0) {
            if((0xC0 & j) == 0x80) { //10xxxxxx: a continuation byte
                h = (h << 6) | (j & 0x3F);
                if(k == ++g) { //should verify that h is the smallest encoding possible, not going to do it.
                    out += UnicodeIntegerToUTF8(h);
                    g = k = 0; //sequence complete; the next byte starts a new character
                }
            } else {
                //Broken sequence: one "?" per byte already consumed. This must
                //happen before g is cleared, otherwise the end index is 0.
                out += llGetSubString(":??????", 1, g);
                g = k = 0;
                --i; //re-examine this byte as a possible lead or ASCII byte instead of dropping it
            }
        } else {
            //Classify the byte by where it lands among the upper bounds of each
            //UTF-8 byte class: 0 = ASCII, 1 = stray continuation byte,
            //2..6 = lead byte of a sequence that long, 7 = 0xFE/0xFF (invalid).
            k = llListFindList(llListSort([j, 0x7F, 0xBF, 0xDF, 0xEF, 0xF7, 0xFB, 0xFD], 1, TRUE), [j]);
            if(!k)
                out += UnicodeIntegerToUTF8(j);
            else if((k > 1) && (k < 7)) {
                h = j & (0x7F >> k); //keep only the payload bits of the lead byte
                g = 1;
            } else {
                g = k = 0;
                out += "?";
            }
        }
    }
    //NOTE(review): an unfinished sequence at the end emits one "?" per MISSING
    //byte (k - g), whereas the mid-stream path above emits one per consumed
    //byte -- confirm which behaviour is intended.
    if(k != g)//flush the flush corruption buffers
        out += llGetSubString(":??????", g + 1, k);
    return out;
}</lsl>
-- '''[[User:Strife_Onizuka|Strife]]''' <sup><small>([[User talk:Strife_Onizuka|talk]]|[[Special:Contributions/Strife_Onizuka|contribs]])</small></sup> 20:02, 28 January 2014 (PST) |
Revision as of 20:02, 28 January 2014
url description and url breakdown
I'm not keen on the url parameter description. It doesn't have to be a valid URL, it doesn't even have to be a url at all. I'm also not keen on the sample url, it's just not obvious how it's applicable to the article. -- Strife (talk|contribs) 22:03, 25 January 2014 (PST)
- I see the point about the description. The sample url was copied from llGetHTTPHeader to llEscapeURL and llUnescapeURL because there was some text about the URL path and query string in llEscapeURL and I thought a reference wouldn't do any harm for those that don't know what it is...yet. -- Kireji Haiku (talk|contribs) 07:12, 26 January 2014 (PST)
ASCII7
While looking at the above, could the following be better explained?
"The hexadecimal encoded representation of UTF-8 byte encoding is the only supported means of access to non ASCII7 characters (Unicode characters)."
The word 'encoding' seems to be superfluous. What is the scope of this 'access'? In relation to this function only? Or in LSL in general? Does it perhaps mean that in order to use non ASCII7 characters in any string in LSL you need to write the string using a specific hex representation of those characters and then process the string via llUnescapeURL()? And should there be an example (because by hex representation here I believe we mean %hh and not 0xhhhh)? Omei Qunhua 04:37, 26 January 2014 (PST)
- It does seem a bit wordy. These functions (both directions) only use UTF-8 byte encoding. That means you can't use UCS-2 (UTF-16) byte pairs or UTF-32 byte quads, or for that matter any other encoding scheme (UTF-EBCDIC, CESU-8). So llEscapeURL will always encode characters via their byte representation in UTF-8 and llUnescapeURL will always decode bytes as if they were UTF-8 (that is all it's trying to say). While this doesn't rule out the possibility of the "%u####" syntax, LL never implemented it. I'm thinking I should maybe write a spec, notes or deepnotes section for this one, as it's easier to describe how it could work than it is to describe what it does (convert input string to byte array and type pun the array into a utf-8 backed string, anything that doesn't pun correctly gets converted to a "?"). -- Strife (talk|contribs) 15:59, 26 January 2014 (PST)
- Well if I wasn't already confused, I am now! :) Omei Qunhua 01:55, 27 January 2014 (PST)
Let's see if this helps (meant to be read, I've not had time to compile it, I must get up in the morning ~_~):
<lsl>string UnescapeURL(string str) {
    // Illustrative sketch of a percent-decoder: literal characters are copied
    // through, runs of %XX escapes are collected as byte values and handed to
    // stackFlush() for decoding. Returns the decoded string.
    string result;  // decoded text accumulated so far
    list pending;   // byte values from consecutive %XX escapes, not yet decoded
    // Walk the string with negative indexes: start at -length, finish when the
    // cursor reaches 0 (one past the last character, index -1).
    integer cursor = -llStringLength(str);
    while (cursor != 0) {
        string ch = llGetSubString(str, cursor, cursor);
        if (ch == "%") {
            if (cursor == -1) {
                // "%" is the final character: truncated escape, emit one "?" and stop.
                pending += 0x3f;
                cursor = 0;
            } else if (cursor == -2) {
                // "%" plus a single trailing character: truncated escape, emit "??" and stop.
                pending += [0x3f, 0x3f];
                cursor = 0;
            } else {
                // Sandwich the two candidate digits between "0xF" and "5BA11".
                // If both are hex digits the cast yields 0xFhh5BA11 and every
                // bit of the mask survives; otherwise parsing stops early and
                // the mask comparison fails.
                string digits = llGetSubString(str, cursor + 1, cursor + 2);
                integer probe = (integer)("0xF" + digits + "5BA11");
                integer value = 0x3f; // "?" for a malformed escape (may not be the correct thing to do)
                if ((probe & 0xF005BA11) == 0xF005BA11)
                    value = (probe >> 20) & 0xFF; // the two hex digits sit in bits 20..27
                pending += value;
                cursor += 3;
            }
        } else {
            // A literal character ends the current run of escapes: decode the
            // collected bytes first, then append the character itself.
            result += stackFlush(pending);
            pending = [];
            result += ch;
            ++cursor;
        }
    }
    return result + stackFlush(pending);
}
string stackFlush(list stack) {
    // Decodes a list of byte values (integers 0..255) as UTF-8 and returns the
    // resulting string. Bytes that do not form a valid sequence become "?".
    // Relies on UnicodeIntegerToUTF8(integer), which is not defined in this
    // snippet -- presumably a helper that turns a code point into a string.
    string out;
    integer g; //bytes found so far for the current character (lead byte included)
    integer h; //extracted code point bits
    integer i = ~llGetListLength(stack);//position: -(length + 1), so ++i walks -length .. -1
    integer k; //bytes per character (0 while not inside a multi-byte sequence)
    while( ++i ) {//flush the stack.
        integer j = llList2Integer(stack, i);
        if(k > 0) {
            if((0xC0 & j) == 0x80) { //10xxxxxx: a continuation byte
                h = (h << 6) | (j & 0x3F);
                if(k == ++g) { //should verify that h is the smallest encoding possible, not going to do it.
                    out += UnicodeIntegerToUTF8(h);
                    g = k = 0; //sequence complete; the next byte starts a new character
                }
            } else {
                //Broken sequence: one "?" per byte already consumed. This must
                //happen before g is cleared, otherwise the end index is 0.
                out += llGetSubString(":??????", 1, g);
                g = k = 0;
                --i; //re-examine this byte as a possible lead or ASCII byte instead of dropping it
            }
        } else {
            //Classify the byte by where it lands among the upper bounds of each
            //UTF-8 byte class: 0 = ASCII, 1 = stray continuation byte,
            //2..6 = lead byte of a sequence that long, 7 = 0xFE/0xFF (invalid).
            k = llListFindList(llListSort([j, 0x7F, 0xBF, 0xDF, 0xEF, 0xF7, 0xFB, 0xFD], 1, TRUE), [j]);
            if(!k)
                out += UnicodeIntegerToUTF8(j);
            else if((k > 1) && (k < 7)) {
                h = j & (0x7F >> k); //keep only the payload bits of the lead byte
                g = 1;
            } else {
                g = k = 0;
                out += "?";
            }
        }
    }
    //NOTE(review): an unfinished sequence at the end emits one "?" per MISSING
    //byte (k - g), whereas the mid-stream path above emits one per consumed
    //byte -- confirm which behaviour is intended.
    if(k != g)//flush the flush corruption buffers
        out += llGetSubString(":??????", g + 1, k);
    return out;
}</lsl> -- Strife (talk|contribs) 20:02, 28 January 2014 (PST)