javascript - Converting punycode with dash character to Unicode -
I need to convert the Punycode string niato-otabd to nñiñatoñ.
I found a text converter in JavaScript the other day, but its Punycode conversion doesn't work if there's a dash in the middle.
Any suggestions on how to fix the "dash" issue?
I took the time to create the Punycode converter below. It is based on the C code in RFC 3492. To use it with domain names you have to remove/add xn-- from/to the input/output to/from decode/encode.
The utf16 class is necessary to convert from JavaScript's internal character representation to Unicode and back.
There are also toascii and tounicode functions to make it easier to convert between a Punycode-encoded IDN and ASCII.
// JavaScript Punycode converter derived from the example in RFC 3492.
// This implementation was created by some@domain.name and released into the public domain.
//
// The function expression is deliberately named `Punycode` (capitalised): naming it
// `punycode` would shadow the outer variable inside the body and break any
// self-reference made through that name.
var punycode = new function Punycode() {
    // This object converts to and from Punycode as used in IDN.
    //
    // punycode.toascii(domain)
    //
    //     Returns the Punycode representation of "domain".
    //     Only the labels of the domain name that contain non-ASCII
    //     characters are converted, i.e. it does not matter if you
    //     call it with a domain that is already plain ASCII.
    //
    // punycode.tounicode(domain)
    //
    //     Converts a Punycode-encoded domain name to Unicode.
    //     Only the Punycode-encoded labels ("xn--" prefix) are converted,
    //     i.e. it does not matter if you call it on a string
    //     that has already been converted to Unicode.
    //
    // punycode.encode(input, preservecase) / punycode.decode(input, preservecase)
    //
    //     Raw RFC 3492 encoder/decoder for a single label, without the
    //     "xn--" prefix. Both throw RangeError on bad input or overflow.

    // Stable reference to the instance so that the methods keep working
    // even when they are detached from the object (e.g. passed as callbacks).
    var self = this;

    this.utf16 = {
        // The utf16 helpers convert between JavaScript's internal UTF-16
        // string representation and arrays of Unicode code points.

        // decode(input): string -> array of code points.
        // Throws RangeError on an unpaired or reversed surrogate.
        decode: function (input) {
            var output = [], i = 0, len = input.length, value, extra;
            while (i < len) {
                value = input.charCodeAt(i++);
                if ((value & 0xF800) === 0xD800) {
                    // Surrogate: must be a high surrogate followed by a low one.
                    // (At end of input charCodeAt yields NaN, which fails the check.)
                    extra = input.charCodeAt(i++);
                    if (((value & 0xFC00) !== 0xD800) || ((extra & 0xFC00) !== 0xDC00)) {
                        throw new RangeError("utf-16(decode): illegal utf-16 sequence");
                    }
                    value = ((value & 0x3FF) << 10) + (extra & 0x3FF) + 0x10000;
                }
                output.push(value);
            }
            return output;
        },

        // encode(input): array of code points -> string.
        // Throws RangeError if a code point lies in the surrogate range.
        encode: function (input) {
            var output = [], i = 0, len = input.length, value;
            while (i < len) {
                value = input[i++];
                // Explicit range test: masking with 0xF800 would also reject
                // valid astral code points such as U+1D800.
                if (value >= 0xD800 && value <= 0xDFFF) {
                    throw new RangeError("utf-16(encode): illegal utf-16 value");
                }
                if (value > 0xFFFF) {
                    value -= 0x10000;
                    output.push(String.fromCharCode(((value >>> 10) & 0x3FF) | 0xD800));
                    value = 0xDC00 | (value & 0x3FF);
                }
                output.push(String.fromCharCode(value));
            }
            return output.join("");
        }
    };

    // Default parameters (RFC 3492, section 5).
    var initial_n = 0x80;
    var initial_bias = 72;
    var delimiter = "\x2D";
    var base = 36;
    var damp = 700;
    var tmin = 1;
    var tmax = 26;
    var skew = 38;
    var maxint = 0x7FFFFFFF;

    // is_upper(cp) tells whether cp is an ASCII uppercase letter A..Z.
    function is_upper(cp) {
        return cp >= 65 && cp <= 90;
    }

    // to_upper_cp(cp) returns the uppercase form of a single code point, or
    // cp itself when uppercasing does not yield exactly one code point
    // (e.g. U+00DF, whose uppercase form is the two letters "SS").
    function to_upper_cp(cp) {
        var upper = self.utf16.decode(self.utf16.encode([cp]).toUpperCase());
        return upper.length === 1 ? upper[0] : cp;
    }

    // decode_digit(cp) returns the numeric value of a basic code
    // point (for use in representing integers) in the range 0 to
    // base-1, or base if cp does not represent a value.
    // The RFC's C code relies on unsigned wrap-around (cp - 48 < 10);
    // JavaScript arithmetic is signed, so the ranges are tested explicitly.
    function decode_digit(cp) {
        if (cp >= 48 && cp <= 57) { return cp - 22; }   // 0..9 -> 26..35
        if (cp >= 65 && cp <= 90) { return cp - 65; }   // A..Z -> 0..25
        if (cp >= 97 && cp <= 122) { return cp - 97; }  // a..z -> 0..25
        return base;
    }

    // encode_digit(d, flag) returns the basic code point whose value
    // (when used for representing integers) is d, which needs to be in
    // the range 0 to base-1. The lowercase form is used unless flag is
    // nonzero, in which case the uppercase form is used. The behavior
    // is undefined if flag is nonzero and digit d has no uppercase form.
    function encode_digit(d, flag) {
        //  0..25 map to ASCII a..z or A..Z
        // 26..35 map to ASCII 0..9
        return d + 22 + 75 * (d < 26 ? 1 : 0) - ((flag ? 1 : 0) << 5);
    }

    // ** Bias adaptation function (RFC 3492, section 6.1) **
    function adapt(delta, numpoints, firsttime) {
        var k;
        delta = firsttime ? Math.floor(delta / damp) : (delta >> 1);
        delta += Math.floor(delta / numpoints);

        for (k = 0; delta > (((base - tmin) * tmax) >> 1); k += base) {
            delta = Math.floor(delta / (base - tmin));
        }
        return Math.floor(k + (base - tmin + 1) * delta / (delta + skew));
    }

    // encode_basic(bcp, flag) forces a basic code point to lowercase if flag
    // is zero, uppercase if flag is nonzero, and returns the resulting code
    // point. The code point is unchanged if it is caseless.
    function encode_basic(bcp, flag) {
        if (bcp >= 97 && bcp <= 122) { bcp -= 32; }            // a..z -> A..Z
        if (!flag && bcp >= 65 && bcp <= 90) { bcp += 32; }    // A..Z -> a..z
        return bcp;
    }

    // ** Main decode function **
    // input:        Punycode string (one label, without "xn--").
    // preservecase: when truthy, an uppercase basic code point / final digit
    //               causes the corresponding output character to be uppercased.
    // Returns the decoded Unicode string; throws RangeError on invalid input.
    this.decode = function (input, preservecase) {
        // Works on UTF-16 code units directly: valid input is pure ASCII.
        var output = [];
        var case_flags = [];
        var input_length = input.length;

        var n, out, i, bias, basic, j, ic, oldi, w, k, digit, t, len, cp;

        // Initialize the state:
        n = initial_n;
        i = 0;
        bias = initial_bias;

        // Handle the basic code points: let basic be the number of input code
        // points before the last delimiter, or 0 if there is none, then
        // copy the first basic code points to the output.
        basic = input.lastIndexOf(delimiter);
        if (basic < 0) { basic = 0; }

        for (j = 0; j < basic; ++j) {
            cp = input.charCodeAt(j);
            if (preservecase) { case_flags[output.length] = is_upper(cp); }
            if (cp >= 0x80) {
                throw new RangeError("illegal input >= 0x80");
            }
            output.push(cp);
        }

        // Main decoding loop: start just after the last delimiter if any
        // basic code points were copied; start at the beginning otherwise.
        for (ic = basic > 0 ? basic + 1 : 0; ic < input_length; ) {
            // ic is the index of the next character to be consumed.

            // Decode a generalized variable-length integer into delta,
            // which gets added to i. The overflow checking is easier
            // if we increase i as we go, then subtract off its starting
            // value at the end to obtain delta.
            for (oldi = i, w = 1, k = base; ; k += base) {
                if (ic >= input_length) {
                    throw new RangeError("punycode_bad_input(1)");
                }
                digit = decode_digit(input.charCodeAt(ic++));

                if (digit >= base) {
                    throw new RangeError("punycode_bad_input(2)");
                }
                if (digit > Math.floor((maxint - i) / w)) {
                    throw new RangeError("punycode_overflow(1)");
                }
                i += digit * w;
                t = k <= bias ? tmin : k >= bias + tmax ? tmax : k - bias;
                if (digit < t) { break; }
                if (w > Math.floor(maxint / (base - t))) {
                    throw new RangeError("punycode_overflow(2)");
                }
                w *= (base - t);
            }

            out = output.length + 1;
            bias = adapt(i - oldi, out, oldi === 0);

            // i was supposed to wrap around from out to 0,
            // incrementing n each time, so we'll fix that now:
            if (Math.floor(i / out) > maxint - n) {
                throw new RangeError("punycode_overflow(3)");
            }
            n += Math.floor(i / out);
            i %= out;

            // Insert n at position i of the output.
            // The case of the last consumed character determines the uppercase flag.
            if (preservecase) {
                case_flags.splice(i, 0, is_upper(input.charCodeAt(ic - 1)));
            }
            output.splice(i, 0, n);
            i++;
        }

        if (preservecase) {
            for (i = 0, len = output.length; i < len; i++) {
                if (case_flags[i]) {
                    output[i] = to_upper_cp(output[i]);
                }
            }
        }
        return self.utf16.encode(output);
    };

    // ** Main encode function **
    // input:        Unicode string (one label).
    // preservecase: when truthy, the case of each input character is recorded
    //               in the case of the emitted basic code point / final digit.
    // Returns the Punycode string (without "xn--"); throws RangeError on overflow.
    this.encode = function (input, preservecase) {
        var n, delta, h, b, bias, j, m, q, k, t, ijv, case_flags;

        if (preservecase) {
            // Preserve case, step 1 of 2: get the code points of the unaltered string.
            case_flags = self.utf16.decode(input);
        }
        // Convert the lowercased input to an array of code points.
        input = self.utf16.decode(input.toLowerCase());

        var input_length = input.length; // Cache the length

        if (preservecase) {
            // Preserve case, step 2 of 2: turn the list into true/false flags
            // (true where lowercasing changed the code point).
            for (j = 0; j < input_length; j++) {
                case_flags[j] = input[j] != case_flags[j];
            }
        }

        var output = [];

        // Initialize the state:
        n = initial_n;
        delta = 0;
        bias = initial_bias;

        // Handle the basic code points:
        for (j = 0; j < input_length; ++j) {
            if (input[j] < 0x80) {
                output.push(
                    String.fromCharCode(
                        case_flags ? encode_basic(input[j], case_flags[j]) : input[j]
                    )
                );
            }
        }

        h = b = output.length;

        // h is the number of code points that have been handled, b is the
        // number of basic code points.
        if (b > 0) { output.push(delimiter); }

        // Main encoding loop:
        while (h < input_length) {
            // All non-basic code points < n have been
            // handled already. Find the next larger one:
            for (m = maxint, j = 0; j < input_length; ++j) {
                ijv = input[j];
                if (ijv >= n && ijv < m) { m = ijv; }
            }

            // Increase delta enough to advance the decoder's
            // <n,i> state to <m,0>, but guard against overflow:
            if (m - n > Math.floor((maxint - delta) / (h + 1))) {
                throw new RangeError("punycode_overflow (1)");
            }
            delta += (m - n) * (h + 1);
            n = m;

            for (j = 0; j < input_length; ++j) {
                ijv = input[j];

                if (ijv < n) {
                    // Overflow must be raised, not returned as a value.
                    if (++delta > maxint) { throw new RangeError("punycode_overflow(2)"); }
                }

                if (ijv == n) {
                    // Represent delta as a generalized variable-length integer:
                    for (q = delta, k = base; ; k += base) {
                        t = k <= bias ? tmin : k >= bias + tmax ? tmax : k - bias;
                        if (q < t) { break; }
                        output.push(String.fromCharCode(encode_digit(t + (q - t) % (base - t), 0)));
                        q = Math.floor((q - t) / (base - t));
                    }
                    output.push(String.fromCharCode(encode_digit(q, preservecase && case_flags[j] ? 1 : 0)));
                    bias = adapt(delta, h + 1, h == b);
                    delta = 0;
                    ++h;
                }
            }

            ++delta;
            ++n;
        }
        return output.join("");
    };

    // Converts every label of "domain" that contains a character outside
    // [A-Za-z0-9-] to its "xn--" Punycode form; other labels pass through unchanged.
    this.toascii = function (domain) {
        var domain_array = domain.split(".");
        var out = [];
        for (var i = 0; i < domain_array.length; ++i) {
            var s = domain_array[i];
            out.push(
                s.match(/[^A-Za-z0-9-]/) ?
                "xn--" + self.encode(s) :
                s
            );
        }
        return out.join(".");
    };

    // Converts every "xn--" label of "domain" back to Unicode;
    // other labels pass through unchanged.
    this.tounicode = function (domain) {
        var domain_array = domain.split(".");
        var out = [];
        for (var i = 0; i < domain_array.length; ++i) {
            var s = domain_array[i];
            out.push(
                s.match(/^xn--/) ?
                self.decode(s.slice(4)) :
                s
            );
        }
        return out.join(".");
    };
}();
Update regarding the licence:
From RFC 3492:
Disclaimer and license
Regarding this entire document or any portion of it (including the pseudocode and C code), the author makes no guarantees and is not responsible for any damage resulting from its use. The author grants irrevocable permission to anyone to use, modify, and distribute it in any way that does not diminish the rights of anyone else to use, modify, and distribute it, provided that redistributed derivative works do not contain misleading author or version information. Derivative works need not be licensed under similar terms.
I put my work on punycode and utf16 in the public domain. It would be nice to get an email telling me in which project you use it.
Comments
Post a Comment