URL编/解码的ECMAScript语言实现

1.1、URL编码

虽然ECMAScript提供了encodeURI和encodeURIComponent两个函数用于URL编码，但这两个函数都把!、~、'、(、)这5个字符视为安全字符而不进行编码，并且会把空格编码为%20而不是+，这与java.net.URLEncoder的编码结果不一致。因此，在需要与java.net.URLEncoder保持一致时，建议采用下面自行实现的URL编码。

URL编码的ECMAScript语言实现如下：

// ECMAScript strings are sequences of UTF-16 code units.
// For every character we first recover its Unicode code point (combining a
// surrogate pair when one is present) and then emit that code point's UTF-8
// bytes.
//
// Returns a "binary string": one character per UTF-8 byte, each with a char
// code in 0x00-0xFF.
//
// Notes:
//   - A valid surrogate pair becomes a single four-byte sequence.
//   - An unpaired surrogate is emitted as a three-byte sequence, like any
//     other code unit in 0x0800-0xFFFF.
//   - U+0000 is emitted as the single byte 0x00 rather than being dropped.
//   - Unicode ends at U+10FFFF, so five- and six-byte forms never occur.
function codePoint2UTF8(str) {
    var strLength = str.length;
    var utf8Bytes = [];
    for (var i = 0; i < strLength; i++) {
        var code = str.charCodeAt(i);

        // A high surrogate immediately followed by a low surrogate encodes one
        // supplementary code point (U+10000 - U+10FFFF); consume both units.
        if (code >= 0xD800 && code <= 0xDBFF && i + 1 < strLength) {
            var low = str.charCodeAt(i + 1);
            if (low >= 0xDC00 && low <= 0xDFFF) {
                code = 0x10000 + ((code - 0xD800) << 10) + (low - 0xDC00);
                i++;
            }
        }

        if (code <= 0x007F) {
            // One byte
            // U+0000 - U+007F     0xxxxxxx
            utf8Bytes.push(String.fromCharCode(code));
        } else if (code <= 0x07FF) {
            // Two bytes
            // U+0080 - U+07FF     110xxxxx 10xxxxxx
            utf8Bytes.push(
                String.fromCharCode(0xC0 | ((code >> 6) & 0x1F)),
                String.fromCharCode(0x80 | (code & 0x3F))
            );
        } else if (code <= 0xFFFF) {
            // Three bytes
            // U+0800 - U+FFFF     1110xxxx 10xxxxxx 10xxxxxx
            utf8Bytes.push(
                String.fromCharCode(0xE0 | ((code >> 12) & 0x0F)),
                String.fromCharCode(0x80 | ((code >> 6) & 0x3F)),
                String.fromCharCode(0x80 | (code & 0x3F))
            );
        } else {
            // Four bytes
            // U+10000 - U+10FFFF  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            utf8Bytes.push(
                String.fromCharCode(0xF0 | ((code >> 18) & 0x07)),
                String.fromCharCode(0x80 | ((code >> 12) & 0x3F)),
                String.fromCharCode(0x80 | ((code >> 6) & 0x3F)),
                String.fromCharCode(0x80 | (code & 0x3F))
            );
        }
    }
    return utf8Bytes.join('');
}

// Lookup table mapping a 4-bit value (0-15) to its uppercase hexadecimal digit.
var base16EncodeTable = '0123456789ABCDEF'.split('');

// URL-encodes str: the text is first turned into UTF-8 bytes via
// codePoint2UTF8, then every byte is written out as follows:
//   - '0'-'9' (48-57), 'A'-'Z' (65-90), 'a'-'z' (97-122): copied unchanged
//   - '-' (45), '_' (95), '*' (42), '.' (46):              copied unchanged
//   - ' ' (32):                                            written as '+'
//   - anything else:                                       '%' + two uppercase hex digits
function url_encode(str) {
    var bytes = codePoint2UTF8(str);
    var pieces = [];
    for (var pos = 0, total = bytes.length; pos < total; pos++) {
        var b = bytes.charCodeAt(pos);

        var isAlphanumeric = (b >= 48 && b <= 57)
            || (b >= 65 && b <= 90)
            || (b >= 97 && b <= 122);
        if (isAlphanumeric) {
            pieces.push(bytes.charAt(pos));
            continue;
        }

        switch (b) {
            case 45: // '-'
            case 95: // '_'
            case 42: // '*'
            case 46: // '.'
                pieces.push(bytes.charAt(pos));
                break;
            case 32: // ' '
                pieces.push('+');
                break;
            default:
                // High nibble (b >> 4) and low nibble (b & 0x0F) each index
                // one hex digit in the table.
                pieces.push('%', base16EncodeTable[b >> 4], base16EncodeTable[b & 0x0F]);
        }
    }
    return pieces.join('');
}

1.2、URL解码

URL解码的ECMAScript语言实现如下：

// Decodes a "binary string" (one character per UTF-8 byte, char codes
// 0x00-0xFF) into a regular UTF-16 ECMAScript string.
//
// Handling of the lead byte:
//   0xxxxxxx                              -> one-byte sequence
//   110xxxxx 10xxxxxx                     -> two-byte sequence
//   1110xxxx 10xxxxxx 10xxxxxx            -> three-byte sequence
//   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx   -> four-byte sequence, emitted as a
//                                            surrogate pair
//
// Decoding is lenient:
//   - A code unit above 0xFF cannot be a byte and is copied through as is.
//   - Stray continuation bytes, five-/six-byte lead bytes, and four-byte
//     sequences above U+10FFFF are ignored.
//   - Continuation bytes missing at the end of the input contribute zero bits
//     (charCodeAt yields NaN there, and NaN & 0x3F is 0).
function ascii2UTF16(str) {
    var strLength = str.length;
    var utf16Chars = [];
    var code2, code3, code4, codePoint, offset;
    for (var i = 0; i < strLength; i++) {
        var ascii = str.charCodeAt(i);
        if (ascii > 0xFF) {
            // Not a byte value: already a decoded character, keep it.
            utf16Chars.push(str.charAt(i));
        } else if ((ascii >> 7) === 0x0) {
            // One byte
            // 0xxxxxxx
            utf16Chars.push(str.charAt(i));
        } else if ((ascii >> 5) === 0x6) {
            // Two bytes
            // 110xxxxx 10xxxxxx
            code2 = str.charCodeAt(++i);
            codePoint = ((ascii & 0x1F) << 6) | (code2 & 0x3F);
            utf16Chars.push(String.fromCharCode(codePoint));
        } else if ((ascii >> 4) === 0xE) {
            // Three bytes
            // 1110xxxx 10xxxxxx 10xxxxxx
            code2 = str.charCodeAt(++i);
            code3 = str.charCodeAt(++i);
            codePoint = ((ascii & 0x0F) << 12)
                | ((code2 & 0x3F) << 6)
                | (code3 & 0x3F);
            utf16Chars.push(String.fromCharCode(codePoint));
        } else if ((ascii >> 3) === 0x1E) {
            // Four bytes
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            code2 = str.charCodeAt(++i);
            code3 = str.charCodeAt(++i);
            code4 = str.charCodeAt(++i);
            codePoint = ((ascii & 0x07) << 18)
                | ((code2 & 0x3F) << 12)
                | ((code3 & 0x3F) << 6)
                | (code4 & 0x3F);
            if (codePoint >= 0x10000 && codePoint <= 0x10FFFF) {
                // Supplementary code point: split into high/low surrogates.
                offset = codePoint - 0x10000;
                utf16Chars.push(String.fromCharCode(
                    0xD800 + (offset >> 10),
                    0xDC00 + (offset & 0x3FF)
                ));
            } else if (codePoint < 0x10000) {
                // Overlong form; accepted leniently like the shorter branches.
                utf16Chars.push(String.fromCharCode(codePoint));
            }
            // Values above U+10FFFF are not Unicode and are ignored.
        }
        // Anything else (continuation byte without a lead byte, or a
        // five-/six-byte lead byte) is not valid UTF-8 and is ignored.
    }
    return utf16Chars.join('');
}

// Converts the character code of one hexadecimal digit to its numeric value.
// Relevant ASCII codes (see `man ascii`):
//   '0' => 48   '9' => 57
//   'a' => 97   'f' => 102
//   'A' => 65   'F' => 70
// Any code outside those three ranges yields 0.
function hex2dec(asciiCode) {
    var isDecimalDigit = asciiCode >= 48 && asciiCode <= 57;
    if (isDecimalDigit) {
        return asciiCode - 48;
    }

    var isLowerHexLetter = asciiCode >= 97 && asciiCode <= 102;
    if (isLowerHexLetter) {
        return asciiCode - 97 + 10;
    }

    var isUpperHexLetter = asciiCode >= 65 && asciiCode <= 70;
    if (isUpperHexLetter) {
        return asciiCode - 65 + 10;
    }

    return 0;
}

// URL-decodes str.
//   1. '%XY' escapes are turned back into the byte they name, and '+' is
//      turned back into a space; every other character is copied unchanged.
//   2. The resulting byte string is handed to ascii2UTF16, which interprets
//      it as UTF-8 and returns the decoded text.
// Hex digits are converted with hex2dec, so a malformed or truncated escape
// does not throw; its bad digits take whatever value hex2dec returns for them.
function url_decode(str) {
    var strLength = str.length;
    var asciiChars = [];
    for (var i = 0; i < strLength; i++) {
        var c = str.charCodeAt(i);
        if (c === 37) {
            // '%' (37): the next two characters are the hex digits of one byte.
            var x = str.charCodeAt(++i);
            var y = str.charCodeAt(++i);
            var ascii = hex2dec(x) * 16 + hex2dec(y);
            asciiChars.push(String.fromCharCode(ascii));
        } else if (c === 43) {
            // '+' (43) stands for a space. Push the character itself: pushing
            // the number 32 would be joined into the output as the text "32".
            asciiChars.push(' ');
        } else {
            asciiChars.push(str.charAt(i));
        }
    }
    return ascii2UTF16(asciiChars.join(''));
}