将包含 UTF8 字符串的变量转换为包含 latin1 字符串的变量 - 用于 Javascript 中的浏览器



Bash 替代(shell 设置为 UTF8):

输入:

in.json

$ file -I in.json
in.json: text/plain; charset=utf-8

{"it-it":"Città"}

我需要用 JS 实现的等效 Bash 命令:

$ iconv -f utf8 -t latin1 in.json > out.json

out.json

$ file -I out.json
out.json: text/plain; charset=iso-8859-1

{"it-it":"Citt?"}

JavaScript 在浏览器中通过 <input type="file"> 以 base64 方式读取时看到的 in.json(尽管内容类型和脚本编码都设置为 utf8):

{"it-it":"Città"}

Javascript 在浏览器中看到的 out.json :

{"it-it":"Città"}

问题:在大多数现代浏览器中,如何用尽可能原生的 JavaScript 把这个 UTF-8 字符串

({"it-it":"Città "} 为 latin1 形式,{"it-it":"Città"} 为 utf8 形式)

转换为 Latin-1 字符串?

我更喜欢原生解决方案,最坏的情况下可以用 jQuery;请尽量不要用 npm + Node 依赖地狱来解决它。

PS:我只需要支持最现代的浏览器,这是针对管理员专用页面的。

下面我创建了一个包含 CittÃ 的 iso-8859-1 编码字节的数组,然后使用 TextDecoder 对其进行解码。

因此,如果您可以获得 JSON 的二进制版本,这应该能够为您转换。

// Byte values of the Latin-1 text "CittÃ" (195 = 0xC3 = "Ã").
var latinSource = Uint8Array.of(67, 105, 116, 116, 195);
// Decoder obtained through the "iso-8859-1" label; it turns the raw bytes
// back into a JavaScript string.
var tc = new TextDecoder("iso-8859-1");
console.log(tc.decode(latinSource));

对我来说,`new TextDecoder("iso-8859-1")` 不起作用……

1.

// Every possible byte value, 0 through 255, in ascending order.
var latinSource = Uint8Array.from({ length: 256 }, function (_unused, byteValue) {
    return byteValue;
});
var tc = new TextDecoder("iso-8859-1");
// The author observes that the decoded result is a windows-1252 string.
console.log(tc.decode(latinSource));

从结果可以看出,它不是 Latin-1 字符串,因为其中包含字符"€"。

2.

// Inspect the decoder object created for the "iso-8859-1" label.
// In the recorded output below, its `encoding` property reads "windows-1252".
console.log('new TextDecoder("iso-8859-1")', new TextDecoder("iso-8859-1"));
// Recorded output ---->
//new TextDecoder("iso-8859-1") {
//  "encoding": "windows-1252",
//  "fatal": false,
//  "ignoreBOM": false,
//  "decode": function decode() { [native code] }
//}

  3. 可用的 Latin-1 编码/解码方法:

/**
 * Turns a Latin-1 (ISO-8859-1) string into its bytes.
 *
 * Each UTF-16 code unit of the input becomes one element of the result.
 * Code units above 255 do not fit in a byte and are reduced modulo 256 by
 * the Uint8Array element conversion.
 *
 * @param {string} iso_8859_1 - String whose code units are the byte values.
 * @returns {Uint8Array} One byte per code unit of the input.
 */
function Latin1ToUint8Array(iso_8859_1){
    return Uint8Array.from(
        { length: iso_8859_1.length },
        (_unused, index) => iso_8859_1.charCodeAt(index)
    );
}
/**
 * Builds a Latin-1 (ISO-8859-1) string from raw bytes: every byte becomes the
 * character with the same code (0..255).
 *
 * @param {Uint8Array} Uint8Arr - Bytes to convert.
 * @returns {string} String with one character per input byte.
 */
function Uint8ToLatin1Str(Uint8Arr){
    return Array.from(Uint8Arr, (byteValue) => String.fromCharCode(byteValue)).join('');
}
// Feed the full 0..255 byte range through Uint8ToLatin1Str and print the result.
var latinSource = Uint8Array.from({ length: 256 }, function (_unused, byteValue) {
    return byteValue;
});
console.log( Uint8ToLatin1Str(latinSource) ); //valid latin1-string (iso-8859-1)

  4. 最后,Windows-1252 转换:

/**
 * Converts between a windows-1252 string and its byte representation.
 *
 * - Given a string, encodes it: returns a Uint8Array with one byte per UTF-16
 *   code unit. Code units 0..255 are stored as-is; the 27 windows-1252
 *   characters that live above U+00FF (e.g. "€") are mapped to their byte in
 *   the 0x80..0x9F range; any other code unit is stored as 0.
 * - Given a Uint8Array, decodes it: returns a string in which the 27 mapped
 *   bytes become their windows-1252 characters and every other byte becomes
 *   the character with the same code.
 * - Given anything else, returns undefined.
 *
 * @param {string|Uint8Array} cp1252 - String to encode, or bytes to decode.
 * @returns {Uint8Array|string|undefined} Bytes for a string input, a string
 *   for a Uint8Array input, undefined for any other input.
 */
function Windows1252EncodeDecode(
    cp1252  //string (to encode into bytes), or Uint8Array (to decode into string)
){
    // [windows-1252 byte, Unicode code unit] for every byte in 0x80..0x9F whose
    // character differs from the byte value. Numeric code points are used
    // instead of literal characters so the table does not depend on the
    // encoding in which this source file itself is saved.
    const byteAndCodeUnitPairs = [
        [128, 8364], // €
        [130, 8218], // ‚
        [131, 402],  // ƒ
        [132, 8222], // „
        [133, 8230], // …
        [134, 8224], // †
        [135, 8225], // ‡
        [136, 710],  // ˆ
        [137, 8240], // ‰
        [138, 352],  // Š
        [139, 8249], // ‹
        [140, 338],  // Œ
        [142, 381],  // Ž
        [145, 8216], // ‘
        [146, 8217], // ’
        [147, 8220], // “
        [148, 8221], // ”
        [149, 8226], // •
        [150, 8211], // –
        [151, 8212], // —
        [152, 732],  // ˜
        [153, 8482], // ™
        [154, 353],  // š
        [155, 8250], // ›
        [156, 339],  // œ
        [158, 382],  // ž
        [159, 376],  // Ÿ
    ];

    // Build both lookup directions once per call, so that neither loop below
    // has to scan the table for every element.
    const codeUnitToByte = new Map();
    const byteToChar = new Map();
    for(const [byteValue, codeUnit] of byteAndCodeUnitPairs){
        codeUnitToByte.set(codeUnit, byteValue);
        byteToChar.set(byteValue, String.fromCharCode(codeUnit));
    }

    if(typeof cp1252 === 'string'){ // string -> bytes
        const resultUint8 = new Uint8Array(cp1252.length);
        for(let i = 0; i < cp1252.length; i++){
            const codeUnit = cp1252.charCodeAt(i);
            if(codeUnit > 255){
                // Outside the single-byte range: use the table, or 0 when the
                // character has no windows-1252 byte.
                const mappedByte = codeUnitToByte.get(codeUnit);
                resultUint8[i] = (mappedByte === undefined) ? 0 : mappedByte;
            }else{
                resultUint8[i] = codeUnit;
            }
        }
        return resultUint8; // Uint8Array
    }

    if(cp1252 instanceof Uint8Array){ // bytes -> string
        let resultString = "";
        for(const byteValue of cp1252){
            const mappedChar = byteToChar.get(byteValue);
            resultString += (mappedChar === undefined) ? String.fromCharCode(byteValue) : mappedChar;
        }
        return resultString; // string
    }
    // Any other input type falls through and yields undefined.
}
// Round-trip check: decode all 256 byte values with TextDecoder, encode the
// result back with Windows1252EncodeDecode, decode again and compare.
var latinSource = Uint8Array.from({ length: 256 }, function (_unused, byteValue) {
    return byteValue;
});
var windows1252 = new TextDecoder("iso-8859-1").decode(latinSource); // windows-1252 string on output
console.log('new TextDecoder("iso-8859-1").decode(latinSource)', windows1252);

var bytesBack = Windows1252EncodeDecode(windows1252);
console.log('bytesBack', bytesBack.toString());

var Windows1252StringBack = Windows1252EncodeDecode(bytesBack);
console.log('string back', Windows1252StringBack);

// Logs true when the hand-written decoder agrees with TextDecoder.
console.log('Compare with TextDecoder', Windows1252StringBack === windows1252);

  5. 对第 3 点中 Latin-1 函数的修改:

/**
 * Reports whether a string can be stored as ISO-8859-1 (Latin-1), i.e. whether
 * every UTF-16 code unit is in the range U+0000..U+00FF.
 *
 * @param {string} str - String to inspect.
 * @returns {boolean} true when no code unit exceeds U+00FF (also for "").
 */
function isLatin1String(str){
    // The backslashes matter: without them the class is made of the literal
    // characters 'u', '0', 'F' and the range '0'..'u', which rejects spaces,
    // accented Latin-1 letters and most punctuation.
    return (str.match(/[^\u0000-\u00FF]/) === null);
}
/**
 * Converts a string to bytes.
 *
 * When isLatin1String(str) is true, each code unit is stored directly as one
 * byte. Otherwise the string is encoded as UTF-8 with TextEncoder.
 *
 * @param {string} str - String to convert.
 * @returns {Uint8Array} The resulting bytes.
 */
function StringToUint8Array(str){
    if(isLatin1String(str)){
        // Single-byte path: code unit N becomes byte N.
        return Uint8Array.from(
            { length: str.length },
            (_unused, position) => str.charCodeAt(position)
        );
    }
    // At least one character is outside the single-byte range: encode as UTF-8.
    return new TextEncoder("utf-8").encode(str);
}
/**
 * Maps every byte of the input to the character with the same code and
 * returns the concatenation (a Latin-1 style "byte string").
 *
 * @param {Uint8Array} Uint8Arr - Bytes to convert.
 * @returns {string} String with one character per input byte.
 */
function Uint8ToStr(Uint8Arr){
    const pieces = [];
    let position = 0;
    while(position < Uint8Arr.length){
        pieces.push(String.fromCharCode(Uint8Arr[position]));
        position += 1;
    }
    return pieces.join('');
}
/**
 * Converts the string to bytes with StringToUint8Array and decodes those
 * bytes as UTF-8.
 *
 * @param {string} latin1str - String whose characters stand for UTF-8 bytes.
 * @returns {string} The text obtained by decoding the bytes as UTF-8.
 */
function latin1ToUtf8(latin1str){
  const utf8Decoder = new TextDecoder("utf-8");
  const rawBytes = StringToUint8Array(latin1str);
  return utf8Decoder.decode(rawBytes);
}
// Demo calls; each log label repeats the expression being evaluated.
// Latin-1 input: one byte per character.
console.log('StringToUint8Array("CittÃ")', StringToUint8Array("CittÃ")); //Latin1
// Input containing "€" (outside Latin-1): encoded as UTF-8 bytes.
console.log('StringToUint8Array("Città€")', StringToUint8Array("Città€")); //utf-8
console.log('Uint8ToStr(StringToUint8Array("CittÃ"))', Uint8ToStr(StringToUint8Array("CittÃ"))); //latin1
// Label corrected to match the argument actually passed ("Città€").
console.log('Uint8ToStr(StringToUint8Array("Città€"))', Uint8ToStr(StringToUint8Array("Città€"))); //utf-8
console.log('latin1ToUtf8(Uint8ToStr(StringToUint8Array("Città€")))', latin1ToUtf8(Uint8ToStr(StringToUint8Array("Città€")))); //utf-8

最新更新