OCAML:如何解码Unicode-escape字符串



给定str如下:

(* Sample input: the text contains literal backslash-u escapes, so each
   backslash is doubled in the OCaml literal.  A bare \u not followed by a
   brace is an illegal escape in an OCaml string (warning 14). *)
let str = "#include \\u003Cunordered_map\\u003E\\u000D\\u000A"

如何在OCaml中将Unicode-escape字符串解码为Unicode字符串(或者就我的情况而言,解码为ASCII字符串)?

在Python中,我可以轻松地做

str.decode("unicode-escape")

如果如您所说,字符串中嵌入的转义序列总是编码ASCII字符,那么您可以找到这些序列,并将其替换为解码后的等效字符:

(** [decode s] returns [s] with every escape made of a backslash, the letter
    u and exactly four hexadecimal digits replaced by the single byte whose
    code is that hexadecimal value.  All other text is copied unchanged.

    Only code points that fit in one byte can be represented.

    @raise Invalid_argument if an escape denotes a value above 0xFF
    (raised by [Char.chr]). *)
let decode s =
    (* Str spells a literal backslash as two backslashes, which takes four
       inside an OCaml string literal.  With only two, Str reads the pair
       backslash-u as a plain letter u, the delimiter comes out one
       character short and the [String.sub] below fails. *)
    let re = Str.regexp "\\\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]" in
    let s1 n = String.make 1 (Char.chr n) in
    let subst = function
    (* The delimiter is backslash, u, then four hex digits: skip the first
       two characters and parse the remaining four as hexadecimal. *)
    | Str.Delim u -> s1 (int_of_string ("0x" ^ String.sub u 2 4))
    | Str.Text t -> t
    in
    String.concat "" (List.map subst (Str.full_split re s))

这适用于您的示例:

val decode : string -> string = <fun>
# decode "#include \\u003Cunordered_map\\u003E\\u000D\\u000A";;
- : string = "#include <unordered_map>\r\n"

的确,Python具有内置的支持来解码这些序列。

update

要通过转换为UTF-8来支持所有四位十六进制转义序列"\uXXXX",您可以使用此代码:

(** [utf8encode s] parses [s] as a hexadecimal code point (written without
    a 0x prefix) and returns its UTF-8 encoding as a string of one to four
    bytes.  Surrogate code points are not rejected; they are encoded like
    any other three-byte value.

    @raise Failure if [s] is not a valid hexadecimal number.
    @raise Invalid_argument if the code point is outside 0 .. 0x10FFFF. *)
let utf8encode s =
    let cp = int_of_string ("0x" ^ s) in
    if cp < 0 || cp > 0x10FFFF then
        invalid_arg "utf8encode: code point out of range";
    (* Marker bits of the lead byte, indexed by the number of continuation
       bytes that follow it (0 to 3). *)
    let prefs = [| 0x0; 0xc0; 0xe0; 0xf0 |] in
    let s1 n = String.make 1 (Char.chr n) in
    (* [k] continuation bytes have been emitted into [sofar] so far and
       [resid] holds the bits that still have to be encoded. *)
    let rec ienc k sofar resid =
        (* Payload bits available in a lead byte that is followed by [k]
           continuation bytes: 7, 5, 4, 3 for k = 0, 1, 2, 3. *)
        let bct = if k = 0 then 7 else 6 - k in
        if resid < 1 lsl bct then
            (s1 (prefs.(k) + resid)) ^ sofar
        else
            (* Peel off the low six bits as a continuation byte. *)
            ienc (k + 1) (s1 (0x80 + resid mod 64) ^ sofar) (resid / 64)
    in
    ienc 0 "" cp
(** [decode2 s] returns [s] with every escape made of a backslash, the
    letter u and exactly four hexadecimal digits replaced by the UTF-8
    encoding of that code point, as produced by [utf8encode].  All other
    text is copied unchanged.

    Each escape is converted on its own; two consecutive escapes forming a
    surrogate pair are not combined into one code point. *)
let decode2 s =
    (* Str spells a literal backslash as two backslashes, which takes four
       inside an OCaml string literal.  With only two, Str reads the pair
       backslash-u as a plain letter u and the delimiter comes out one
       character short. *)
    let re = Str.regexp "\\\\u[0-9a-fA-F][0-9a-fA-F][0-9a-fA-F][0-9a-fA-F]" in
    let subst = function
    (* Drop the leading backslash and u; pass the four hex digits on. *)
    | Str.Delim u -> utf8encode (String.sub u 2 4)
    | Str.Text t -> t
    in
    String.concat "" (List.map subst (Str.full_split re s))

它也适用于您的示例,还有其他一些示例:

val utf8encode : string -> string = <fun>
val decode2 : string -> string = <fun>
# decode2 "#include \\u003Cunordered_map\\u003E\\u000D\\u000A";;
- : string = "#include <unordered_map>\r\n"
# print_endline (decode2 "\u00A2");;
¢
- : unit = ()
# print_endline (decode2 "\u20AC");;
€
- : unit = ()

最新更新