如何仅使用原版 Lua 5.3,让 os.remove 和 os.rename 支持包含 Unicode 字符的文件名?
-- Question snippet: rename a file whose name contains Greek letters.
-- Inside a quoted Lua string the path separator must be written as "\\";
-- a lone backslash in front of a non-escape character is rejected by the
-- parser as an invalid escape sequence.
local filename = "C:\\τέστ.txt"
os.rename(filename, filename .. "1")
这不执行任何操作。
我也试过这个,但仍然不起作用:
-- Second attempt from the question: rebuild the file name as a chain of
-- backslash + decimal code point ("\67\58\92\964...").
-- The backslashes are doubled so that the string literals are terminated
-- properly and the snippet compiles.
-- NOTE: decimal escapes are interpreted by the Lua parser only inside string
-- literals in source code; a string assembled at run time keeps the literal
-- backslashes and digits, so this approach cannot yield the intended name.
local filename = "C:\\τέστ.txt"
local t = {}
for _, c in utf8.codes(filename) do
  t[#t + 1] = c
end
filename = "\\" .. table.concat(t, "\\")
os.rename(filename, filename .. "1")
有什么想法吗?提前非常感谢您的帮助!:)
正如其他人所指出的,使用原版 Lua 能做的事情不多,因为它调用的是 CreateFileA,而不是该函数的 Unicode 版本(CreateFileW)。如果可以加载外部模块,则可以使用 winapi,因为它支持获取"短"文件名:
-- Optional dependency: `require` goes through pcall, so when the winapi
-- module cannot be loaded the whole block is skipped instead of raising.
local ok, winapi = pcall(require, "winapi")
if ok then
  -- presumably switches winapi's string encoding to UTF-8 (verify against
  -- the winapi documentation)
  winapi.set_encoding(winapi.CP_UTF8)
  local short_name = winapi.short_path(filepath)
  if filepath ~= short_name then
    -- have the short path
  end
end
此代码应该适用于所有平台(在不需要这种转换的 macOS 和 Linux 上,winapi 会加载失败,这段代码会被直接跳过)。如果短文件名不可用,转换仍可能失败;短文件名可以在 Windows 中使用 fsutil 8dot3name set DRIVE: 0 命令进行配置(按驱动器设置)。
如果对源文件名和目标文件名都进行这种转换(并删除目标文件,因为它可能是由 short_path 调用创建的),则重命名将起作用。
正如lhf指出的那样,你的代码在MacOS上运行良好。
您所需要的只是对Windows的更正。
下面的代码是用纯 Lua 编写的;它重新定义了标准的 os/io 函数,以便它们在 Windows 中使用 UTF-8 文件名。
请注意,您的 Windows 区域设置必须是希腊语,并且您的所有文件名必须仅包含 Windows 希腊语代码页中的符号。 在纯Lua的Windows上,您无法打开名称中包含任意UTF-8符号的文件。
if (os.getenv"os" or ""):match"^Windows" then
-- Lookup table: Unicode code point -> single-byte value in Windows code page
-- 1253 (Greek). Only non-ASCII characters are listed and every value lies in
-- 0x80..0xFE. Some byte values (e.g. 0xAA, 0xD2, 0xFF) have no entry.
local map_unicode_to_1253 = {
-- values 0x80..0x9F: punctuation and symbols
[0x20AC] = 0x80,
[0x201A] = 0x82,
[0x0192] = 0x83,
[0x201E] = 0x84,
[0x2026] = 0x85,
[0x2020] = 0x86,
[0x2021] = 0x87,
[0x2030] = 0x89,
[0x2039] = 0x8B,
[0x2018] = 0x91,
[0x2019] = 0x92,
[0x201C] = 0x93,
[0x201D] = 0x94,
[0x2022] = 0x95,
[0x2013] = 0x96,
[0x2014] = 0x97,
[0x2122] = 0x99,
[0x203A] = 0x9B,
-- values 0xA0..0xBF: Latin-1 symbols mixed with Greek tonos forms
[0x00A0] = 0xA0,
[0x0385] = 0xA1,
[0x0386] = 0xA2,
[0x00A3] = 0xA3,
[0x00A4] = 0xA4,
[0x00A5] = 0xA5,
[0x00A6] = 0xA6,
[0x00A7] = 0xA7,
[0x00A8] = 0xA8,
[0x00A9] = 0xA9,
[0x00AB] = 0xAB,
[0x00AC] = 0xAC,
[0x00AD] = 0xAD,
[0x00AE] = 0xAE,
[0x2015] = 0xAF,
[0x00B0] = 0xB0,
[0x00B1] = 0xB1,
[0x00B2] = 0xB2,
[0x00B3] = 0xB3,
[0x0384] = 0xB4,
[0x00B5] = 0xB5,
[0x00B6] = 0xB6,
[0x00B7] = 0xB7,
[0x0388] = 0xB8,
[0x0389] = 0xB9,
[0x038A] = 0xBA,
[0x00BB] = 0xBB,
[0x038C] = 0xBC,
[0x00BD] = 0xBD,
[0x038E] = 0xBE,
[0x038F] = 0xBF,
-- values 0xC0..0xFE: Greek letters (U+0390..U+03CE)
[0x0390] = 0xC0,
[0x0391] = 0xC1,
[0x0392] = 0xC2,
[0x0393] = 0xC3,
[0x0394] = 0xC4,
[0x0395] = 0xC5,
[0x0396] = 0xC6,
[0x0397] = 0xC7,
[0x0398] = 0xC8,
[0x0399] = 0xC9,
[0x039A] = 0xCA,
[0x039B] = 0xCB,
[0x039C] = 0xCC,
[0x039D] = 0xCD,
[0x039E] = 0xCE,
[0x039F] = 0xCF,
[0x03A0] = 0xD0,
[0x03A1] = 0xD1,
[0x03A3] = 0xD3,
[0x03A4] = 0xD4,
[0x03A5] = 0xD5,
[0x03A6] = 0xD6,
[0x03A7] = 0xD7,
[0x03A8] = 0xD8,
[0x03A9] = 0xD9,
[0x03AA] = 0xDA,
[0x03AB] = 0xDB,
[0x03AC] = 0xDC,
[0x03AD] = 0xDD,
[0x03AE] = 0xDE,
[0x03AF] = 0xDF,
[0x03B0] = 0xE0,
[0x03B1] = 0xE1,
[0x03B2] = 0xE2,
[0x03B3] = 0xE3,
[0x03B4] = 0xE4,
[0x03B5] = 0xE5,
[0x03B6] = 0xE6,
[0x03B7] = 0xE7,
[0x03B8] = 0xE8,
[0x03B9] = 0xE9,
[0x03BA] = 0xEA,
[0x03BB] = 0xEB,
[0x03BC] = 0xEC,
[0x03BD] = 0xED,
[0x03BE] = 0xEE,
[0x03BF] = 0xEF,
[0x03C0] = 0xF0,
[0x03C1] = 0xF1,
[0x03C2] = 0xF2,
[0x03C3] = 0xF3,
[0x03C4] = 0xF4,
[0x03C5] = 0xF5,
[0x03C6] = 0xF6,
[0x03C7] = 0xF7,
[0x03C8] = 0xF8,
[0x03C9] = 0xF9,
[0x03CA] = 0xFA,
[0x03CB] = 0xFB,
[0x03CC] = 0xFC,
[0x03CD] = 0xFD,
[0x03CE] = 0xFE,
}
local char, byte, table_insert, table_concat = string.char, string.byte, table.insert, table.concat
--- Decode the UTF-8 sequence that starts at a given byte offset.
-- A lead byte in 0xC0..0xFD is combined with the continuation bytes
-- (0x80..0xBF) that follow it. When an expected continuation byte is missing
-- or out of range, the lead byte is returned on its own with a length of 1.
-- @param utf8str string to read from
-- @param pos starting byte position inside utf8str (default 1)
-- @return code point (or the raw byte value), number of bytes consumed
local function utf8_to_unicode(utf8str, pos)
  local start = pos or 1
  local lead = byte(utf8str, start)
  local codepoint, length = lead, 1
  if lead >= 0xC0 and lead < 0xFE then
    -- `limit` is the exclusive upper bound for a sequence of the current
    -- length; the loop keeps consuming bytes until the accumulator fits.
    local limit = 64
    codepoint = lead - 128
    repeat
      local cont = byte(utf8str, start + length) or 0
      if cont < 0x80 or cont >= 0xC0 then
        -- malformed sequence: fall back to the single lead byte
        codepoint, length = lead, 1
      else
        -- strip the marker bit (limit) and the 0x80 tag of the continuation
        -- byte (2 * 64), then append its six payload bits
        codepoint = (codepoint - limit - 2) * 64 + cont
        length = length + 1
      end
      limit = limit * 32
    until codepoint < limit
  end
  return codepoint, length
end
--- Re-encode a UTF-8 string as single-byte code page 1253 text.
-- ASCII code points are copied as they are; any other code point is looked
-- up in map_unicode_to_1253 and becomes '?' when the table has no entry.
-- @param utf8str UTF-8 encoded string
-- @return the converted string
local function utf8_to_1253(utf8str)
  local out, count = {}, 0
  local cursor, total = 1, #utf8str
  while cursor <= total do
    local cp, width = utf8_to_unicode(utf8str, cursor)
    cursor = cursor + width
    if cp >= 128 then
      cp = map_unicode_to_1253[cp] or byte('?')
    end
    count = count + 1
    out[count] = char(cp)
  end
  return table_concat(out)
end
-- Keep the stock os.rename and install a wrapper that converts both
-- UTF-8 paths to code page 1253 before delegating.
local stock_rename = os.rename
os.rename = function(old, new)
  local old_1253 = utf8_to_1253(old)
  local new_1253 = utf8_to_1253(new)
  return stock_rename(old_1253, new_1253)
end
-- Keep the stock os.remove and install a wrapper that converts the
-- UTF-8 path to code page 1253 before delegating.
local stock_remove = os.remove
os.remove = function(filename)
  local name_1253 = utf8_to_1253(filename)
  return stock_remove(name_1253)
end
-- Keep the stock os.execute and install a wrapper that converts the
-- UTF-8 command line to code page 1253 before delegating.
local stock_execute = os.execute
os.execute = function(command)
  -- A missing command is forwarded untouched.
  if not command then
    return stock_execute(command)
  end
  return stock_execute(utf8_to_1253(command))
end
-- Keep the stock io.open and install a wrapper that converts the UTF-8
-- path to code page 1253; remaining arguments are passed through unchanged.
local stock_open = io.open
io.open = function(filename, ...)
  local name_1253 = utf8_to_1253(filename)
  return stock_open(name_1253, ...)
end
-- Keep the stock io.popen and install a wrapper that converts the UTF-8
-- command to code page 1253; remaining arguments are passed through unchanged.
local stock_popen = io.popen
io.popen = function(prog, ...)
  local prog_1253 = utf8_to_1253(prog)
  return stock_popen(prog_1253, ...)
end
-- Keep the stock io.lines and install a wrapper that converts the UTF-8
-- path to code page 1253 when a file name is given.
local stock_lines = io.lines
io.lines = function(filename, ...)
  -- No file name: forward the call untouched.
  if not filename then
    return stock_lines(filename, ...)
  end
  return stock_lines(utf8_to_1253(filename), ...)
end
end
更新:
如何确定 Windows 代码页:
--- Read the Windows ANSI code page from the registry.
-- Runs `reg query` on HKLM\SYSTEM\CurrentControlSet\Control\Nls\CodePage and
-- extracts the ACP value from the command output.
-- @return the code page as a string (e.g. "1253"), or nil when the ACP value
--   is not present in the output
local function get_windows_ansi_codepage()
  -- Long-bracket string: backslashes are literal, so the key path needs no
  -- escaping (the separators are required for reg.exe to find the key).
  local pipe = assert(io.popen[[reg query HKLM\SYSTEM\CurrentControlSet\Control\Nls\CodePage /v ACP]])
  local output = pipe:read"*a"
  -- Close the pipe before parsing so it is released on every path.
  pipe:close()
  local codepage = output:match"%sACP%s+REG_SZ%s+(.-)%s*$"
  return codepage -- returns string "1253"
end
在标准实现中,os.rename 调用 C 函数 rename,该函数在 Windows 上又会转到 CreateFileA。此函数只能处理 ANSI 字符串,并在内部将字符串从 ANSI 转换为 Unicode。
这将使用区域设置指定的系统 ANSI 代码页。如果您的系统设置为希腊语,则它可能使用定义希腊字符的代码页 1253,但其他代码页中的字符不可用。
我不确定,但是如果系统代码页设置为 65001 (UTF-8),则可能允许您使用 UTF-8。还有一个名为 AppLocale 的应用程序,它只能为特定应用程序设置它。
如果可以使用外部库,则似乎有一些库可以在系统 API 调用中使用宽字符。
我还尝试在Windows中未修改的Lua 5.3中使用非ASCII文件名,但没有奏效。我认为它需要Lua的修改版本。我的理解是,Lua对文件名,命令和环境变量使用基本的C函数,但Windows使用UTF-16编码,并要求您对非ASCII文件名,命令和环境变量使用宽字符串(这意味着Windows上的UTF-16)函数。
我编译并尝试了Lua的修改版本,它可以很好地处理非ASCII文件名:lua-u8w。它使用处理文件等的各种函数的宽字符串版本,并从 UTF-8 转换为 UTF-16 并返回,以便您可以在 Lua 中使用 UTF-8,而 UTF-16 用于处理 Windows 操作系统。
Egor Skriptunoff的代码完全解决了我的问题。我稍微修改了他的代码,以便可以插入其他映射表,并根据语言环境使用正确的映射。
谢谢大家的帮助! :)
if (os.getenv"os" or ""):match"^Windows" then
local char, byte, table_insert, table_concat = string.char, string.byte, table.insert, table.concat
-- TABLES OF CODEPAGES
-- Lookup table for the Greek code page: keys are Unicode code points, values
-- are the corresponding single-byte codes (all in 0x80..0xFE). ASCII is not
-- listed. Each entry is annotated with the Unicode character name.
local cp1253 = { -- GREEK
[0x20AC] = 0x80, -- EURO SIGN
[0x201A] = 0x82, -- SINGLE LOW-9 QUOTATION MARK
[0x0192] = 0x83, -- LATIN SMALL LETTER F WITH HOOK
[0x201E] = 0x84, -- DOUBLE LOW-9 QUOTATION MARK
[0x2026] = 0x85, -- HORIZONTAL ELLIPSIS
[0x2020] = 0x86, -- DAGGER
[0x2021] = 0x87, -- DOUBLE DAGGER
[0x2030] = 0x89, -- PER MILLE SIGN
[0x2039] = 0x8B, -- SINGLE LEFT-POINTING ANGLE QUOTATION MARK
[0x2018] = 0x91, -- LEFT SINGLE QUOTATION MARK
[0x2019] = 0x92, -- RIGHT SINGLE QUOTATION MARK
[0x201C] = 0x93, -- LEFT DOUBLE QUOTATION MARK
[0x201D] = 0x94, -- RIGHT DOUBLE QUOTATION MARK
[0x2022] = 0x95, -- BULLET
[0x2013] = 0x96, -- EN DASH
[0x2014] = 0x97, -- EM DASH
[0x2122] = 0x99, -- TRADE MARK SIGN
[0x203A] = 0x9B, -- SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
[0x00A0] = 0xA0, -- NO-BREAK SPACE
[0x0385] = 0xA1, -- GREEK DIALYTIKA TONOS
[0x0386] = 0xA2, -- GREEK CAPITAL LETTER ALPHA WITH TONOS
[0x00A3] = 0xA3, -- POUND SIGN
[0x00A4] = 0xA4, -- CURRENCY SIGN
[0x00A5] = 0xA5, -- YEN SIGN
[0x00A6] = 0xA6, -- BROKEN BAR
[0x00A7] = 0xA7, -- SECTION SIGN
[0x00A8] = 0xA8, -- DIAERESIS
[0x00A9] = 0xA9, -- COPYRIGHT SIGN
[0x00AB] = 0xAB, -- LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
[0x00AC] = 0xAC, -- NOT SIGN
[0x00AD] = 0xAD, -- SOFT HYPHEN
[0x00AE] = 0xAE, -- REGISTERED SIGN
[0x2015] = 0xAF, -- HORIZONTAL BAR
[0x00B0] = 0xB0, -- DEGREE SIGN
[0x00B1] = 0xB1, -- PLUS-MINUS SIGN
[0x00B2] = 0xB2, -- SUPERSCRIPT TWO
[0x00B3] = 0xB3, -- SUPERSCRIPT THREE
[0x0384] = 0xB4, -- GREEK TONOS
[0x00B5] = 0xB5, -- MICRO SIGN
[0x00B6] = 0xB6, -- PILCROW SIGN
[0x00B7] = 0xB7, -- MIDDLE DOT
[0x0388] = 0xB8, -- GREEK CAPITAL LETTER EPSILON WITH TONOS
[0x0389] = 0xB9, -- GREEK CAPITAL LETTER ETA WITH TONOS
[0x038A] = 0xBA, -- GREEK CAPITAL LETTER IOTA WITH TONOS
[0x00BB] = 0xBB, -- RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
[0x038C] = 0xBC, -- GREEK CAPITAL LETTER OMICRON WITH TONOS
[0x00BD] = 0xBD, -- VULGAR FRACTION ONE HALF
[0x038E] = 0xBE, -- GREEK CAPITAL LETTER UPSILON WITH TONOS
[0x038F] = 0xBF, -- GREEK CAPITAL LETTER OMEGA WITH TONOS
[0x0390] = 0xC0, -- GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
[0x0391] = 0xC1, -- GREEK CAPITAL LETTER ALPHA
[0x0392] = 0xC2, -- GREEK CAPITAL LETTER BETA
[0x0393] = 0xC3, -- GREEK CAPITAL LETTER GAMMA
[0x0394] = 0xC4, -- GREEK CAPITAL LETTER DELTA
[0x0395] = 0xC5, -- GREEK CAPITAL LETTER EPSILON
[0x0396] = 0xC6, -- GREEK CAPITAL LETTER ZETA
[0x0397] = 0xC7, -- GREEK CAPITAL LETTER ETA
[0x0398] = 0xC8, -- GREEK CAPITAL LETTER THETA
[0x0399] = 0xC9, -- GREEK CAPITAL LETTER IOTA
[0x039A] = 0xCA, -- GREEK CAPITAL LETTER KAPPA
[0x039B] = 0xCB, -- GREEK CAPITAL LETTER LAMDA
[0x039C] = 0xCC, -- GREEK CAPITAL LETTER MU
[0x039D] = 0xCD, -- GREEK CAPITAL LETTER NU
[0x039E] = 0xCE, -- GREEK CAPITAL LETTER XI
[0x039F] = 0xCF, -- GREEK CAPITAL LETTER OMICRON
[0x03A0] = 0xD0, -- GREEK CAPITAL LETTER PI
[0x03A1] = 0xD1, -- GREEK CAPITAL LETTER RHO
[0x03A3] = 0xD3, -- GREEK CAPITAL LETTER SIGMA
[0x03A4] = 0xD4, -- GREEK CAPITAL LETTER TAU
[0x03A5] = 0xD5, -- GREEK CAPITAL LETTER UPSILON
[0x03A6] = 0xD6, -- GREEK CAPITAL LETTER PHI
[0x03A7] = 0xD7, -- GREEK CAPITAL LETTER CHI
[0x03A8] = 0xD8, -- GREEK CAPITAL LETTER PSI
[0x03A9] = 0xD9, -- GREEK CAPITAL LETTER OMEGA
[0x03AA] = 0xDA, -- GREEK CAPITAL LETTER IOTA WITH DIALYTIKA
[0x03AB] = 0xDB, -- GREEK CAPITAL LETTER UPSILON WITH DIALYTIKA
[0x03AC] = 0xDC, -- GREEK SMALL LETTER ALPHA WITH TONOS
[0x03AD] = 0xDD, -- GREEK SMALL LETTER EPSILON WITH TONOS
[0x03AE] = 0xDE, -- GREEK SMALL LETTER ETA WITH TONOS
[0x03AF] = 0xDF, -- GREEK SMALL LETTER IOTA WITH TONOS
[0x03B0] = 0xE0, -- GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
[0x03B1] = 0xE1, -- GREEK SMALL LETTER ALPHA
[0x03B2] = 0xE2, -- GREEK SMALL LETTER BETA
[0x03B3] = 0xE3, -- GREEK SMALL LETTER GAMMA
[0x03B4] = 0xE4, -- GREEK SMALL LETTER DELTA
[0x03B5] = 0xE5, -- GREEK SMALL LETTER EPSILON
[0x03B6] = 0xE6, -- GREEK SMALL LETTER ZETA
[0x03B7] = 0xE7, -- GREEK SMALL LETTER ETA
[0x03B8] = 0xE8, -- GREEK SMALL LETTER THETA
[0x03B9] = 0xE9, -- GREEK SMALL LETTER IOTA
[0x03BA] = 0xEA, -- GREEK SMALL LETTER KAPPA
[0x03BB] = 0xEB, -- GREEK SMALL LETTER LAMDA
[0x03BC] = 0xEC, -- GREEK SMALL LETTER MU
[0x03BD] = 0xED, -- GREEK SMALL LETTER NU
[0x03BE] = 0xEE, -- GREEK SMALL LETTER XI
[0x03BF] = 0xEF, -- GREEK SMALL LETTER OMICRON
[0x03C0] = 0xF0, -- GREEK SMALL LETTER PI
[0x03C1] = 0xF1, -- GREEK SMALL LETTER RHO
[0x03C2] = 0xF2, -- GREEK SMALL LETTER FINAL SIGMA
[0x03C3] = 0xF3, -- GREEK SMALL LETTER SIGMA
[0x03C4] = 0xF4, -- GREEK SMALL LETTER TAU
[0x03C5] = 0xF5, -- GREEK SMALL LETTER UPSILON
[0x03C6] = 0xF6, -- GREEK SMALL LETTER PHI
[0x03C7] = 0xF7, -- GREEK SMALL LETTER CHI
[0x03C8] = 0xF8, -- GREEK SMALL LETTER PSI
[0x03C9] = 0xF9, -- GREEK SMALL LETTER OMEGA
[0x03CA] = 0xFA, -- GREEK SMALL LETTER IOTA WITH DIALYTIKA
[0x03CB] = 0xFB, -- GREEK SMALL LETTER UPSILON WITH DIALYTIKA
[0x03CC] = 0xFC, -- GREEK SMALL LETTER OMICRON WITH TONOS
[0x03CD] = 0xFD, -- GREEK SMALL LETTER UPSILON WITH TONOS
[0x03CE] = 0xFE, -- GREEK SMALL LETTER OMEGA WITH TONOS
}
-- Pick the mapping table that matches the code page of the current locale.
-- The code page number is read from the trailing digits of the locale name
-- reported by os.setlocale(); `locale` is nil when there are none.
local locale_name = os.setlocale()
local locale = tonumber(string.match(locale_name, "(%d+)$"))
-- CODEPAGE stays nil when the locale matches none of the code pages below.
-- NOTE(review): cp1254 and cp1255 are not defined in the visible part of this
-- file; presumably they are meant to be added next to cp1253 — confirm.
local CODEPAGE = (locale == 1253 and cp1253) -- GREEK
  or (locale == 1254 and cp1254) -- TURKISH
  or (locale == 1255 and cp1255) -- HEBREW
  -- etc
  or nil
--- Decode the UTF-8 sequence that starts at a given byte offset.
-- A lead byte in 0xC0..0xFD is combined with the continuation bytes
-- (0x80..0xBF) that follow it. When an expected continuation byte is missing
-- or out of range, the lead byte is returned on its own with a length of 1.
-- @param utf8str string to read from
-- @param pos starting byte position inside utf8str (default 1)
-- @return code point (or the raw byte value), number of bytes consumed
local function utf8_to_unicode(utf8str, pos)
  local start = pos or 1
  local lead = byte(utf8str, start)
  local codepoint, length = lead, 1
  if lead >= 0xC0 and lead < 0xFE then
    -- `limit` is the exclusive upper bound for a sequence of the current
    -- length; the loop keeps consuming bytes until the accumulator fits.
    local limit = 64
    codepoint = lead - 128
    repeat
      local cont = byte(utf8str, start + length) or 0
      if cont < 0x80 or cont >= 0xC0 then
        -- malformed sequence: fall back to the single lead byte
        codepoint, length = lead, 1
      else
        -- strip the marker bit (limit) and the 0x80 tag of the continuation
        -- byte (2 * 64), then append its six payload bits
        codepoint = (codepoint - limit - 2) * 64 + cont
        length = length + 1
      end
      limit = limit * 32
    until codepoint < limit
  end
  return codepoint, length
end
--- Re-encode a UTF-8 string into the single-byte code page held in CODEPAGE.
-- ASCII code points are copied as they are; any other code point is looked
-- up in CODEPAGE and becomes '?' when the table has no entry for it.
-- When no mapping table has been selected (CODEPAGE is nil) the input is
-- returned unchanged, so the wrapped os/io functions behave like the stock
-- ones instead of raising on the first non-ASCII character.
-- @param utf8str UTF-8 encoded string
-- @return the converted string, or utf8str itself when CODEPAGE is nil
local function utf8_to_codepage(utf8str)
  if not CODEPAGE then
    return utf8str
  end
  local pos, result_codepage = 1, {}
  while pos <= #utf8str do
    local code, size = utf8_to_unicode(utf8str, pos)
    pos = pos + size
    if code >= 128 then
      code = CODEPAGE[code] or byte('?')
    end
    result_codepage[#result_codepage + 1] = char(code)
  end
  return table_concat(result_codepage)
end
-- Keep the stock os.rename and install a wrapper that converts both
-- UTF-8 paths to the selected code page before delegating.
local stock_rename = os.rename
os.rename = function(old, new)
  local old_ansi = utf8_to_codepage(old)
  local new_ansi = utf8_to_codepage(new)
  return stock_rename(old_ansi, new_ansi)
end
-- Keep the stock os.remove and install a wrapper that converts the
-- UTF-8 path to the selected code page before delegating.
local stock_remove = os.remove
os.remove = function(filename)
  local ansi_name = utf8_to_codepage(filename)
  return stock_remove(ansi_name)
end
-- Keep the stock os.execute and install a wrapper that converts the
-- UTF-8 command line to the selected code page before delegating.
local stock_execute = os.execute
os.execute = function(command)
  -- A missing command is forwarded untouched.
  if not command then
    return stock_execute(command)
  end
  return stock_execute(utf8_to_codepage(command))
end
-- Keep the stock io.open and install a wrapper that converts the UTF-8 path
-- to the selected code page; remaining arguments are passed through unchanged.
local stock_open = io.open
io.open = function(filename, ...)
  local ansi_name = utf8_to_codepage(filename)
  return stock_open(ansi_name, ...)
end
-- Keep the stock io.popen and install a wrapper that converts the UTF-8
-- command to the selected code page; remaining arguments are passed through.
local stock_popen = io.popen
io.popen = function(prog, ...)
  local ansi_prog = utf8_to_codepage(prog)
  return stock_popen(ansi_prog, ...)
end
-- Keep the stock io.lines and install a wrapper that converts the UTF-8
-- path to the selected code page when a file name is given.
local stock_lines = io.lines
io.lines = function(filename, ...)
  -- No file name: forward the call untouched.
  if not filename then
    return stock_lines(filename, ...)
  end
  return stock_lines(utf8_to_codepage(filename), ...)
end
end