在打开文件之前如何知道编码方案?



在 Java 中,当我尝试使用 BufferedReader 打开并读取文件时,收到一条错误消息,指出我使用了错误的编码。系统因此抛出了异常,我的程序无法读取该文件。在这种情况下,我如何知道该文件使用的是哪种编码?当然,如果文件是用"utf-8"写入的,就不可能用"euc-kr"编码来读取它。我的问题是:我想在打开文件之前获取字符集信息,以便为该文件选择正确的编码方案。有人能帮我吗?

这是我的代码

package lecture06;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Scanner;
/**
 * Command-line tool that asks for a root path and a search pattern, scans
 * every file below the root line by line (decoded as UTF-8) and records each
 * occurrence of the pattern in {@code Index.txt} in the working directory.
 *
 * <p>Each record has the form {@code path : lineNumber : column : pattern},
 * where line numbers start at 1 and columns start at 0.
 */
public class FindExample01 {
    /**
     * Text to search for. Assigned in {@link #getInput()}; read by
     * {@link #findPattern(Path)} and {@link #writeIndex(String, int, int)}.
     */
    private static String pattern;

    /**
     * Writer for the result file {@code Index.txt}. Assigned in
     * {@link #initApplication()}; remains {@code null} if the file could not
     * be opened.
     */
    private static BufferedWriter wBuffer;

    /**
     * Program entry point: opens the index file, reads the user input, runs
     * the search and closes the index file.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        initApplication();
        if (wBuffer == null) {
            // Index.txt could not be opened; the failure was already reported.
            return;
        }
        try {
            Path dir = Paths.get(getInput());
            System.out.println("root = " + dir.toString());
            System.out.println("pattern = " + pattern);
            searchDirectory(dir.toString());
        } finally {
            // close() flushes first, and runs even if the search throws.
            try {
                wBuffer.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /** Opens {@code Index.txt} for writing as UTF-8 and stores the writer in {@link #wBuffer}. */
    private static void initApplication() {
        try {
            wBuffer = Files.newBufferedWriter(Paths.get("Index.txt"), StandardCharsets.UTF_8);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Prompts on standard input until an existing path and a pattern longer
     * than two characters have been entered. The pattern is stored in
     * {@link #pattern}.
     *
     * @return the root path entered by the user
     */
    private static String getInput() {
        Scanner sc = new Scanner(System.in);
        String dir = null;
        for (;;) {
            System.out.println("Root Directory: ");
            dir = sc.next();
            if (Files.exists(Paths.get(dir), LinkOption.NOFOLLOW_LINKS)) {
                break;
            }
        }
        for (;;) {
            System.out.println("Find what ?");
            pattern = sc.next();
            if (pattern.length() > 2) {
                sc.close();
                return dir;
            }
        }
    }

    /**
     * Recursively searches every file below {@code root}. If {@code root} is
     * itself a regular file it is searched directly.
     *
     * @param root path of the directory (or file) to search
     */
    private static void searchDirectory(String root) {
        File fiRoot = new File(root);
        File[] files = fiRoot.listFiles();
        if (files == null) {
            // listFiles() returns null for non-directories and on I/O errors.
            if (fiRoot.isFile()) {
                findPattern(fiRoot.toPath());
            } else {
                System.err.println("Cannot list directory: " + root);
            }
            return;
        }
        for (File file : files) {
            if (file.isDirectory()) {
                searchDirectory(file.getAbsolutePath());
            } else {
                findPattern(file.toPath());
            }
        }
    }

    /**
     * Reads {@code path} line by line as UTF-8 and records every occurrence
     * of {@link #pattern}, including overlapping ones.
     *
     * <p>A file that cannot be read or is not valid UTF-8 is reported on
     * standard error and skipped; matches found before the failure are kept.
     *
     * @param path file to scan
     */
    private static void findPattern(Path path) {
        try (BufferedReader rBuffer = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
            int count = 1;
            String line;
            while ((line = rBuffer.readLine()) != null) {
                int idx = line.indexOf(pattern);
                while (idx != -1) {
                    writeIndex(path.toString(), count, idx);
                    // Resume just past this hit; restarting from 0 would
                    // find the same match forever.
                    idx = line.indexOf(pattern, idx + 1);
                }
                count++;
            }
        } catch (IOException e) {
            System.err.println("Skipping " + path + ": " + e);
        }
    }

    /**
     * Appends one match record followed by a line separator to the index file.
     *
     * @param path  file in which the match was found
     * @param count 1-based line number of the match
     * @param idx   0-based column of the match within the line
     */
    private static void writeIndex(String path, int count, int idx) {
        try {
            wBuffer.write(path + " : " + count + " : " + idx + " : " + pattern);
            wBuffer.newLine();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

试试 juniversalchardet,它是一个编码检测库,支持检测一系列常见编码。使用它时不需要读取整个文件,只需读取文件开头的若干字节即可。

// Feed the detector 4096-byte chunks until the stream is exhausted or the
// detector reports that it is done.
UniversalDetector detector = new UniversalDetector(null);
byte[] buf = new byte[4096];
int nread = fileInputStream.read(buf);
while (nread > 0 && !detector.isDone()) {
    detector.handleData(buf, 0, nread);
    nread = fileInputStream.read(buf);
}
// Signal end of input before asking for the result.
detector.dataEnd();
String encoding = detector.getDetectedCharset();

最新更新