我正在编写一个类来递归地从zip文件内部提取文件,并将它们生成到Kafka队列进行进一步处理。我的目的是能够从多个级别的zip中提取文件。下面的代码是我实现的tika ContainerExtractor来做到这一点。
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.Stack;
import org.apache.commons.lang.StringUtils;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.EmbeddedResourceHandler;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.pkg.PackageParser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
public class UberContainerExtractor implements ContainerExtractor {
/**
*
*/
private static final long serialVersionUID = -6636138154366178135L;
// statically populate SUPPORTED_TYPES
static {
Set<MediaType> supportedTypes = new HashSet<MediaType>();
ParseContext context = new ParseContext();
supportedTypes.addAll(new PackageParser().getSupportedTypes(context));
SUPPORTED_TYPES = Collections.unmodifiableSet(supportedTypes);
}
/**
* A stack that maintains the parent filenames for the recursion
*/
Stack<String> parentFileNames = new Stack<String>();
/**
* The default tika parser
*/
private final Parser parser;
/**
* Default tika detector
*/
private final Detector detector;
/**
* The supported container types into which we can recurse
*/
public final static Set<MediaType> SUPPORTED_TYPES;
/**
* The number of documents recursively extracted from the container and its
* children containers if present
*/
int extracted;
public UberContainerExtractor() {
this(TikaConfig.getDefaultConfig());
}
public UberContainerExtractor(TikaConfig config) {
this(new DefaultDetector(config.getMimeRepository()));
}
public UberContainerExtractor(Detector detector) {
this.parser = new AutoDetectParser(new PackageParser());
this.detector = detector;
}
public boolean isSupported(TikaInputStream input) throws IOException {
MediaType type = detector.detect(input, new Metadata());
return SUPPORTED_TYPES.contains(type);
}
@Override
public void extract(TikaInputStream stream, ContainerExtractor recurseExtractor, EmbeddedResourceHandler handler)
throws IOException, TikaException {
ParseContext context = new ParseContext();
context.set(Parser.class, new RecursiveParser(recurseExtractor, handler));
try {
Metadata metadata = new Metadata();
parser.parse(stream, new DefaultHandler(), metadata, context);
} catch (SAXException e) {
throw new TikaException("Unexpected SAX exception", e);
}
}
private class RecursiveParser extends AbstractParser {
/**
*
*/
private static final long serialVersionUID = -7260171956667273262L;
private final ContainerExtractor extractor;
private final EmbeddedResourceHandler handler;
private RecursiveParser(ContainerExtractor extractor, EmbeddedResourceHandler handler) {
this.extractor = extractor;
this.handler = handler;
}
public Set<MediaType> getSupportedTypes(ParseContext context) {
return parser.getSupportedTypes(context);
}
public void parse(InputStream stream, ContentHandler ignored, Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
TemporaryResources tmp = new TemporaryResources();
try {
TikaInputStream tis = TikaInputStream.get(stream, tmp);
// Figure out what we have to process
String filename = metadata.get(Metadata.RESOURCE_NAME_KEY);
MediaType type = detector.detect(tis, metadata);
if (extractor == null) {
// do nothing
} else {
// Use a temporary file to process the stream
File file = tis.getFile();
System.out.println("file is directory = " + file.isDirectory());
// Recurse and extract if the filetype is supported
if (SUPPORTED_TYPES.contains(type)) {
System.out.println("encountered a supported file:" + filename);
parentFileNames.push(filename);
extractor.extract(tis, extractor, handler);
parentFileNames.pop();
} else { // produce the file
List<String> parentFilenamesList = new ArrayList<String>(parentFileNames);
parentFilenamesList.add(filename);
String originalFilepath = StringUtils.join(parentFilenamesList, "/");
System.out.println("producing " + filename + " with originalFilepath:" + originalFilepath
+ " to kafka queue");
++extracted;
}
}
} finally {
tmp.dispose();
}
}
}
public int getExtracted() {
return extracted;
}
public static void main(String[] args) throws IOException, TikaException {
String filename = "/Users/rohit/Data/cd.zip";
File file = new File(filename);
TikaInputStream stream = TikaInputStream.get(file);
ContainerExtractor recursiveExtractor = new UberContainerExtractor();
EmbeddedResourceHandler resourceHandler = new EmbeddedResourceHandler() {
@Override
public void handle(String filename, MediaType mediaType, InputStream stream) {
// do nothing
}
};
recursiveExtractor.extract(stream, recursiveExtractor, resourceHandler);
stream.close();
System.out.println("extracted " + ((UberContainerExtractor) recursiveExtractor).getExtracted() + " files");
}
}
它适用于多个级别的zip,只要zips中的文件是扁平结构。 例如。光盘.zip - c.txt - d.txt
如果 zip 中的文件存在于目录中,则代码不起作用。 例如。阿卜.zip -血型/ - 一.txt - b.txt
调试时,我在包解析器中遇到了以下代码片段
try {
ArchiveEntry entry = ais.getNextEntry();
while (entry != null) {
if (!entry.isDirectory()) {
parseEntry(ais, entry, extractor, xhtml);
}
entry = ais.getNextEntry();
}
} finally {
ais.close();
}
我试图注释掉 if 条件,但它不起作用。有理由评论吗?有什么办法可以解决这个问题吗?
我正在使用 tika 版本 1.6
以相反的顺序处理您的问题:
有理由评论吗?
zip 文件中的条目是目录或文件。如果是文件,则它们包括它们来自的目录的名称。因此,Tika 不需要对目录做任何事情,它需要做的就是在嵌入文件出现时处理它们
。如果 zip 中的文件存在于目录中,则代码不起作用。 例如 AB.zip - AB/- A.txt - B.txt
那你好像做错了什么。Tika 的递归和包解析器可以很好地处理包含文件夹的 zip!
为了证明这一点,请从这样的 zip 文件开始:
$ unzip -l ../tt.zip
Archive: ../tt.zip
Length Date Time Name
--------- ---------- ----- ----
0 2015-02-03 16:42 t/
0 2015-02-03 16:42 t/t2/
0 2015-02-03 16:42 t/t2/t3/
164404 2015-02-03 16:42 t/t2/t3/test.jpg
--------- -------
164404 4 files
现在,让我们使用 Tika 应用程序的 -z
提取标志,它会导致 Tika 提取文件的所有嵌入内容。像这样运行,我们得到
$ java -jar tika-app-1.7.jar -z ../tt.zip
Extracting 't/t2/t3/test.jpg' (image/jpeg) to ./t/t2/t3/test.jpg
然后列出结果目录,我们看到
$ find . -type f
./t/t2/t3/Test.jpg
我看不出你的代码有什么问题,但可悲的是,我们已经证明问题就在那里,而不是 Tika......您最好查看 Tika 提供的各种递归示例,例如 Tika App 工具和递归解析器包装器,然后根据这些示例重新编写代码以使其简单