Tika PackageParser 无法处理 zip 文件中的目录



我正在编写一个类,用于递归地提取 zip 文件内部的文件,并将它们发送到 Kafka 队列以便进一步处理。我的目的是能够从多层嵌套的 zip 中提取文件。下面的代码是我为此实现的 Tika ContainerExtractor。

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.Stack;
import org.apache.commons.lang.StringUtils;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.extractor.ContainerExtractor;
import org.apache.tika.extractor.EmbeddedResourceHandler;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.pkg.PackageParser;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
/**
 * A {@link ContainerExtractor} that recurses into package (archive) containers
 * and reports every non-container entry together with the chain of container
 * names it was found under.
 *
 * <p>Each instance keeps mutable state: the stack of parent container names and
 * the running count of extracted entries. The count is never reset, so it
 * accumulates over every {@link #extract} call made on the same instance.
 * NOTE(review): sharing one instance between concurrent extractions looks
 * unsafe because of this state - confirm before doing so.
 */
public class UberContainerExtractor implements ContainerExtractor {
  /** Serialization version identifier. */
  private static final long serialVersionUID = -6636138154366178135L;
  /**
   * The supported container types into which we can recurse
   */
  public final static Set<MediaType> SUPPORTED_TYPES;
  // statically populate SUPPORTED_TYPES with everything PackageParser handles
  static {
    Set<MediaType> supportedTypes = new HashSet<MediaType>();
    ParseContext context = new ParseContext();
    supportedTypes.addAll(new PackageParser().getSupportedTypes(context));
    SUPPORTED_TYPES = Collections.unmodifiableSet(supportedTypes);
  }
  /**
   * A stack that maintains the parent filenames for the recursion
   */
  Stack<String> parentFileNames = new Stack<String>();
  /**
   * The parser used for containers: an AutoDetectParser restricted to
   * PackageParser
   */
  private final Parser parser;
  /**
   * Detector used to determine the media type of each stream
   */
  private final Detector detector;
  /**
   * The number of documents recursively extracted from the container and its
   * children containers if present
   */
  int extracted;

  /**
   * Creates an extractor based on the default Tika configuration.
   */
  public UberContainerExtractor() {
    this(TikaConfig.getDefaultConfig());
  }

  /**
   * Creates an extractor whose detector uses the MIME repository of the given
   * configuration.
   *
   * @param config the Tika configuration to take the MIME repository from
   */
  public UberContainerExtractor(TikaConfig config) {
    this(new DefaultDetector(config.getMimeRepository()));
  }

  /**
   * Creates an extractor that uses the given detector for type detection.
   *
   * @param detector detector applied to the top-level stream and to every
   *                 embedded entry
   */
  public UberContainerExtractor(Detector detector) {
    this.parser = new AutoDetectParser(new PackageParser());
    this.detector = detector;
  }

  /**
   * Tells whether the detected type of the stream is one of
   * {@link #SUPPORTED_TYPES}.
   *
   * @param input the stream to inspect
   * @return {@code true} if the detected media type is a supported container
   * @throws IOException if detection fails to read the stream
   */
  @Override
  public boolean isSupported(TikaInputStream input) throws IOException {
    MediaType type = detector.detect(input, new Metadata());
    return SUPPORTED_TYPES.contains(type);
  }

  /**
   * Parses the given container and routes every embedded entry through a
   * {@link RecursiveParser}.
   *
   * @param stream           the container to extract from
   * @param recurseExtractor extractor used for nested containers; when
   *                         {@code null} embedded entries are ignored
   * @param handler          handler passed along to nested extractions
   * @throws IOException   if the stream cannot be read
   * @throws TikaException if parsing fails; SAX failures are wrapped
   */
  @Override
  public void extract(TikaInputStream stream, ContainerExtractor recurseExtractor, EmbeddedResourceHandler handler)
      throws IOException, TikaException {
    ParseContext context = new ParseContext();
    // Registering our parser in the context makes it the one that receives
    // the embedded entries of the container.
    context.set(Parser.class, new RecursiveParser(recurseExtractor, handler));
    try {
      Metadata metadata = new Metadata();
      parser.parse(stream, new DefaultHandler(), metadata, context);
    } catch (SAXException e) {
      throw new TikaException("Unexpected SAX exception", e);
    }
  }

  /**
   * Parser invoked for each embedded entry: recurses into supported containers
   * and reports everything else.
   */
  private class RecursiveParser extends AbstractParser {
    /** Serialization version identifier. */
    private static final long serialVersionUID = -7260171956667273262L;
    private final ContainerExtractor extractor;
    private final EmbeddedResourceHandler handler;

    private RecursiveParser(ContainerExtractor extractor, EmbeddedResourceHandler handler) {
      this.extractor = extractor;
      this.handler = handler;
    }

    /**
     * Delegates to the enclosing extractor's parser.
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
      return parser.getSupportedTypes(context);
    }

    /**
     * Handles one embedded entry.
     *
     * @param stream   content of the entry
     * @param ignored  content handler, not used
     * @param metadata metadata of the entry; the resource name is read from it
     * @param context  parse context, not used
     */
    @Override
    public void parse(InputStream stream, ContentHandler ignored, Metadata metadata, ParseContext context)
        throws IOException, SAXException, TikaException {
      TemporaryResources tmp = new TemporaryResources();
      try {
        TikaInputStream tis = TikaInputStream.get(stream, tmp);
        // Figure out what we have to process
        String filename = metadata.get(Metadata.RESOURCE_NAME_KEY);
        MediaType type = detector.detect(tis, metadata);
        if (extractor == null) {
          // No recursion requested: nothing to do for this entry
          return;
        }
        // Use a temporary file to process the stream
        File file = tis.getFile();
        // NOTE(review): 'file' comes from the TikaInputStream, not from the
        // archive entry, so this diagnostic presumably never reports whether
        // the entry itself was a directory - confirm and consider removing.
        System.out.println("file is directory = " + file.isDirectory());
        if (SUPPORTED_TYPES.contains(type)) {
          // Recurse and extract if the filetype is supported
          System.out.println("encountered a supported file:" + filename);
          parentFileNames.push(filename);
          try {
            extractor.extract(tis, extractor, handler);
          } finally {
            // Always unwind, otherwise a failing nested container would leave
            // its name on the stack and corrupt the path of later entries.
            parentFileNames.pop();
          }
        } else {
          // produce the file
          System.out.println("producing " + filename + " with originalFilepath:" + originalFilepath(filename)
              + " to kafka queue");
          ++extracted;
        }
      } finally {
        tmp.dispose();
      }
    }

    /**
     * Joins the current parent container names and the given entry name with
     * "/" to form the path of the entry relative to the outermost container.
     */
    private String originalFilepath(String filename) {
      List<String> parentFilenamesList = new ArrayList<String>(parentFileNames);
      parentFilenamesList.add(filename);
      return StringUtils.join(parentFilenamesList, "/");
    }
  }

  /**
   * @return the number of non-container entries reported so far by this
   *         instance
   */
  public int getExtracted() {
    return extracted;
  }

  /**
   * Demo entry point: extracts a hard-coded zip file and prints the number of
   * entries found.
   */
  public static void main(String[] args) throws IOException, TikaException {
    String filename = "/Users/rohit/Data/cd.zip";
    File file = new File(filename);
    UberContainerExtractor recursiveExtractor = new UberContainerExtractor();
    EmbeddedResourceHandler resourceHandler = new EmbeddedResourceHandler() {
      @Override
      public void handle(String filename, MediaType mediaType, InputStream stream) {
        // do nothing
      }
    };
    TikaInputStream stream = TikaInputStream.get(file);
    try {
      recursiveExtractor.extract(stream, recursiveExtractor, resourceHandler);
    } finally {
      // Release the stream even when extraction fails
      stream.close();
    }
    System.out.println("extracted " + recursiveExtractor.getExtracted() + " files");
  }
}

只要 zip 中的文件是扁平结构,它就能正确处理多层嵌套的 zip。例如:cd.zip - c.txt - d.txt

如果 zip 中的文件位于目录里,代码就不起作用。例如:ab.zip - ab/ - a.txt - b.txt

调试时,我在包解析器中遇到了以下代码片段

// Walk through the entries of the archive stream one by one.
try {
  ArchiveEntry entry = ais.getNextEntry();
  while (entry != null) {
    // Directory entries are skipped; only non-directory entries are passed to parseEntry.
    if (!entry.isDirectory()) {
        parseEntry(ais, entry, extractor, xhtml);
    }
    entry = ais.getNextEntry();
  }
} finally {
  // Close the archive stream whether or not the loop finished normally.
  ais.close();
}

我尝试注释掉这个 if 条件,但没有效果。这个判断条件存在的原因是什么?有什么办法可以绕过这个问题吗?

我正在使用 tika 版本 1.6

以相反的顺序处理您的问题:

这个 if 条件存在的原因是什么?

zip 文件中的条目要么是目录,要么是文件。文件条目的名称中已经包含了它所在目录的路径。因此,Tika 不需要对目录条目做任何处理,只需在遇到嵌入文件时处理它们即可。

如果 zip 中的文件位于目录里,代码就不起作用。例如:ab.zip - ab/ - a.txt - b.txt

那么看起来是您的代码哪里出了问题。Tika 的递归处理和 PackageParser 完全可以处理包含文件夹的 zip!

为了证明这一点,请从这样的 zip 文件开始:

$ unzip -l ../tt.zip 
Archive:  ../tt.zip
  Length      Date    Time    Name
---------  ---------- -----   ----
        0  2015-02-03 16:42   t/
        0  2015-02-03 16:42   t/t2/
        0  2015-02-03 16:42   t/t2/t3/
   164404  2015-02-03 16:42   t/t2/t3/test.jpg
---------                     -------
   164404                     4 files

现在,让我们使用 Tika 应用程序的 -z 提取标志,它会导致 Tika 提取文件的所有嵌入内容。像这样运行,我们得到

$ java -jar tika-app-1.7.jar -z ../tt.zip 
Extracting 't/t2/t3/test.jpg' (image/jpeg) to ./t/t2/t3/test.jpg

然后列出结果目录,我们看到

$ find . -type f
./t/t2/t3/test.jpg

我一时看不出您的代码具体错在哪里,但遗憾的是,上面的演示已经证明问题出在您的代码中,而不是 Tika 本身。建议您参考 Tika 提供的各种递归处理示例,例如 Tika App 工具和 RecursiveParserWrapper,然后参照这些示例把代码改写得更简单。

相关内容

  • 没有找到相关文章

最新更新