Integrating Apache Tika and Apache Solr via the Java API



I am trying to integrate Apache Tika and Apache Solr so that I can index my parsed data. I am using Solr version 4.3.1 and Tika version 2.11.6.

The code I am following is:

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.UUID;

import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.common.SolrInputDocument;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MimeTypes;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

public class Main {
    private static SolrServer solr;
    public static void main(String[] args) throws IOException, SAXException, TikaException {
        try {
            solr = new HttpSolrServer("http://localhost:8983/solr"); //create solr connection (use the Solr base URL, not the admin UI "#/" page)
            //solr.deleteByQuery( "*:*" ); //delete everything in the index; good for testing
            //location of source documents
            //later this will be switched to a database
            String path = "C:\\content\\"; //backslashes must be escaped in Java string literals
            String file_html = path + "mobydick.htm";
            String file_txt = path + "/home/ben/abc.warc";
            String file_pdf = path + "callofthewild.pdf";
            processDocument(file_html);
            processDocument(file_txt);
            processDocument(file_pdf);
            solr.commit(); //after all docs are added, commit to the index
            //now you can search at http://localhost:8983/solr/browse
        }
        catch  (Exception ex) {
            System.out.println(ex.getMessage());
        }           
    }
    private static void processDocument(String pathfilename)  {
        try {
            InputStream input = new FileInputStream(new File(pathfilename));
            //use Apache Tika to convert documents in different formats to plain text
            ContentHandler textHandler = new BodyContentHandler(10*1024*1024);
            Metadata meta = new Metadata();
            Parser parser = new AutoDetectParser(); //handles documents in different formats
            ParseContext context = new ParseContext();
            parser.parse(input, textHandler, meta, context); //convert to plain text
            //collect metadata and content from Tika and other sources
            //document id must be unique, use a GUID
            UUID guid = UUID.randomUUID();
            String docid = guid.toString();
            //Dublin Core metadata (partial set)
            String doctitle = meta.get(DublinCore.TITLE);
            String doccreator = meta.get(DublinCore.CREATOR); 
            //other metadata
            String docurl = pathfilename; //document url
            //content
            String doccontent = textHandler.toString();
            //call to index
            indexDocument(docid, doctitle, doccreator, docurl, doccontent);
        }
        catch  (Exception ex) {
            System.out.println(ex.getMessage());
        }
    }   
    private static void indexDocument(String docid, String doctitle, String doccreator, String docurl, String doccontent)  {
        try {
            SolrInputDocument doc = new SolrInputDocument();
            doc.addField("id", docid);
            //map metadata fields to default schema
            //location: path\solr-4.7.2\example\solr\collection1\conf\schema.xml
            //Dublin Core
            //thought: schema could be modified to use Dublin Core
            doc.addField("title", doctitle);
            doc.addField("author", doccreator);
            //other metadata

            doc.addField("url", docurl);
            //content (and text)
            //per schema, the content field is not indexed by default, used for returning and highlighting document content
            //the schema "copyField" command automatically copies this to the "text" field which is indexed
            doc.addField("content", doccontent);
            //indexing
            //when a field is indexed, like "text", Solr will handle tokenization, stemming, removal of stopwords etc, per the schema defn
            //add to index
            solr.add(doc);  
        } 
        catch (Exception ex) {
            System.out.println(ex.getMessage());
        }
    }
}

The error I am getting:

    Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/http/NoHttpResponseException
        at Main.main(Main.java:28)
    Caused by: java.lang.ClassNotFoundException: org.apache.http.NoHttpResponseException
        at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
        at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:331)
        at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
        ... 1 more

This does not appear to be related to Tika, so I would focus on the Solr side.

In particular, look at how you are including the Solr libraries and their dependencies. If you pull them in via Maven, this should just work; if you are assembling the jars manually, you may have missed one or two.

Specifically, the error message is about a missing class that ships with the Apache HttpComponents (HttpClient/HttpCore) libraries, which SolrJ depends on. It is most likely missing from your dependencies or classpath.
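
As a minimal sketch, assuming a Maven build with SolrJ 4.3.1 to match the server, the relevant dependency section could look like the following (the httpclient entry and its version are shown only for illustration; declaring solr-solrj normally pulls in httpclient, httpcore and httpmime transitively):

    <!-- SolrJ client; transitively brings in the Apache HttpComponents jars -->
    <dependency>
        <groupId>org.apache.solr</groupId>
        <artifactId>solr-solrj</artifactId>
        <version>4.3.1</version>
    </dependency>
    <!-- usually not needed; add explicitly only if httpclient is missing from your build -->
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.2.3</version>
    </dependency>

If the jars are copied by hand instead, make sure httpclient-*.jar and httpcore-*.jar (httpcore contains org.apache.http.NoHttpResponseException) end up on the runtime classpath alongside solr-solrj.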
