从数据库索引二进制文件时遇到的问题(没有报错)



我正在尝试索引存储在数据库(MySQL)中的二进制文件,但没有成功。我的 Solr 配置如下。Solr 文件结构:

+solr
   +bookledger(core0)
      -conf
      +lib(all necessary libraries)
        +contrib
        +dist
   +data
      +bookledger
        -index
        -spellchecker
      +ktimatologio
        -index
        -spellchecker
    +ktimatologio(core1)
      -conf
      +lib(all necessary libraries)
        +contrib
        +dist

如您所见,该配置是一个多核(multi-core)Solr 设置。在 bookledger(core0)上,我已经成功地索引了存储在数据库中的二进制文件。但在第二个核心(ktimatologio)上执行完全导入(full-import)时,没有出现任何错误;然而当我查询二进制内容时,得到的输出却是这样的:[B@660b1b14(这是 Java 字节数组的默认字符串表示,而不是提取出的文本)。我在这里漏掉了什么?

提前谢谢你,

汤姆

solr.xml 文件:

<?xml version="1.0" encoding="UTF-8"?>
<!-- Multi-core layout: two cores are declared, each with its own instance
     directory and its own data directory under ../data. -->
<solr persistent="false">
  <cores adminPath="/admin/cores">
    <core name="ktimatologio"
          instanceDir="ktimatologio"
          dataDir="../data/ktimatologio"/>
    <core name="bookledger"
          instanceDir="bookledger"
          dataDir="../data/bookledger"/>
  </cores>
</solr>

solrconfig.xml 文件:

<?xml version="1.0" encoding="UTF-8" ?>
<config>
  <abortOnConfigurationError>${solr.abortOnConfigurationError:true}</abortOnConfigurationError>

  <luceneMatchVersion>LUCENE_36</luceneMatchVersion>

  <!-- Solr distribution jars. Each regex selects jar files inside `dir`:
       `\d` is the first digit of the version number and `\.` is the literal
       dot before the "jar" extension. Without the backslashes the pattern
       would require a literal "d" after the hyphen and match no versioned
       jar name. -->
  <lib dir="lib/dist/" regex="apache-solr-cell-\d.*\.jar" />
  <lib dir="lib/dist/" regex="apache-solr-clustering-\d.*\.jar" />
  <lib dir="lib/dist/" regex="apache-solr-dataimporthandler-\d.*\.jar" />
  <lib dir="lib/dist/" regex="apache-solr-langid-\d.*\.jar" />
  <lib dir="lib/dist/" regex="apache-solr-velocity-\d.*\.jar" />
  <lib dir="lib/dist/" regex="apache-solr-dataimporthandler-extras-\d.*\.jar" />

  <!-- Third-party dependencies of each contrib module (every jar in the directory). -->
  <lib dir="lib/contrib/extraction/lib/" regex=".*\.jar" />
  <lib dir="lib/contrib/clustering/lib/" regex=".*\.jar" />
  <lib dir="lib/contrib/dataimporthandler/lib/" regex=".*\.jar" />
  <lib dir="lib/contrib/langid/lib/" regex=".*\.jar" />
  <lib dir="lib/contrib/velocity/lib/" regex=".*\.jar" />

  <!-- Tika jars named explicitly; they live in the extraction lib directory
       that the catch-all pattern above already covers. -->
  <lib dir="lib/contrib/extraction/lib/" regex="tika-core-\d.*\.jar" />
  <lib dir="lib/contrib/extraction/lib/" regex="tika-parsers-\d.*\.jar" />

  <!-- Index data directory, taken from the solr.data.dir property (empty default). -->
  <dataDir>${solr.data.dir:}</dataDir>

  <!-- Directory implementation for the index: read from the
       solr.directoryFactory property, defaulting to StandardDirectoryFactory. -->
  <directoryFactory class="${solr.directoryFactory:solr.StandardDirectoryFactory}"
                    name="DirectoryFactory"/>

  <!-- No index-level settings are overridden. -->
  <indexConfig/>

  <jmx/>

  <!-- Update handler, declared without nested settings. -->
  <updateHandler class="solr.DirectUpdateHandler2"/>
  <!-- Query-side settings: clause limit, caches and searcher warming. -->
  <query>
    <maxBooleanClauses>1024</maxBooleanClauses>

    <!-- All three caches are sized at 512 entries with autowarming disabled. -->
    <filterCache class="solr.FastLRUCache" size="512" initialSize="512" autowarmCount="0"/>
    <queryResultCache class="solr.LRUCache" size="512" initialSize="512" autowarmCount="0"/>
    <documentCache class="solr.LRUCache" size="512" initialSize="512" autowarmCount="0"/>

    <enableLazyFieldLoading>true</enableLazyFieldLoading>
    <queryResultWindowSize>20</queryResultWindowSize>
    <queryResultMaxDocsCached>200</queryResultMaxDocsCached>

    <!-- newSearcher listener: no warming queries are configured. -->
    <listener event="newSearcher" class="solr.QuerySenderListener">
      <arr name="queries">
      </arr>
    </listener>

    <!-- firstSearcher listener: sends a single static query. -->
    <listener event="firstSearcher" class="solr.QuerySenderListener">
      <arr name="queries">
        <lst>
          <str name="q">static firstSearcher warming in solrconfig.xml</str>
        </lst>
      </arr>
    </listener>

    <useColdSearcher>false</useColdSearcher>
    <maxWarmingSearchers>2</maxWarmingSearchers>
  </query>
  <!-- NOTE(review): remote streaming is switched on and the multipart upload
       limit is about 2 GB; confirm both are intended for this deployment. -->
  <requestDispatcher>
    <requestParsers multipartUploadLimitInKB="2048000"
                    enableRemoteStreaming="true"/>
  </requestDispatcher>

  <!-- DataImportHandler endpoint; its entities are defined in data-config.xml. -->
  <requestHandler class="org.apache.solr.handler.dataimport.DataImportHandler"
                  name="/dataimport">
    <lst name="defaults">
      <str name="config">data-config.xml</str>
    </lst>
  </requestHandler>

  <!-- Plain search endpoint: echoes explicit parameters, 100 rows by default. -->
  <requestHandler class="solr.SearchHandler" name="/select">
    <lst name="defaults">
      <str name="echoParams">explicit</str>
      <int name="rows">100</int>
    </lst>
  </requestHandler>
  <!-- Velocity-based browse UI: an edismax search with faceting, highlighting
       and the spellcheck component appended.
       NOTE(review): the fields referenced below (text, features, name, sku,
       manu, cat, price, popularity, manufacturedate_dt, ...) are not declared
       in the schema.xml shown with this configuration; confirm whether this
       handler is actually used with this core. -->
  <requestHandler name="/browse" class="solr.SearchHandler">
     <lst name="defaults">
       <str name="echoParams">explicit</str>
       <!-- VelocityResponseWriter settings -->
       <str name="wt">velocity</str>
       <str name="v.template">browse</str>
       <str name="v.layout">layout</str>
       <str name="title">Solritas</str>
       <!-- Query parsing: edismax over the boosted fields in "qf" -->
       <str name="df">text</str>
       <str name="defType">edismax</str>
       <str name="q.alt">*:*</str>
       <str name="rows">10</str>
       <str name="fl">*,score</str>
       <str name="mlt.qf">
         text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
       </str>
       <str name="mlt.fl">text,features,name,sku,id,manu,cat</str>
       <int name="mlt.count">3</int>
       <str name="qf">
          text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
       </str>
       <!-- Faceting: field, query, pivot and range facets -->
       <str name="facet">on</str>
       <str name="facet.field">cat</str>
       <str name="facet.field">manu_exact</str>
       <str name="facet.query">ipod</str>
       <str name="facet.query">GB</str>
       <str name="facet.mincount">1</str>
       <str name="facet.pivot">cat,inStock</str>
       <str name="facet.range.other">after</str>
       <str name="facet.range">price</str>
       <int name="f.price.facet.range.start">0</int>
       <int name="f.price.facet.range.end">600</int>
       <int name="f.price.facet.range.gap">50</int>
       <str name="facet.range">popularity</str>
       <int name="f.popularity.facet.range.start">0</int>
       <int name="f.popularity.facet.range.end">10</int>
       <int name="f.popularity.facet.range.gap">3</int>
       <str name="facet.range">manufacturedate_dt</str>
       <str name="f.manufacturedate_dt.facet.range.start">NOW/YEAR-10YEARS</str>
       <str name="f.manufacturedate_dt.facet.range.end">NOW</str>
       <str name="f.manufacturedate_dt.facet.range.gap">+1YEAR</str>
       <str name="f.manufacturedate_dt.facet.range.other">before</str>
       <str name="f.manufacturedate_dt.facet.range.other">after</str>

       <!-- Highlighting defaults -->
       <str name="hl">on</str>
       <str name="hl.fl">text features name</str>
       <str name="f.name.hl.fragsize">0</str>
       <str name="f.name.hl.alternateField">name</str>
     </lst>
     <!-- Runs the "spellcheck" search component after the default components -->
     <arr name="last-components">
       <str>spellcheck</str>
     </arr>
  </requestHandler>

  <!-- Update endpoints, one per input format. All but the XML and javabin
       handlers are declared with startup="lazy". -->
  <requestHandler name="/update" class="solr.XmlUpdateRequestHandler"/>
  <requestHandler name="/update/javabin" class="solr.BinaryUpdateRequestHandler"/>
  <requestHandler name="/update/csv" class="solr.CSVRequestHandler" startup="lazy"/>
  <requestHandler name="/update/json" class="solr.JsonUpdateRequestHandler" startup="lazy"/>

  <!-- Rich-document extraction endpoint. -->
  <requestHandler name="/update/extract"
                  class="solr.extraction.ExtractingRequestHandler"
                  startup="lazy">
    <lst name="defaults">
      <!-- Extracted body text is mapped to "text"; use a stored field if the
           text must be returned or highlighted. -->
      <str name="fmap.content">text</str>
      <str name="lowernames">true</str>
      <str name="uprefix">ignored_</str>
      <!-- Link hrefs are captured; div attributes are ignored. -->
      <str name="captureAttr">true</str>
      <str name="fmap.a">links</str>
      <str name="fmap.div">ignored_</str>
    </lst>
  </requestHandler>

  <requestHandler name="/update/xslt" class="solr.XsltUpdateRequestHandler" startup="lazy"/>

  <!-- Analysis endpoints. -->
  <requestHandler name="/analysis/field" class="solr.FieldAnalysisRequestHandler" startup="lazy"/>
  <requestHandler name="/analysis/document" class="solr.DocumentAnalysisRequestHandler" startup="lazy"/>
  <!-- Registers the standard admin request handlers under /admin/. -->
  <requestHandler class="solr.admin.AdminHandlers" name="/admin/"/>

  <!-- Ping / health check: the query is fixed as an invariant. -->
  <requestHandler class="solr.PingRequestHandler" name="/admin/ping">
    <lst name="invariants">
      <str name="q">solrpingquery</str>
    </lst>
    <lst name="defaults">
      <str name="echoParams">all</str>
    </lst>
  </requestHandler>

  <!-- Debug endpoint that echoes the request contents back to the client. -->
  <requestHandler class="solr.DumpRequestHandler" name="/debug/dump">
    <lst name="defaults">
      <str name="echoParams">explicit</str>
      <str name="echoHandler">true</str>
    </lst>
  </requestHandler>

  <!-- Spell checking: one dictionary named "default", built from the "name"
       field and kept in the "spellchecker" index directory. -->
  <searchComponent class="solr.SpellCheckComponent" name="spellcheck">
    <str name="queryAnalyzerFieldType">textSpell</str>
    <lst name="spellchecker">
      <str name="name">default</str>
      <str name="field">name</str>
      <str name="spellcheckIndexDir">spellchecker</str>
    </lst>
  </searchComponent>

  <!-- Search endpoint with the spellcheck component appended. -->
  <requestHandler class="solr.SearchHandler" name="/spell" startup="lazy">
    <lst name="defaults">
      <str name="df">text</str>
      <str name="spellcheck.onlyMorePopular">false</str>
      <str name="spellcheck.extendedResults">false</str>
      <str name="spellcheck.count">1</str>
    </lst>
    <arr name="last-components">
      <str>spellcheck</str>
    </arr>
  </requestHandler>

  <!-- Term vectors: component plus a search endpoint that appends it. -->
  <searchComponent class="solr.TermVectorComponent" name="tvComponent"/>

  <requestHandler class="solr.SearchHandler" name="/tvrh" startup="lazy">
    <lst name="defaults">
      <str name="df">text</str>
      <bool name="tv">true</bool>
    </lst>
    <arr name="last-components">
      <str>tvComponent</str>
    </arr>
  </requestHandler>

  <!-- Result clustering (Carrot2). Both the component and its request handler
       are enabled only when the solr.clustering.enabled property is set;
       the default is false. -->
  <searchComponent name="clustering" 
                   enable="${solr.clustering.enabled:false}"
                   class="solr.clustering.ClusteringComponent" >
    <!-- Declare an engine -->
    <lst name="engine">
      <!-- The name, only one can be named "default" -->
      <str name="name">default</str>

      <str name="carrot.algorithm">org.carrot2.clustering.lingo.LingoClusteringAlgorithm</str>

      <str name="LingoClusteringAlgorithm.desiredClusterCountBase">20</str>

      <str name="carrot.lexicalResourcesDir">clustering/carrot2</str>

      <str name="MultilingualClustering.defaultLanguage">ENGLISH</str>
    </lst>
    <!-- Second engine, selectable by the name "stc" -->
    <lst name="engine">
      <str name="name">stc</str>
      <str name="carrot.algorithm">org.carrot2.clustering.stc.STCClusteringAlgorithm</str>
    </lst>
  </searchComponent>

  <!-- Search endpoint that appends the clustering component and uses the
       "default" engine unless overridden. -->
  <requestHandler name="/clustering"
                  startup="lazy"
                  enable="${solr.clustering.enabled:false}"
                  class="solr.SearchHandler">
    <lst name="defaults">
      <bool name="clustering">true</bool>
      <str name="clustering.engine">default</str>
      <bool name="clustering.results">true</bool>
      <!-- The title field -->
      <str name="carrot.title">name</str>
      <str name="carrot.url">id</str>
      <!-- The field to cluster on -->
       <str name="carrot.snippet">features</str>
       <!-- produce summaries -->
       <bool name="carrot.produceSummary">true</bool>
       <!-- the maximum number of labels per cluster -->
       <!--<int name="carrot.numDescriptions">5</int>-->
       <!-- produce sub clusters -->
       <bool name="carrot.outputSubClusters">false</bool>
       <str name="df">text</str>
       <str name="defType">edismax</str>
       <str name="qf">
          text^0.5 features^1.0 name^1.2 sku^1.5 id^10.0 manu^1.1 cat^1.4
       </str>
       <str name="q.alt">*:*</str>
       <str name="rows">10</str>
       <str name="fl">*,score</str>
    </lst>     
    <arr name="last-components">
      <str>clustering</str>
    </arr>
  </requestHandler>

  <!-- Terms component and an endpoint that runs only that component. -->
  <searchComponent class="solr.TermsComponent" name="terms"/>

  <requestHandler class="solr.SearchHandler" name="/terms" startup="lazy">
    <lst name="defaults">
      <bool name="terms">true</bool>
    </lst>
    <arr name="components">
      <str>terms</str>
    </arr>
  </requestHandler>

  <!-- Query elevation: queries are analysed as "string" and the elevation
       rules are read from elevate.xml. -->
  <searchComponent class="solr.QueryElevationComponent" name="elevator">
    <str name="queryFieldType">string</str>
    <str name="config-file">elevate.xml</str>
  </searchComponent>

  <!-- Search endpoint with the elevator component appended. -->
  <requestHandler class="solr.SearchHandler" name="/elevate" startup="lazy">
    <lst name="defaults">
      <str name="echoParams">explicit</str>
      <str name="df">text</str>
    </lst>
    <arr name="last-components">
      <str>elevator</str>
    </arr>
  </requestHandler>
  <!-- Highlighting Component
       http://wiki.apache.org/solr/HighlightingParameters

       Declares the fragmenters, formatter, encoder, fragment-list builders,
       fragments builders and boundary scanners available to highlighting.
       Entries marked default="true" are used unless a request names another.
    -->
  <searchComponent class="solr.HighlightComponent" name="highlight">
    <highlighting>
      <!-- Configure the standard fragmenter -->
      <!-- This could most likely be commented out in the "default" case -->
      <fragmenter name="gap" 
                  default="true"
                  class="solr.highlight.GapFragmenter">
        <lst name="defaults">
          <int name="hl.fragsize">100</int>
        </lst>
      </fragmenter>
      <!-- A regular-expression-based fragmenter 
           (for sentence extraction) 
        -->
      <fragmenter name="regex" 
                  class="solr.highlight.RegexFragmenter">
        <lst name="defaults">
          <!-- slightly smaller fragsizes work better because of slop -->
          <int name="hl.fragsize">70</int>
          <!-- allow 50% slop on fragment sizes -->
          <float name="hl.regex.slop">0.5</float>
          <!-- a basic sentence pattern: 20-200 characters drawn from word
               characters (\w), hyphen, space, comma, slash, newline (\n)
               and quotes. The backslashes are required; without them the
               class would match the literal letters "w" and "n". -->
          <str name="hl.regex.pattern">[-\w ,/\n\&quot;\&apos;]{20,200}</str>
        </lst>
      </fragmenter>
      <!-- Configure the standard formatter -->
      <formatter name="html" 
                 default="true"
                 class="solr.highlight.HtmlFormatter">
        <lst name="defaults">
          <str name="hl.simple.pre"><![CDATA[<em>]]></str>
          <str name="hl.simple.post"><![CDATA[</em>]]></str>
        </lst>
      </formatter>
      <!-- Configure the standard encoder -->
      <encoder name="html" 
               class="solr.highlight.HtmlEncoder" />
      <!-- Configure the standard fragListBuilder -->
      <fragListBuilder name="simple" 
                       default="true"
                       class="solr.highlight.SimpleFragListBuilder"/>
      <!-- Configure the single fragListBuilder -->
      <fragListBuilder name="single" 
                       class="solr.highlight.SingleFragListBuilder"/>
      <!-- default tag FragmentsBuilder -->
      <fragmentsBuilder name="default" 
                        default="true"
                        class="solr.highlight.ScoreOrderFragmentsBuilder">
        <!-- 
        <lst name="defaults">
          <str name="hl.multiValuedSeparatorChar">/</str>
        </lst>
        -->
      </fragmentsBuilder>
      <!-- multi-colored tag FragmentsBuilder -->
      <fragmentsBuilder name="colored" 
                        class="solr.highlight.ScoreOrderFragmentsBuilder">
        <lst name="defaults">
          <str name="hl.tag.pre"><![CDATA[
               <b style="background:yellow">,<b style="background:lawgreen">,
               <b style="background:aquamarine">,<b style="background:magenta">,
               <b style="background:palegreen">,<b style="background:coral">,
               <b style="background:wheat">,<b style="background:khaki">,
               <b style="background:lime">,<b style="background:deepskyblue">]]></str>
          <str name="hl.tag.post"><![CDATA[</b>]]></str>
        </lst>
      </fragmentsBuilder>
      <!-- Boundary scanners: a character-based default and a
           java.text.BreakIterator-style alternative -->
      <boundaryScanner name="default" 
                       default="true"
                       class="solr.highlight.SimpleBoundaryScanner">
        <lst name="defaults">
          <str name="hl.bs.maxScan">10</str>
          <!-- boundary characters: punctuation, space, tab, LF, CR -->
          <str name="hl.bs.chars">.,!? &#9;&#10;&#13;</str>
        </lst>
      </boundaryScanner>
      <boundaryScanner name="breakIterator" 
                       class="solr.highlight.BreakIteratorBoundaryScanner">
        <lst name="defaults">
          <str name="hl.bs.type">WORD</str>
          <str name="hl.bs.language">en</str>
          <str name="hl.bs.country">US</str>
        </lst>
      </boundaryScanner>
    </highlighting>
  </searchComponent>
  <!-- Response writers. JSON responses are sent with a text/plain content
       type rather than application/json. -->
  <queryResponseWriter name="json" class="solr.JSONResponseWriter">
    <str name="content-type">text/plain; charset=UTF-8</str>
  </queryResponseWriter>

  <!-- Velocity writer used by the /browse handler (wt=velocity). -->
  <queryResponseWriter name="velocity" class="solr.VelocityResponseWriter" startup="lazy"/>

  <!-- XSLT writer; stylesheet cache lifetime is 5 seconds. -->
  <queryResponseWriter name="xslt" class="solr.XSLTResponseWriter">
    <int name="xsltCacheLifetimeSeconds">5</int>
  </queryResponseWriter>

  <!-- Default query shown in the admin UI. -->
  <admin>
    <defaultQuery>*:*</defaultQuery>
  </admin>
</config>

schema.xml 文件:

<?xml version="1.0" encoding="UTF-8" ?>
<!-- Schema for the "ktimatologio" core: field types first, then the fields
     populated by the data import, keyed by solr_id. -->
<schema name="ktimatologio" version="1.5">
  <types>
    <!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
    <fieldType name="string" class="solr.StrField" sortMissingLast="true" />
    <!-- boolean type: "true" or "false" -->
    <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true"/>
    <!--Binary data type. The data should be sent/retrieved in as Base64 encoded Strings -->
    <fieldType name="binary" class="solr.BinaryField"/>
    <!-- Trie numeric types; precisionStep="0" variants come first, the
         t-prefixed variants use precisionStep="8" -->
    <fieldType name="int" class="solr.TrieIntField" precisionStep="0" positionIncrementGap="0"/>
    <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" positionIncrementGap="0"/>
    <fieldType name="long" class="solr.TrieLongField" precisionStep="0" positionIncrementGap="0"/>
    <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" positionIncrementGap="0"/>
    <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" positionIncrementGap="0"/>
    <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" positionIncrementGap="0"/>
    <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" positionIncrementGap="0"/>
    <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" positionIncrementGap="0"/>
    <fieldType name="date" class="solr.TrieDateField" precisionStep="0" positionIncrementGap="0"/>
    <!-- A Trie based date field for faster date range queries and date faceting. -->
    <fieldType name="tdate" class="solr.TrieDateField" precisionStep="6" positionIncrementGap="0"/>
    <!-- Plain (p*) and sortable (s*) numeric/date types -->
    <fieldType name="pint" class="solr.IntField"/>
    <fieldType name="plong" class="solr.LongField"/>
    <fieldType name="pfloat" class="solr.FloatField"/>
    <fieldType name="pdouble" class="solr.DoubleField"/>
    <fieldType name="pdate" class="solr.DateField" sortMissingLast="true"/>
    <fieldType name="sint" class="solr.SortableIntField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="slong" class="solr.SortableLongField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="sfloat" class="solr.SortableFloatField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="sdouble" class="solr.SortableDoubleField" sortMissingLast="true" omitNorms="true"/>
    <fieldType name="random" class="solr.RandomSortField" indexed="true" />
    <!-- Greek -->
    <fieldType name="text_el" class="solr.TextField" positionIncrementGap="100">
      <analyzer> 
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <!-- greek specific lowercase for sigma -->
        <filter class="solr.GreekLowerCaseFilterFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="false" words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
        <filter class="solr.GreekStemFilterFactory"/>
      </analyzer>
    </fieldType>
    <!-- Mixed Greek/English text type used by every text field of this core.
         NOTE(review): the index and query chains apply the same filters in a
         different order (LowerCase/EnglishPossessive run before the Greek
         filters at index time but after them at query time); confirm this
         is intentional. -->
    <fieldType name="text_ktimatologio" class="solr.TextField" positionIncrementGap="100">
      <analyzer type="index">       
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt" enablePositionIncrements="true"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.EnglishPossessiveFilterFactory"/>   
        <filter class="solr.GreekLowerCaseFilterFactory"/>
        <filter class="solr.GreekStemFilterFactory"/>       
        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
        <filter class="solr.PorterStemFilterFactory"/>
      </analyzer>

      <analyzer type="query">
        <tokenizer class="solr.StandardTokenizerFactory"/>
        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
        <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt" enablePositionIncrements="true"/>
        <filter class="solr.GreekLowerCaseFilterFactory"/>
        <filter class="solr.GreekStemFilterFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
        <filter class="solr.EnglishPossessiveFilterFactory"/>
        <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
        <filter class="solr.PorterStemFilterFactory"/>
      </analyzer>
    </fieldType>
    <!-- Neither stored nor indexed: values sent to a field of this type are dropped -->
    <fieldType name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />
    <fieldType name="point" class="solr.PointType" dimension="2" subFieldSuffix="_d"/>
    <fieldType name="location" class="solr.LatLonType" subFieldSuffix="_coordinate"/>
    <fieldType name="geohash" class="solr.GeoHashField"/>
    <fieldType name="currency" class="solr.CurrencyField" precisionStep="8" defaultCurrency="USD" currencyConfig="currency.xml" />
  </types>

  <fields>
    <field name="id" type="string" indexed="true" stored="true" multiValued="false"/>
    <field name="solr_id" type="string" indexed="true" stored="true" multiValued="false"/>
    <field name="title" type="text_ktimatologio" indexed="true" stored="true"/>
    <field name="model" type="text_ktimatologio" indexed="true" stored="true" multiValued="false"/>
    <field name="type" type="text_ktimatologio" indexed="true" stored="true"/>
    <field name="url" type="text_ktimatologio" indexed="true" stored="true"/>
    <!-- multiValued because "title" is also copied into it (copyField below) -->
    <field name="content" type="text_ktimatologio" indexed="true" stored="true" multiValued="true"/>
    <field name="last_modified" type="string" indexed="true" stored="true"/>
  </fields>

  <uniqueKey>solr_id</uniqueKey>
  <defaultSearchField>content</defaultSearchField>
  <solrQueryParser defaultOperator="OR"/>

  <copyField source="title" dest="content" />
</schema>

data-config.xml 文件:

<dataConfig>
  <!-- JDBC source for the MySQL database "ktimatologio"; declared without a name.
       NOTE(review): connects as root with the password stored in plain text. -->
  <dataSource type="JdbcDataSource"
              driver="com.mysql.jdbc.Driver"
              url="jdbc:mysql://127.0.0.1:3306/ktimatologio"
              user="root"
              password="1a2b3c4d"
              autoCommit="true"
              batchSize="-1"
              convertType="false"/>
  <!-- Stream source used by the nested Tika entity to read a column of its parent row. -->
  <dataSource name="fieldReader" type="FieldStreamDataSource"/>

  <document>
    <!-- Rows with type = 'text': the body column is imported as "content"
         after HTML stripping.
         NOTE(review): dataSource="db" is referenced here and below, but no
         data source with that name is declared; presumably it resolves to
         the unnamed JDBC source; confirm. -->
    <entity name="aitiologikes_ektheseis"
            dataSource="db"
            transformer="HTMLStripTransformer"
            query="select id, title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, body AS content from aitiologikes_ektheseis where type = 'text'"
            deltaImportQuery="select id, title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, body AS content from aitiologikes_ektheseis where type = 'text' and id='${dataimporter.delta.id}'"
            deltaQuery="select id, title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, body AS content from aitiologikes_ektheseis where type = 'text' and last_modified &gt; '${dataimporter.last_index_time}'">
      <field column="id" name="id"/>
      <field column="solr_id" name="solr_id"/>
      <field column="title" name="title" stripHTML="true"/>
      <field column="model" name="model" stripHTML="true"/>
      <field column="type" name="type" stripHTML="true"/>
      <field column="url" name="url" stripHTML="true"/>
      <field column="last_modified" name="last_modified" stripHTML="true"/>
      <field column="content" name="content" stripHTML="true"/>
    </entity>

    <!-- Rows with type = 'bin': the bin_con column is selected as "content"
         and handed to a nested Tika entity. All field mappings are declared
         on the nested entity. -->
    <entity name="aitiologikes_ektheseis_bin"
            dataSource="db"
            transformer="TemplateTransformer"
            query="select id, title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, bin_con AS content from aitiologikes_ektheseis where type = 'bin'"
            deltaImportQuery="select id, title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, bin_con AS content from aitiologikes_ektheseis where type = 'bin' and id='${dataimporter.delta.id}'"
            deltaQuery="select id, title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, bin_con AS content from aitiologikes_ektheseis where type = 'bin' and last_modified &gt; '${dataimporter.last_index_time}'">
      <entity dataSource="fieldReader"
              processor="TikaEntityProcessor"
              dataField="aitiologikes_ektheseis_bin.content"
              format="text">
        <field column="id" name="id"/>
        <field column="solr_id" name="solr_id"/>
        <field column="title" name="title" stripHTML="true"/>
        <field column="model" name="model" stripHTML="true"/>
        <field column="type" name="type" stripHTML="true"/>
        <field column="url" name="url" stripHTML="true"/>
        <field column="last_modified" name="last_modified" stripHTML="true"/>
        <field column="content" name="content" stripHTML="true"/>
      </entity>
    </entity>
  </document>
</dataConfig>

最后我找到了解决方案。请注意 data-config.xml 中的实体查询和列定义:

.... 
    <!-- Original (non-working) binary entity: bin_con is aliased as
         "content" and every field mapping sits on the nested Tika entity. -->
    <entity name="aitiologikes_ektheseis_bin"
      query="select id, title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, bin_con AS content from aitiologikes_ektheseis where type = 'bin'" 
      deltaImportQuery="select id, title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, bin_con AS content from aitiologikes_ektheseis where type = 'bin' and id='${dataimporter.delta.id}'"
      deltaQuery="select id, title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, bin_con AS content from aitiologikes_ektheseis where type = 'bin' and last_modified &gt; '${dataimporter.last_index_time}'"
      transformer="TemplateTransformer"
      dataSource="db">
        <!-- Nested entity reads the parent's "content" column through the
             fieldReader stream source and runs it through Tika. -->
        <entity dataSource="fieldReader" processor="TikaEntityProcessor" dataField="aitiologikes_ektheseis_bin.content" format="text">
          <field column="id" name="id" />        
          <field column="solr_id" name="solr_id" />
          <field column="title" name="title" stripHTML="true" />
          <field column="model" name="model" stripHTML="true"  />
          <field column="type" name="type" stripHTML="true"  />
          <field column="url" name="url" stripHTML="true"  />
          <field column="last_modified" name="last_modified" stripHTML="true"  />
          <field column="content" name="content" stripHTML="true" />
        </entity>
    </entity>
  </document>   

</dataConfig>
为了让 Tika 能够读取二进制内容并提取其中的文本,我必须把查询中的列别名 "content" 改为 "text"。还有一点,正确的写法是:

<!-- Corrected binary entity: bin_con is aliased as "text", the row-level
     columns are mapped on the outer SQL entity, and only the Tika output is
     mapped on the nested entity. -->
<entity name="aitiologikes_ektheseis_bin"
        dataSource="db"
        transformer="TemplateTransformer"
        query="select id, title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, bin_con AS text from aitiologikes_ektheseis where type = 'bin'"
        deltaImportQuery="select id, title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, bin_con AS text from aitiologikes_ektheseis where type = 'bin' and id='${dataimporter.delta.id}'"
        deltaQuery="select id, title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, bin_con AS text from aitiologikes_ektheseis where type = 'bin' and last_modified &gt; '${dataimporter.last_index_time}'">
  <field column="id" name="id"/>
  <field column="solr_id" name="solr_id"/>
  <field column="title" name="title"/>
  <field column="model" name="model"/>
  <field column="type" name="type"/>
  <field column="url" name="url"/>
  <field column="last_modified" name="last_modified"/>
  <!-- Streams the parent row's "text" column through Tika; the "text"
       column of this nested entity is stored in the Solr field "content". -->
  <entity dataSource="fieldReader"
          processor="TikaEntityProcessor"
          dataField="aitiologikes_ektheseis_bin.text"
          format="text">
    <field column="text" name="content"/>
  </entity>
</entity>
  </document>   

</dataConfig>

我希望这对某人有所帮助。身体健康,汤姆

最新更新