我已经使用Sphinx-4和HTK模型一起构建了一个在线手写识别系统。当我使用HTK自带的解码器时,识别率是89%。然而,使用相同HTK模型的Sphinx-4系统只达到了略高于50%的识别率。
这正常吗?使用HTK模型,Sphinx-4的精度还有提高的空间吗?
感谢
附加信息:
我使用LatticeDemo示例构建了Sphinx-4系统,并遵循了[http://nshmyrev.blogspot.com.tr/2009/09/using-htk-models-in-sphinx4.html]中的步骤。HTK系统的配置文件如下:
# HTK-side configuration quoted in the question (the system that reaches 89%).
TARGETKIND = MFCC
TARGETRATE = 1
NUMCEPS = 12
ENORMALISE = F
NATURALBYTEORDER = TRUE
NONUMESCAPES = TRUE
Sphinx-4的config.xml如下所示:
<?xml version="1.0" encoding="UTF-8"?>
<!-- ******************************************************** -->
<!-- biship configuration file -->
<!-- ******************************************************** -->
<config>
<!-- ******************************************************** -->
<!-- frequently tuned properties -->
<!-- ******************************************************** -->
<!-- Beam settings: picked up by the activeList component via ${absoluteBeamWidth}
     and ${relativeBeamWidth}. -->
<property name="absoluteBeamWidth"           value="0"/>
<property name="relativeBeamWidth"           value="1E-50"/>

<!-- Word-level beam settings: no component visible in this file references them. -->
<property name="absoluteWordBeamWidth"       value="20"/>
<property name="relativeWordBeamWidth"       value="1E-60"/>

<!-- Linguist weights: wordInsertionProbability and languageWeight are read by
     flatLinguist; silenceInsertionProbability is not referenced in this file. -->
<property name="wordInsertionProbability"    value="1E-16"/>
<property name="languageWeight"              value="15"/>
<property name="silenceInsertionProbability" value=".0"/>

<!-- Component aliases resolved through ${frontend} and ${recognizer} below. -->
<property name="frontend"                    value="epFrontEnd"/>
<property name="recognizer"                  value="recognizer"/>

<!-- Not referenced by any component visible in this file. -->
<property name="showCreations"               value="false"/>
<!-- ******************************************************** -->
<!-- word recognizer configuration -->
<!-- ******************************************************** -->
<component name="recognizer"
           type="edu.cmu.sphinx.recognizer.Recognizer">
    <!-- Wired to the "decoder" component defined in this file. -->
    <property name="decoder" value="decoder"/>
    <!-- Monitor components; each name matches a component in the monitors section. -->
    <propertylist name="monitors">
        <item>accuracyTracker </item>
        <item>speedTracker </item>
        <item>memoryTracker </item>
        <item>recognizerMonitor </item>
    </propertylist>
</component>
<!-- ******************************************************** -->
<!-- The Decoder configuration -->
<!-- ******************************************************** -->
<component name="decoder" type="edu.cmu.sphinx.decoder.Decoder">
    <!-- The decoder drives the search manager defined in this file. -->
    <property name="searchManager" value="searchManager"/>
    <!-- Disabled property. The original line had a blank property name and only the
         closing comment marker survived, which left an active property named " " plus
         stray text inside the component. Presumably this was the Decoder's
         featureBlockSize setting; confirm before re-enabling.
         <property name="featureBlockSize" value="50"/>
    -->
</component>
<!-- ******************************************************** -->
<!-- The Search Manager -->
<!-- ******************************************************** -->
<component name="searchManager" type="edu.cmu.sphinx.decoder.search.SimpleBreadthFirstSearchManager">
    <!-- Every value below is the name of another component defined in this file. -->
    <property name="logMath"           value="logMath"/>
    <property name="linguist"          value="flatLinguist"/>
    <property name="pruner"            value="trivialPruner"/>
    <property name="scorer"            value="threadedScorer"/>
    <property name="activeListFactory" value="activeList"/>
</component>
<!-- ******************************************************** -->
<!-- The Active Lists -->
<!-- ******************************************************** -->
<component name="activeList" type="edu.cmu.sphinx.decoder.search.PartitionActiveListFactory">
    <property name="logMath" value="logMath"/>
    <!-- Beam widths come from the global properties of the same names. -->
    <property name="absoluteBeamWidth" value="${absoluteBeamWidth}"/>
    <property name="relativeBeamWidth" value="${relativeBeamWidth}"/>
</component>
<!-- ******************************************************** -->
<!-- The Pruner -->
<!-- ******************************************************** -->
<!-- Pruner referenced by searchManager; it takes no properties here. -->
<component name="trivialPruner" type="edu.cmu.sphinx.decoder.pruner.SimplePruner"/>
<!-- ******************************************************** -->
<!-- The Scorer -->
<!-- ******************************************************** -->
<component name="threadedScorer"
           type="edu.cmu.sphinx.decoder.scorer.ThreadedAcousticScorer">
    <!-- Front end is selected by the global "frontend" property (epFrontEnd in this file). -->
    <property name="frontend" value="${frontend}"/>
</component>
<!-- ******************************************************** -->
<!-- The linguist configuration -->
<!-- ******************************************************** -->
<component name="flatLinguist"
           type="edu.cmu.sphinx.linguist.flat.FlatLinguist">
    <!-- Component references. -->
    <property name="logMath"       value="logMath"/>
    <property name="grammar"       value="jsgfGrammar"/>
    <property name="acousticModel" value="wsj"/>
    <!-- Weights taken from the global properties of the same names. -->
    <property name="wordInsertionProbability" value="${wordInsertionProbability}"/>
    <property name="languageWeight"           value="${languageWeight}"/>
    <property name="unitManager"   value="unitManager"/>
</component>
<component name="jsgfGrammar"
           type="edu.cmu.sphinx.jsgf.JSGFGrammar">
    <!-- Grammar named "word", looked up under grammarLocation. -->
    <property name="grammarLocation" value="/home/efbilgin/HMM-Exp/HTK-9_Feats_5v/sphinx/"/>
    <property name="dictionary"      value="dictionary"/>
    <property name="grammarName"     value="word"/>
    <property name="logMath"         value="logMath"/>
    <!-- Silence and filler words are both switched off for this grammar. -->
    <property name="addSilenceWords" value="false"/>
    <property name="addFillerWords"  value="false"/>
</component>
<!-- ******************************************************** -->
<!-- The Dictionary configuration -->
<!-- ******************************************************** -->
<component name="dictionary"
           type="edu.cmu.sphinx.linguist.dictionary.FastDictionary">
    <!-- Both paths are absolute. The filler path previously lacked its leading "/",
         so it would have been resolved relative to the working directory. -->
    <property name="dictionaryPath" value="/home/efbilgin/HMM-Exp/HTK-9_Feats_5v/sphinx/7266.dic"/>
    <property name="fillerPath" value="/home/efbilgin/HMM-Exp/HTK-9_Feats_5v/sphinx/7266.filler"/>
    <property name="addSilEndingPronunciation" value="false"/>
    <!-- "<" may not appear literally inside an attribute value, so the <sil> token
         is written with entity references; the parsed value is unchanged. -->
    <property name="wordReplacement" value="&lt;sil&gt;"/>
    <property name="allowMissingWords" value="true"/>
</component>
<!-- ******************************************************** -->
<!-- The acoustic model configuration -->
<!-- ******************************************************** -->
<!-- Acoustic model component; it is named "wsj" but its loader reads the HTK
     hmmdefs file given below. -->
<component name="wsj"
           type="edu.cmu.sphinx.linguist.acoustic.tiedstate.TiedStateAcousticModel">
    <property name="loader"      value="wsjLoader"/>
    <property name="unitManager" value="unitManager"/>
</component>
<!-- Loader type is HTKLoader; the original trailing note suggests it replaced Sphinx3Loader. -->
<component name="wsjLoader"
           type="edu.cmu.sphinx.linguist.acoustic.tiedstate.HTKLoader">
    <property name="logMath"     value="logMath"/>
    <property name="unitManager" value="unitManager"/>
    <property name="modelDefinition" value="/home/efbilgin/HMM-Exp/UNIPEN_1000_10_Eq_J-1/hmm7/hmmdefs"/>
</component>
<!-- ******************************************************** -->
<!-- The unit manager configuration -->
<!-- ******************************************************** -->
<!-- Shared by flatLinguist, the acoustic model and its loader. -->
<component name="unitManager" type="edu.cmu.sphinx.linguist.acoustic.UnitManager"/>
<!-- ******************************************************** -->
<!-- The frontend configuration -->
<!-- ******************************************************** -->
<component name="epFrontEnd"
           type="edu.cmu.sphinx.frontend.FrontEnd">
    <!-- Processing stages in pipeline order; each item names a component defined in this file.
         NOTE(review): an audio-processing chain (audioFileDataSource through featureExtraction)
         is followed by streamHTKSource as the final stage. Confirm which of these actually
         supplies the features handed to the scorer, and that they match the HTK features. -->
    <propertylist name="pipeline">
        <item>audioFileDataSource </item>
        <item>dataBlocker </item>
        <item>speechClassifier </item>
        <item>speechMarker </item>
        <item>nonSpeechDataFilter </item>
        <item>preemphasizer </item>
        <item>windower </item>
        <item>fft </item>
        <item>melFilterBank </item>
        <item>dct </item>
        <item>liveCMN </item>
        <item>featureExtraction </item>
        <item>streamHTKSource</item>
    </propertylist>
</component>
<!-- HTK feature source.
     NOTE(review): cepstrumLength is 39 here; confirm it matches the vector size of the
     HTK feature files being decoded. -->
<component name="streamHTKSource"
           type="edu.cmu.sphinx.frontend.util.StreamHTKCepstrum">
    <property name="cepstrumLength" value="39"/>
</component>

<!-- Input sources. "microphone" is defined but is not listed in the epFrontEnd pipeline. -->
<component name="audioFileDataSource"
           type="edu.cmu.sphinx.frontend.util.AudioFileDataSource"/>
<component name="microphone"
           type="edu.cmu.sphinx.frontend.util.Microphone">
    <property name="closeBetweenUtterances" value="false"/>
</component>

<!-- Blocking and endpointing stages. -->
<component name="dataBlocker"
           type="edu.cmu.sphinx.frontend.DataBlocker"/>
<component name="speechClassifier"
           type="edu.cmu.sphinx.frontend.endpoint.SpeechClassifier">
    <property name="threshold" value="1"/>
</component>
<component name="nonSpeechDataFilter"
           type="edu.cmu.sphinx.frontend.endpoint.NonSpeechDataFilter"/>
<component name="speechMarker"
           type="edu.cmu.sphinx.frontend.endpoint.SpeechMarker">
    <property name="speechTrailer" value="50"/>
</component>

<!-- Signal-processing stages, all with default properties. -->
<component name="preemphasizer"
           type="edu.cmu.sphinx.frontend.filter.Preemphasizer"/>
<component name="windower"
           type="edu.cmu.sphinx.frontend.window.RaisedCosineWindower"/>
<component name="fft"
           type="edu.cmu.sphinx.frontend.transform.DiscreteFourierTransform"/>
<component name="melFilterBank"
           type="edu.cmu.sphinx.frontend.frequencywarp.MelFrequencyFilterBank"/>
<component name="dct"
           type="edu.cmu.sphinx.frontend.transform.DiscreteCosineTransform"/>
<component name="liveCMN"
           type="edu.cmu.sphinx.frontend.feature.LiveCMN"/>
<component name="featureExtraction"
           type="edu.cmu.sphinx.frontend.feature.DeltasFeatureExtractor"/>
<!-- ******************************************************* -->
<!-- monitors -->
<!-- ******************************************************* -->
<!-- Each tracker attaches to the recognizer named by the global "recognizer" property;
     all of their show* output switches are set to false. -->
<component name="accuracyTracker"
           type="edu.cmu.sphinx.instrumentation.BestPathAccuracyTracker">
    <property name="recognizer"         value="${recognizer}"/>
    <property name="showRawResults"     value="false"/>
    <property name="showAlignedResults" value="false"/>
</component>

<component name="memoryTracker"
           type="edu.cmu.sphinx.instrumentation.MemoryTracker">
    <property name="recognizer"  value="${recognizer}"/>
    <property name="showDetails" value="false"/>
    <property name="showSummary" value="false"/>
</component>

<!-- The speed tracker also receives the front end selected by ${frontend}. -->
<component name="speedTracker"
           type="edu.cmu.sphinx.instrumentation.SpeedTracker">
    <property name="recognizer"  value="${recognizer}"/>
    <property name="frontend"    value="${frontend}"/>
    <property name="showDetails" value="false"/>
</component>

<component name="recognizerMonitor"
           type="edu.cmu.sphinx.instrumentation.RecognizerMonitor">
    <property name="recognizer" value="${recognizer}"/>
    <propertylist name="allocatedMonitors">
        <item>configMonitor </item>
    </propertylist>
</component>

<component name="configMonitor"
           type="edu.cmu.sphinx.instrumentation.ConfigMonitor">
    <property name="showConfig" value="false"/>
</component>
<!-- ******************************************************* -->
<!-- Miscellaneous components -->
<!-- ******************************************************* -->
<!-- Log-math helper referenced by the search manager, active list, linguist,
     grammar and model loader. -->
<component name="logMath"
           type="edu.cmu.sphinx.util.LogMath">
    <property name="logBase"     value="1.0001"/>
    <property name="useAddTable" value="true"/>
</component>
</config>
好吧,性能问题很难检测,你可以做几件事:
1) 增大束宽(beam width),以确保问题不是由剪枝造成的
2) 分析单词错误。错误主要是插入还是删除?最常被混淆的单词有哪些?
3) Sphinx4是一个语音识别器,例如,它会自动向语法中添加填充词。它在最新版本中也有背景噪声循环。确保您禁用了这些。例如,Sphinx4 HTK加载程序在模型中需要特定的符号。
4) 确保s4和HTK中的特征提取是相同的。