I cannot run a Java program that imports org.apache.lucene.analysis.Analyzer. Here is the code:
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.mahout.classifier.naivebayes.BayesUtils;
import org.apache.mahout.classifier.naivebayes.NaiveBayesModel;
import org.apache.mahout.classifier.naivebayes.StandardNaiveBayesClassifier;
import org.apache.mahout.common.Pair;
import org.apache.mahout.common.iterator.sequencefile.SequenceFileIterable;
import org.apache.mahout.math.RandomAccessSparseVector;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.Vector.Element;
import org.apache.mahout.vectorizer.DefaultAnalyzer;
import org.apache.mahout.vectorizer.TFIDF;
import com.google.common.collect.ConcurrentHashMultiset;
import com.google.common.collect.Multiset;
public class Classifier {
public static Map<String, Integer> readDictionnary(Configuration conf, Path dictionnaryPath) {
Map<String, Integer> dictionnary = new HashMap<String, Integer>();
for (Pair<Text, IntWritable> pair : new SequenceFileIterable<Text, IntWritable>(dictionnaryPath, true, conf)) {
dictionnary.put(pair.getFirst().toString(), pair.getSecond().get());
}
return dictionnary;
}
public static Map<Integer, Long> readDocumentFrequency(Configuration conf, Path documentFrequencyPath) {
Map<Integer, Long> documentFrequency = new HashMap<Integer, Long>();
for (Pair<IntWritable, LongWritable> pair : new SequenceFileIterable<IntWritable, LongWritable>(documentFrequencyPath, true, conf)) {
documentFrequency.put(pair.getFirst().get(), pair.getSecond().get());
}
return documentFrequency;
}
public static void main(String[] args) throws Exception {
System.out.println("Start time :" + System.currentTimeMillis());
if (args.length < 5) {
System.out.println("Arguments: [model] [label index] [dictionnary] [document frequency] [tweet file]");
return;
}
String modelPath = args[0];
String labelIndexPath = args[1];
String dictionaryPath = args[2];
String documentFrequencyPath = args[3];
String testFilePath = args[4];
Configuration configuration = new Configuration();
// model is a matrix (wordId, labelId) => probability score
NaiveBayesModel model = NaiveBayesModel.materialize(new Path(modelPath), configuration);
StandardNaiveBayesClassifier classifier = new StandardNaiveBayesClassifier(model);
// labels is a map label => classId
Map<Integer, String> labels = BayesUtils.readLabelIndex(configuration, new Path(labelIndexPath));
Map<String, Integer> dictionary = readDictionnary(configuration, new Path(dictionaryPath));
Map<Integer, Long> documentFrequency = readDocumentFrequency(configuration, new Path(documentFrequencyPath));
// analyzer used to extract word from tweet
Analyzer analyzer = new DefaultAnalyzer();
int labelCount = labels.size();
int documentCount = documentFrequency.get(-1).intValue();
System.out.println("Number of labels: " + labelCount);
System.out.println("Number of documents in training set: " + documentCount);
BufferedReader reader = new BufferedReader(new FileReader(testFilePath));
String outputFile = "/home/hduser/result.txt";
FileWriter f1 = new FileWriter(outputFile,true);
BufferedWriter out = new BufferedWriter(f1);
int correctCounter=0;
int totalCounter=0;
while(true)
{
String line = reader.readLine();
if (line == null) {
break;
}
String[] arr = line.split(" ");
String catId = arr[0];
String label = arr[1];
String msg = line.substring(arr[0].length() + arr[1].length() + 2);
Multiset<String> words = ConcurrentHashMultiset.create();
// extract words from Msg
TokenStream ts = analyzer.reusableTokenStream("text", new StringReader(msg));
CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
ts.reset();
int wordCount = 0;
while (ts.incrementToken()) {
if (termAtt.length() > 0) {
String word = ts.getAttribute(CharTermAttribute.class).toString();
Integer wordId = dictionary.get(word);
// if the word is not in the dictionary, skip it
if (wordId != null) {
words.add(word);
wordCount++;
}
}
}
// create vector wordId => weight using tfidf
Vector vector = new RandomAccessSparseVector(10000);
TFIDF tfidf = new TFIDF();
for (Multiset.Entry<String> entry:words.entrySet()) {
String word = entry.getElement();
int count = entry.getCount();
Integer wordId = dictionary.get(word);
Long freq = documentFrequency.get(wordId);
double tfIdfValue = tfidf.calculate(count, freq.intValue(), wordCount, documentCount);
vector.setQuick(wordId, tfIdfValue);
}
// With the classifier, we get one score for each label
// The label with the highest score is the one the tweet is more likely to
// be associated to
Vector resultVector = classifier.classifyFull(vector);
//double bestScore = -Double.MAX_VALUE;
double bestScore =Double.MAX_VALUE;
int bestCategoryId = -1;
String resultStr=catId+" ";
for(Element element: resultVector)
{
int categoryId = element.index();
double score = -1 * element.get();
if (score < bestScore) {
bestScore = score;
bestCategoryId = categoryId;
}
//System.out.print(" " + labels.get(categoryId) + ": " + score);
if(resultStr.equalsIgnoreCase(catId + " "))
{
resultStr=resultStr + labels.get(categoryId) + " " + score;
}
else
{
resultStr=resultStr + " " + labels.get(categoryId) + " " + score;
}
}
try
{
out.write(resultStr);
out.write("n");
}
catch(Exception e)
{
}
//System.out.println(label + " => " + labels.get(bestCategoryId));
out.write(label + " => " + labels.get(bestCategoryId));
out.write("\n");
totalCounter++;
if(label.equalsIgnoreCase(labels.get(bestCategoryId)))
{
correctCounter++;
System.out.println("correctCounter : " + correctCounter);
}
};
//Close the output stream
System.out.println("correctCounter : " + correctCounter + " TotalCounter :" + totalCounter);
System.out.println("End time :" + System.currentTimeMillis());
System.out.println("Accuracy : " + (double)correctCounter/totalCounter);
out.close();
}
}
Configuration on my Ubuntu system: Hadoop 1.2.0, Mahout 0.7, Lucene 1.4.1, Java 1.6.
I compiled the Java program without any errors and generated a jar file:
javac -classpath /usr/local/hadoop/*:/usr/local/mahout/*:/usr/local/lucene/* -d Classifier_Class/ Classifier.java
jar -cvf Classify.jar -C Classifier_Class/ .
When I try to execute this jar with Hadoop, I get the following error:
hadoop jar Classify.jar {input arguments}
Exception in thread "main" java.lang.NoClassDefFoundError: org/apache/lucene/analysis/Analyzer
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:270)
at org.apache.hadoop.util.RunJar.main(RunJar.java:153)
Caused by: java.lang.ClassNotFoundException: org.apache.lucene.analysis.Analyzer
at java.net.URLClassLoader$1.run(URLClassLoader.java:217)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:205)
at java.lang.ClassLoader.loadClass(ClassLoader.java:323)
at java.lang.ClassLoader.loadClass(ClassLoader.java:268)
... 3 more
Your problem is that when you built your application's jar, the Lucene dependency jars were not included in it, so Java cannot find the Lucene classes your application uses. I ran into the same problem with the same stack trace.
You would be better off using Maven to build your project.
What you want to do is build your source together with its dependencies, so that the compiled application includes the Lucene jars.
To do this, just add these lines to your Maven project's pom.xml:
<build>
  <plugins>
    <plugin>
      <artifactId>maven-assembly-plugin</artifactId>
      <version>2.4</version>
      <configuration>
        <descriptorRefs>
          <descriptorRef>jar-with-dependencies</descriptorRef>
        </descriptorRefs>
      </configuration>
      <executions>
        <execution>
          <id>make-assembly</id>
          <phase>package</phase>
          <goals>
            <goal>single</goal>
          </goals>
        </execution>
      </executions>
    </plugin>
  </plugins>
</build>
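The jar-with-dependencies assembly only bundles what Maven resolves, so the pom also needs the project's dependencies declared. Below is a minimal sketch of that section; the coordinates and versions are assumptions based on the stack described above and should be adjusted to whatever your code actually compiles against:
<dependencies>
  <dependency>
    <!-- marked provided so the assembly does not bundle Hadoop itself; the hadoop jar command puts Hadoop on the classpath at runtime -->
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-core</artifactId>
    <version>1.2.0</version>
    <scope>provided</scope>
  </dependency>
  <dependency>
    <!-- mahout-core should pull in a compatible lucene-core transitively; if org.apache.lucene.analysis.Analyzer still cannot be resolved, declare lucene-core explicitly with the version your code compiles against -->
    <groupId>org.apache.mahout</groupId>
    <artifactId>mahout-core</artifactId>
    <version>0.7</version>
  </dependency>
</dependencies>
With these in place, mvn package produces both the plain jar and the APP_NAME-jar-with-dependencies.jar under target/.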
Now run your application: hadoop jar APP_NAME-jar-with-dependencies.jar
All of the above will solve your problem.