我不明白为什么我的 MapReduce 作业输出的第一行是一个空键和 0。输出格式为:url;访问次数
这里是映射器类:
public class WordCountMapper extends
Mapper<LongWritable, Text, Text, IntWritable>
{
public void map(LongWritable cle, Text valeur, Context sortie)
throws IOException
{
String url="";
int nbVisites=0;
Pattern httplogPattern = Pattern.compile("([^\s]+) - - \[(.+)\] "([^\s]+) (/[^\s]*) HTTP/[^\s]+" [^\s]+ ([0-9]+)");
String ligne = valeur.toString();
if (ligne.length()>0) {
Matcher matcher = httplogPattern.matcher(ligne);
if (matcher.matches()) {
url = matcher.group(1);
nbVisites = Integer.parseInt(matcher.group(5));
}
}
Text urlText = new Text(url);
IntWritable value = new IntWritable(nbVisites);
try
{
sortie.write(urlText, value);
System.out.println(urlText + " ; " + value);
}
catch (InterruptedException e)
{
e.printStackTrace();
}
}
和reducer:
public class WordCountReducer extends
Reducer<Text, IntWritable, Text, IntWritable>
{
/**
 * Adds up every value received for {@code key} and emits (key, total).
 *
 * @param key    the grouping key
 * @param values all values the mappers emitted for {@code key}
 * @param sortie context the output pair is written to
 * @throws IOException          if writing the output fails
 * @throws InterruptedException if the thread is interrupted while writing;
 *                              propagated to the caller rather than swallowed
 */
public void reduce(Text key, Iterable<IntWritable> values, Context sortie) throws IOException, InterruptedException
{
    int nb = 0;
    for (IntWritable value : values) {
        nb += value.get();
    }

    // The signature already declares InterruptedException, so let it propagate
    // instead of catching it and printing a stack trace.
    sortie.write(key, new IntWritable(nb));
    System.out.println(key.toString() + ";" + nb);
}
输入文件的每一行看起来像这样:
199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] "GET /history/apollo/ HTTP/1.0" 200 6245
下面是输出:
0
04-dynamic-c.rotterdam.luna.net 4
06-dynamic-c.rotterdam.luna.net 1
10.salc.wsu.edu 3
11.ts2.mnet.medstroms.se 1
128.100.183.222 4
128.102.149.149 4
可以看到,第一行是一对空值
谢谢
你得到的是一个空键(而不是 null),因为映射器中 url 的默认值是空字符串、nbVisites 的默认值是 0;对于不匹配的行,映射器仍然会写出这对默认值,随后 reducer 把它们累加为 0。
…
如果在写出输出之前先检查该行是否匹配,就可以正常工作。下面是你的代码的重构版本:
public class WebLogDriver extends Configured implements Tool {
public static final String APP_NAME = WebLogDriver.class.getSimpleName();
public static void main(String[] args) throws Exception {
final int status = ToolRunner.run(new Configuration(), new WebLogDriver(), args);
System.exit(status);
}
@Override
public int run(String[] args) throws Exception {
Configuration conf = getConf();
Job job = Job.getInstance(conf, APP_NAME);
job.setJarByClass(WebLogDriver.class);
// outputs for mapper and reducer
job.setOutputKeyClass(Text.class);
// setup mapper
job.setMapperClass(WebLogDriver.WebLogMapper.class);
job.setMapOutputValueClass(IntWritable.class);
// setup reducer
job.setReducerClass(WebLogDriver.WebLogReducer.class);
job.setOutputValueClass(IntWritable.class);
FileInputFormat.addInputPath(job, new Path(args[0]));
final Path outputDir = new Path(args[1]);
FileOutputFormat.setOutputPath(job, outputDir);
return job.waitForCompletion(true) ? 0 : 1;
}
static class WebLogMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
static final Pattern HTTP_LOG_PATTERN = Pattern.compile("(\S+) - - \[(.+)] "(\S+) (/\S*) HTTP/\S+" \S+ (\d+)");
final Text keyOut = new Text();
final IntWritable valueOut = new IntWritable();
@Override
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
String line = value.toString();
if (line.isEmpty()) return;
Matcher matcher = HTTP_LOG_PATTERN.matcher(line);
if (matcher.matches()) {
keyOut.set(matcher.group(1));
try {
valueOut.set(Integer.parseInt(matcher.group(5)));
context.write(keyOut, valueOut);
} catch (NumberFormatException e) {
e.printStackTrace();
}
}
}
}
static class WebLogReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
static final IntWritable valueOut = new IntWritable();
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
int nb = StreamSupport.stream(values.spliterator(), true)
.mapToInt(IntWritable::get)
.sum();
valueOut.set(nb);
context.write(key, valueOut);
}
}
}