我一直在尝试在MongoDB中使用MapReduce来完成我认为简单的过程。我不知道这是否是正确的方法,也不知道我是否应该使用MapReduce。我在谷歌上搜索了我想到的关键词,并试图找到我认为最成功的文档,但一无所获。也许我想得太多了?
我有两个集合:details
和gpas
details
是由一大堆(300多万)文档组成的。studentid
元素可以重复两次,每个year
一次,如下所示:
{ "_id" : ObjectId("4d49b7yah5b6d8372v640100"), "classes" : [1,17,19,21], "studentid" : "12345a", "year" : 1}
{ "_id" : ObjectId("4d76b7oij7s2d8372v640100"), "classes" : [2,12,19,22], "studentid" : "98765a", "year" : 1}
{ "_id" : ObjectId("4d49b7oij7s2d8372v640100"), "classes" : [32,91,101,217], "studentid" : "12345a", "year" : 2}
{ "_id" : ObjectId("4d76b7rty7s2d8372v640100"), "classes" : [1,11,18,22], "studentid" : "24680a", "year" : 1}
{ "_id" : ObjectId("4d49b7oij7s2d8856v640100"), "classes" : [32,99,110,215], "studentid" : "98765a", "year" : 2}
...
gpas
包含与details
中相同的studentid
的元素。每个studentid
只有一个条目,如下所示:
{ "_id" : ObjectId("4d49b7yah5b6d8372v640111"), "studentid" : "12345a", "overall" : 97, "subscore": 1}
{ "_id" : ObjectId("4f76b7oij7s2d8372v640213"), "studentid" : "98765a", "overall" : 85, "subscore": 5}
{ "_id" : ObjectId("4j49b7oij7s2d8372v640871"), "studentid" : "24680a", "overall" : 76, "subscore": 2}
...
最后,我想有一个集合,每个学生一行,格式如下:
{ "_id" : ObjectId("4d49b7yah5b6d8372v640111"), "studentid" : "12345a", "classes_1": [1,17,19,21], "classes_2": [32,91,101,217], "overall" : 97, "subscore": 1}
{ "_id" : ObjectId("4f76b7oij7s2d8372v640213"), "studentid" : "98765a", "classes_1": [2,12,19,22], "classes_2": [32,99,110,215], "overall" : 85, "subscore": 5}
{ "_id" : ObjectId("4j49b7oij7s2d8372v640871"), "studentid" : "24680a", "classes_1": [1,11,18,22], "classes_2": [], "overall" : 76, "subscore": 2}
...
我要做的是这样运行MapReduce:
// Map function for the `details` collection.
// Invoked with `this` bound to one details document; emits a single value
// keyed by the student's id. The GPA fields are zero-filled placeholders so
// the emitted value carries the same keys as the one produced by mapGpas.
var mapDetails = function() {
    var detailRecord = {
        studentid: this.studentid,
        classes: this.classes,
        year: this.year,
        overall: 0,
        subscore: 0
    };
    emit(this.studentid, detailRecord);
};
// Map function for the `gpas` collection.
// Invoked with `this` bound to one gpas document; emits a single value keyed
// by the student's id. `year` is set to 0 as a marker that the reduce
// function checks to recognise a GPA record; `classes` is an empty placeholder.
var mapGpas = function() {
    var gpaRecord = {
        studentid: this.studentid,
        classes: [],
        year: 0,
        overall: this.overall,
        subscore: this.subscore
    };
    emit(this.studentid, gpaRecord);
};
// Reduce function shared by both map passes.
//
// key    - the student id the values were emitted under (not used here).
// values - the values collected for that key.
//
// Returns one object with classes_1 / classes_2 / overall / subscore.
//
// Note on behaviour: the returned object has no `year` and no `classes`
// field. When such a result is passed back in as one of `values`, it matches
// neither the year 0, year 1 nor year 2 test below, so only its studentid is
// carried over and its classes_1 / classes_2 contents are not copied.
var reduce = function(key, values) {
    var merged = { studentid: "0", classes_1: [], classes_2: [], overall: 0, subscore: 0};

    function absorb(entry) {
        // year 0 marks a GPA record: take the scores and nothing else.
        if (entry.year == 0) {
            merged.overall = entry.overall;
            merged.subscore = entry.subscore;
            return;
        }
        // Anything else is treated as a details record.
        if (entry.year == 1) {
            merged.classes_1 = entry.classes;
        }
        if (entry.year == 2) {
            merged.classes_2 = entry.classes;
        }
        merged.studentid = entry.studentid;
    }

    values.forEach(absorb);
    return merged;
};
// First pass: map/reduce the details collection, writing into 'joined'.
// With the {reduce: ...} output mode, a key that already exists in the target
// collection is merged with the new result by calling the reduce function again.
res = db.details.mapReduce(mapDetails, reduce, {out: {reduce: 'joined'}})
// Second pass: map/reduce the gpas collection into the same 'joined' collection.
res = db.gpas.mapReduce(mapGpas, reduce, {out: {reduce: 'joined'}})
但当我运行它时,这是我得到的集合:
{ "_id" : "12345a", "value" : { "studentid" : "12345a", "classes_1" : [ ], "classes_2" : [ ], "overall" : 97, "subscore" : 1 } }
{ "_id" : "98765a", "value" : { "studentid" : "98765a", "classes_1" : [ ], "classes_2" : [ ], "overall" : 85, "subscore" : 5 } }
{ "_id" : "24680a", "value" : { "studentid" : "24680a", "classes_1" : [ ], "classes_2" : [ ], "overall" : 76, "subscore" : 2 } }
结果中缺少classes数组(classes_1和classes_2都是空的)。
此外,顺便说一句,我如何访问生成的MapReduce value
元素中的元素?MapReduce是否总是输出到value
或其他名称?
这与MongoDB用户Google Groups上的一个问题类似
https://groups.google.com/group/mongodb-user/browse_thread/thread/60a8b683e2626ada?pli=1
答案参考了一个在线教程,该教程与您的示例类似:http://tebros.com/2011/07/using-mongodb-mapreduce-to-join-2-collections/
有关MongoDB中MapReduce的更多信息,请参阅文档:http://www.mongodb.org/display/DOCS/MapReduce
此外,在MongoDB Cookbook题为"使用版本化文档查找最大值和最小值"的文章的"Extras"部分,有一个关于MapReduce操作如何工作的有用的分步演练:http://cookbook.mongodb.org/patterns/finding_max_and_min/
如果你已经阅读了一些参考文件,请原谅我。我把它们包括在内是为了其他用户的利益,他们可能正在阅读这篇文章,并且是在MongoDB 中使用MapReduce的新手
Map函数中"emit"语句输出的值,其格式必须与Reduce函数返回值的格式一致,这一点很重要。如果Map函数对某个键只emit了一个文档,Reduce函数就可能根本不会针对该键运行,这样输出集合中的文档格式就会不一致。
我稍微修改了您的map语句,以便使用两个独立的"类"数组以所需输出的格式发送文档
我还重新编写了reduce函数,将新的课程添加到classes_1和classes_2数组中,前提是它们还不存在于数组中。
// Map function for the `details` collection.
// Invoked with `this` bound to one details document. The document's class
// list is placed in classes_1 when year is 1 and in classes_2 when year is 2;
// the other list is left empty. The GPA fields are zero-filled so the emitted
// value has the same keys as the one produced by mapGpas.
var mapDetails = function() {
    var isYearOne = this.year == 1;
    var isYearTwo = this.year == 2;
    emit(this.studentid, {
        studentid: this.studentid,
        classes_1: isYearOne ? this.classes : [],
        classes_2: isYearTwo ? this.classes : [],
        year: this.year,
        overall: 0,
        subscore: 0
    });
};
// Map function for the `gpas` collection.
// Invoked with `this` bound to one gpas document; emits the scores keyed by
// student id. Both class lists are empty placeholders and `year` is 0, the
// marker the reduce function uses to recognise a GPA record.
var mapGpas = function() {
    var gpaRecord = {
        studentid: this.studentid,
        classes_1: [],
        classes_2: [],
        year: 0,
        overall: this.overall,
        subscore: this.subscore
    };
    emit(this.studentid, gpaRecord);
};
// Reduce function shared by both map passes.
//
// key    - the student id the values were emitted under (not used here).
// values - the values collected for that key: emitted map values and/or
//          objects previously returned by this function.
//
// Returns { studentid, classes_1, classes_2, overall, subscore }, where each
// class list is the union (without duplicates, in first-seen order) of the
// corresponding lists in `values`.
//
// The returned object carries no `year`, so when it is passed back in as one
// of `values` it contributes its class lists and studentid but not its scores.
var r = function(key, values) {
    var outs = { studentid: "0", classes_1: [], classes_2: [], overall: 0, subscore: 0};

    // Append every item of `items` that `target` does not already contain.
    // A missing list is treated as empty instead of throwing.
    // (The callback parameter must not be named `class`: it is a reserved word.)
    var addUnique = function(target, items) {
        (items || []).forEach(function(item) {
            if (target.indexOf(item) == -1) {
                target.push(item);
            }
        });
    };

    values.forEach(function(v) {
        outs.studentid = v.studentid;
        addUnique(outs.classes_1, v.classes_1);
        addUnique(outs.classes_2, v.classes_2);
        // year 0 marks a value that came from the gpas collection.
        if (v.year == 0) {
            outs.overall = v.overall;
            outs.subscore = v.subscore;
        }
    });
    return outs;
};
// First pass: map/reduce the details collection, writing into 'joined'.
// With the {reduce: ...} output mode, a key that already exists in the target
// collection is merged with the new result by calling the reduce function again.
res = db.details.mapReduce(mapDetails, r, {out: {reduce: 'joined'}})
// Second pass: map/reduce the gpas collection into the same 'joined' collection.
res = db.gpas.mapReduce(mapGpas, r, {out: {reduce: 'joined'}})
运行两个MapReduce操作会得到以下集合,该集合与您想要的格式相匹配:
> db.joined.find()
{ "_id" : "12345a", "value" : { "studentid" : "12345a", "classes_1" : [ 1, 17, 19, 21 ], "classes_2" : [ 32, 91, 101, 217 ], "overall" : 97, "subscore" : 1 } }
{ "_id" : "24680a", "value" : { "studentid" : "24680a", "classes_1" : [ 1, 11, 18, 22 ], "classes_2" : [ ], "overall" : 76, "subscore" : 2 } }
{ "_id" : "98765a", "value" : { "studentid" : "98765a", "classes_1" : [ 2, 12, 19, 22 ], "classes_2" : [ 32, 99, 110, 215 ], "overall" : 85, "subscore" : 5 } }
>
MapReduce始终以{_id:"id",value:"value"}的形式输出文档。在标题为"Dot Notation (Reaching into Objects)"(点表示法:深入对象内部)的文档中,有更多关于使用子文档的信息:http://www.mongodb.org/display/DOCS/Dot+Notation+%28Reaching+into+Objects%29
如果您希望MapReduce的输出以不同的格式显示,则必须在应用程序中以编程方式执行此操作。
希望这将提高您对MapReduce的理解,并使您离生成所需的输出集合又近了一步。祝你好运
您不能对此使用m/r,因为它被设计为仅应用于一个集合。从多个集合中读取会破坏分片兼容性,因此是不允许的。您可以使用新的聚合框架(2.1+)或在应用程序中执行您想要的操作。