我有一个学生成绩的rdd,我需要首先按第一列(大学)对它们进行分组,然后像这样显示每门课程的平均学生人数。执行此查询的最简单方法是什么?
+----------+-----------------+
|university| avg of students |
+----------+-----------------+
|       MIT|                3|
| Cambridge|             2.66|
+----------+-----------------+
这是数据集。
/** A single grade record.
  *
  * @param university name of the university the record belongs to
  * @param courseId   identifier of the course
  * @param studentId  identifier of the student
  * @param grade      the grade value recorded for that student in that course
  */
case class grade(
  university: String,
  courseId: Int,
  studentId: Int,
  grade: Double
)
// Sample data set: 8 Cambridge records spread over courses 1-3 and
// 6 MIT records spread over courses 1-2.
val grades = List(
  grade("Cambridge", 1, 1001, 4),
  grade("Cambridge", 1, 1004, 4),
  grade("Cambridge", 2, 1006, 3.5),
  grade("Cambridge", 2, 1004, 3.5),
  grade("Cambridge", 2, 1002, 3.5),
  grade("Cambridge", 3, 1006, 3.5),
  grade("Cambridge", 3, 1007, 5),
  grade("Cambridge", 3, 1008, 4.5),
  grade("MIT", 1, 1001, 4),
  grade("MIT", 1, 1002, 4),
  grade("MIT", 1, 1003, 4),
  grade("MIT", 1, 1004, 4),
  grade("MIT", 1, 1005, 3.5),
  grade("MIT", 2, 1009, 2)
)
1)首先按大学分组
2)然后得到每所大学的课程数
3)然后按 courseId 分组
4)然后得到每门课的学生人数
// Average number of student records per course, keyed by university.
grades.groupBy(_.university).map { case (university, records) =>
  // Number of distinct courses that appear for this university.
  val distinctCourses = records.map(_.courseId).distinct.size
  // Group the records by course and add up the size of every group.
  val enrolments = records.groupBy(_.courseId).values.map(_.size).sum
  university -> (enrolments.toDouble / distinctCourses.toDouble)
}
Scala REPL
scala> val grades = List(
grade("Cambridge", 1, 1001, 4),
grade("Cambridge", 1, 1004, 4),
grade("Cambridge", 2, 1006, 3.5),
grade("Cambridge", 2, 1004, 3.5),
grade("Cambridge", 2, 1002, 3.5),
grade("Cambridge", 3, 1006, 3.5),
grade("Cambridge", 3, 1007, 5),
grade("Cambridge", 3, 1008, 4.5),
grade("MIT", 1, 1001, 4),
grade("MIT", 1, 1002, 4),
grade("MIT", 1, 1003, 4),
grade("MIT", 1, 1004, 4),
grade("MIT", 1, 1005, 3.5),
grade("MIT", 2, 1009, 2))
// grades: List[grade] = List(...)
scala> grades.groupBy(_.university).map { case (k, v) =>
val courseCount = v.map(_.courseId).distinct.length
val studentCountPerCourse = v.groupBy(_.courseId).map { case (k, v) => v.length }.sum
k -> (studentCountPerCourse.toDouble / courseCount.toDouble)
}
// res2: Map[String, Double] = Map("MIT" -> 3.0, "Cambridge" -> 2.6666666666666665)
// Average number of students per course for each university, computed on the RDD.
// The previous version averaged the courseId values themselves, which is why it
// printed 2.125 / 1.1666... instead of the expected 2.66 / 3.
// NOTE(review): assumes the elements of gradesRdd expose `university` and
// `courseId` fields (as the `grade` case class does) -- confirm where gradesRdd is built.
gradesRdd
  // One count per record, keyed by (university, course).
  .map(g => ((g.university, g.courseId), 1))
  // Number of student records in each course of each university.
  .reduceByKey(_ + _)
  // Re-key by university only, carrying (students in this course, 1 course).
  .map { case ((university, _), students) => (university, (students, 1)) }
  // Total students and total courses per university.
  .reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2))
  .mapValues { case (students, courses) => students.toDouble / courses }
  .collect
// Expected for the sample data (element order may vary):
// Array((Cambridge,2.6666666666666665), (MIT,3.0))