我正尝试按照《Apache Mahout Cookbook》中的示例,对合成控制数据(synthetic control data)运行 Canopy 聚类。但是,结果不是得到 6 个簇,而是得到 600 个——数据集中的每个样本各自成为一个簇。
c -0{n=1 c=[0:28.781, 1:34.463, 2:31.338, 3:31.283, 4:28.921, 5:33.760, 6:25.397, 7:27.785, 8:35.248, 9:27.116, 10:32.872, 11:29.217, 12:36.025, 16:32.872, 17:34.117, 18:26.524, 19:27. 624, 21:25.774, 22:29.270, 25:30.733, 26:29.505, 27:33.029, 28:25.040, 32:26 .917, 32:24.344, 33:26.120, 34:34.942, 36:26.631, 37:35.654, 38:28. 48:29 . 34:30 .977, 44:27.044, 44:27. 034, 44:27. 034, 44:27. 034, 44:27. 034, 44:27. 034, 44:26 . 034, 44:27. 534, 46:26. 534, 46:26. 534, 46:26. 534, 46:26. 534, 46:26. 534, 46:26. 534, 46:26. 534, 46:26. 534, 46:26. 534, 46:26. 534, 46:26. 534, 46:26. 534, 46:26. 534, 46:26. 534, 46:26. 534, 46:26. 534, 46:26. 534, 46:26. 235,47:28 .996, 48:32.004, 49:32 .004, 49:31. 096, 48:32.004, 49:31.056,50:34.255、51:28.072 52:28.940、53:35.497 54:29.747,56:31.433,57:24.556 58:33.743 59:25.047 60:34.932]r = []}
c -1{n=1 c=[0:24.892, 1:25.741, 3:27.553, 4:32.822, 5:32. 879, 6:31.593, 7:31.486, 8:35.547, 9:27.952, 10:31.660, 11:27.542, 12:31. 891, 16:27.811, 18:24.488, 22:35. 592, 23:31. 445, 25:33 .472, 28:31.987, 29:33.662, 30:25.551, 31:30.469, 32:33. 670, 34:34.077, 34:28 .598, 34:24 . 35:24 .149, 37:26. 516, 43:24 .791, 44:35.952, 45:26. 516, 46:24.858, 47:25.956, 48:32.836,49:28.532、50:26.346 51:30.621,52:28.986、53:29.405 54:32.558,55:31.021,56:26.642,57:28.433 58:33.656 59:26.424 60:28.466]r = []}
c -2{n=1 c=[0:31.399, 1:30.632, 2:26.398, 3:24. 291,5:28 .549, 6:24.972, 7:32.436, 8:25.224, 9:27.307, 10:31.839, 11:27. 532, 19:34.120, 20:26.934, 21:31.478, 22:35.017, 23:32.385, 24:24.332 .200, 26:31.245, 27:26.681, 28:28 .514, 30:27.309, 31:24.246, 33:26.963, 34:35 . 292,36:32 .611, 36:32 . 613, 37:32 . 456, 43:26. 858, 34:26 .806, 40:35.125, 41:32.629, 42:32 .056, 43:26.358, 44:28.086, 45:31.439,46:27.306、47:29.608 48:35.973,49:34.144、50:27.172 51:33.632,52:26.597,53:25.539,54:32.543,55:25.577,56:29.990 57:31.351 59:33.900 60:29.545]r = []}
c -3{n=1 c=[0:25.774, 2:30.526, 3:35.421, 4:25.603, 5:27.970, 8:25. 970, 9:28.132, 11:29.427 .455, 13:27. 392, 16:29 .958, 19:30. 445, 21:24.304, 22:24.314, 24:35.097, 25:25.368, 28:25. 326, 30:31.626, 31:29.281, 32:34. 22,35:25 .508, 34:32. 367, 31:25 .527, 36:26 .824, 38:27.559, 44:24. 371, 45:27.608, 46:27.843, 47:29.856, 47:32 . 819, 47:29. 819, 49:31 .321, 51:29.385,52:34.334、53:24.738 54:35.769、56:31.873 57:34.205 58:31.156 60:34.629]r = []}
依次类推,直到C-600。
有谁能想出一个原因吗?
我使用的命令如下:
mahout canopy -i $WORK_DIR/sequencefile/synthetic_control.seq \
  -o $WORK_DIR/output/canopy.output -t1 80 -t2 55
我在 Hadoop 1.2.1 上使用 Mahout 0.9。书中的例子是 0.9 版本的 Mahout,这个命令的调用方式有变化吗?
我甚至尝试使用不同的t1和t2值,但结果是相同的。
谢谢
Canopy 的用途是为 K-means 的参数 k 提供一个初始估计。它对 t1 和 t2 的取值过于敏感,在我看来实用价值不大。正因如此,它已被弃用。
Mahout 中没有很好的替代方案,但你可以看看流式 k-means(streaming k-means),或者用不同的 k 运行 k-means 并对结果使用 clusterdump,以此找出最适合你实际数据的 k——即簇内内聚度最高、簇间分离度最大的那个。