如何在 .arff 文件中为每列添加"id",以便在 weka 中进行 k 均值聚类



这是我的arff文件(links.arff):

@relation links
@attribute isLink1Present numeric
@attribute isLink2Present numeric
@attribute isLink3Present numeric
@attribute isLink4Present numeric
@attribute isLink5Present numeric
@attribute isLink6Present numeric
@attribute isLink7Present numeric
@attribute isLink8Present numeric
@attribute isLink9Present numeric
@data
0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0
1,0,1,0,0,0,0,0,0
1,0,1,1,0,0,0,0,0
1,1,0,0,0,0,0,0,0
1,1,0,1,0,0,0,0,0
1,1,1,0,0,0,0,0,0
1,1,1,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0
1,0,0,1,0,0,0,0,0
1,0,0,1,1,0,0,0,0
1,0,1,0,0,0,0,0,0
1,0,1,0,1,0,0,0,0
1,0,1,1,0,0,0,0,0
1,0,1,1,1,0,0,0,0
1,1,0,0,0,0,0,0,0
1,1,0,0,1,0,0,0,0
1,1,0,1,0,0,0,0,0
1,1,0,1,1,0,0,0,0
1,1,1,0,0,0,0,0,0
1,1,1,0,1,0,0,0,0
1,1,1,1,0,0,0,0,0
1,1,1,1,1,0,0,0,0
1,0,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0
1,0,0,0,1,0,0,0,0
1,0,0,0,1,1,0,0,0
1,0,0,1,0,0,0,0,0
1,0,0,1,0,1,0,0,0
1,0,0,1,1,0,0,0,0
1,0,0,1,1,1,0,0,0
1,0,1,0,0,0,0,0,0
1,0,1,0,0,1,0,0,0
1,0,1,0,1,0,0,0,0
1,0,1,0,1,1,0,0,0
1,0,1,1,0,0,0,0,0
1,0,1,1,0,1,0,0,0
1,0,1,1,1,0,0,0,0
1,0,1,1,1,1,0,0,0
1,1,0,0,0,0,0,0,0
1,1,0,0,0,1,0,0,0
1,1,0,0,1,0,0,0,0
1,1,0,0,1,1,0,0,0
1,1,0,1,0,0,0,0,0
1,1,0,1,0,1,0,0,0
1,1,0,1,1,0,0,0,0
1,1,0,1,1,1,0,0,0
1,1,1,0,0,0,0,0,0
1,1,1,0,0,1,0,0,0
1,1,1,0,1,0,0,0,0
1,1,1,0,1,1,0,0,0
1,1,1,1,0,0,0,0,0
1,1,1,1,0,1,0,0,0
1,1,1,1,1,0,0,0,0
1,1,1,1,1,1,0,0,0
1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0
1,0,0,0,0,1,0,0,0
1,0,0,0,0,1,1,0,0
1,0,0,0,1,0,0,0,0
1,0,0,0,1,0,1,0,0
1,0,0,0,1,1,0,0,0
1,0,0,0,1,1,1,0,0
1,0,0,1,0,0,0,0,0
1,0,0,1,0,0,1,0,0
1,0,0,1,0,1,0,0,0
1,0,0,1,0,1,1,0,0
1,0,0,1,1,0,0,0,0
1,0,0,1,1,0,1,0,0
1,0,0,1,1,1,0,0,0
1,0,0,1,1,1,1,0,0
1,0,1,0,0,0,0,0,0
1,0,1,0,0,0,1,0,0
1,0,1,0,0,1,0,0,0
1,0,1,0,0,1,1,0,0
1,0,1,0,1,0,0,0,0
1,0,1,0,1,0,1,0,0
1,0,1,0,1,1,0,0,0
1,0,1,0,1,1,1,0,0
1,0,1,1,0,0,0,0,0
1,0,1,1,0,0,1,0,0
1,0,1,1,0,1,0,0,0
1,0,1,1,0,1,1,0,0
1,0,1,1,1,0,0,0,0
1,0,1,1,1,0,1,0,0
1,0,1,1,1,1,0,0,0
1,0,1,1,1,1,1,0,0
1,1,0,0,0,0,0,0,0
1,1,0,0,0,0,1,0,0
1,1,0,0,0,1,0,0,0
1,1,0,0,0,1,1,0,0

以下是我如何实现k-均值:

public void runKMeans(int numClusters){
    try {
        SimpleKMeans kmeans = new SimpleKMeans();
        //DistanceFunction df = new weka.core.ManhattanDistance();
        DistanceFunction df = new weka.core.EuclideanDistance();
        kmeans.setDistanceFunction(df);
        kmeans.setSeed(10);
        kmeans.setPreserveInstancesOrder(true);
        kmeans.setNumClusters(numClusters);
        String arffFile = new PropertyUtils().getProperty("datafiles-home")+"\links.arff";
        DataSource source = new DataSource(arffFile);
        Instances instances = source.getDataSet();
        //inst.setDataset(instances);
        kmeans.buildClusterer(instances);
        System.out.println(kmeans.displayStdDevsTipText());
        // This array returns the cluster number (starting with 0) for each instance
        // The array has as many elements as the number of instances
        int[] assignments = kmeans.getAssignments();
        int i=0;
        List<Cluster> lc = new ArrayList<Cluster>();
        for(int clusterNum : assignments) {
            lc.add(new Cluster((i+1) , clusterNum));
          //  System.out.println("Instance "+(i+1)+" -> Cluster "+clusterNum);
            i++;
        }
        Collections.sort(lc);
        for(Cluster c : lc){
            PrintUtils.println("Instance : "+c.getInstance()+" Cluster "+c.getCluster());
        }
        }
        catch(Exception e){
            e.printStackTrace();
        }
}

我想将每一行数据(即每个实例)与一个"name"属性相关联,这样我就可以识别每个实例。我怎样才能做到这一点?我认为不能直接在数据中添加 String 属性,因为这会影响 k-means 算法的距离计算。还有其他方法吗?

是的,您可以添加一个额外的属性来命名实例。

然后,对于 EuclideanDistance,可以使用 -R 选项或 setAttributeIndices 方法来指定计算距离时要使用的属性范围。只要把 name 属性排除在该范围之外,它就不会影响聚类结果!

最新更新