内存错误,仅仅是因为 ram 还是嵌套



我有一个关于在python中使用DBSCAN对NSL KDD数据进行聚类的研究。当我尝试使用限制 10.000 个数据运行程序时,它说 MemoryError,而在运行所有数据(NSL KDD 有 125.973 行,41 列(时,它说达到最大维度。只是因为计算机规格的问题(我使用的是8GB RAM(还是代码的问题?如何解决这个问题?最后,如何更新要保存在mySQL中的每一行的集群结果?我是蟒蛇新手,如果你认为我问了一个愚蠢的问题,我很抱歉

def set2List(NumpyArray):
list = []
for item in NumpyArray:
list.append(item.tolist())
return list 
def GenerateData():
mydb = pymysql.connect(
host="localhost",user="root", password="", database="ta")
mycursor = mydb.cursor()
mycursor.execute("SELECT * FROM data_trans LIMIT 10000")
myresult = mycursor.fetchall() 
final_result= numpy.array(myresult)
return final_result
def DBSCAN(Dataset, Epsilon,MinumumPoints,DistanceMethod = 'euclidean'):
m,n=Dataset.shape
Visited=numpy.zeros(m,'int')
Type=numpy.zeros(m)
ClustersList=[]
Cluster=[]
PointClusterNumber=numpy.zeros(m)
PointClusterNumberIndex=1
PointNeighbors=[]
DistanceMatrix = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(Dataset, DistanceMethod))
for i in xrange(m):
if Visited[i]==0:
Visited[i]=1
PointNeighbors=numpy.where(DistanceMatrix[i]<Epsilon)[0]
if len(PointNeighbors)<MinumumPoints:
Type[i]=-1
else:
for k in xrange(len(Cluster)):
Cluster.pop()
Cluster.append(i)
PointClusterNumber[i]=PointClusterNumberIndex               
PointNeighbors=set2List(PointNeighbors)    
ExpandClsuter(Dataset[i], PointNeighbors,Cluster,MinumumPoints,Epsilon,Visited,DistanceMatrix,PointClusterNumber,PointClusterNumberIndex  )
Cluster.append(PointNeighbors[:])
ClustersList.append(Cluster[:])
PointClusterNumberIndex=PointClusterNumberIndex+1
return PointClusterNumber 
def ExpandClsuter(PointToExapnd, PointNeighbors, Cluster, MinumumPoints, Epsilon, Visited, DistanceMatrix, PointClusterNumber, PointClusterNumberIndex  ):
Neighbors=[] 
for i in PointNeighbors:
if Visited[i]==0:
Visited[i]=1
Neighbors=numpy.where(DistanceMatrix[i]<Epsilon)[0]
if len(Neighbors)>=MinumumPoints: 
for j in Neighbors:
try:
PointNeighbors.index(j)
except ValueError:
PointNeighbors.append(j)
if PointClusterNumber[i]==0:
Cluster.append(i)
PointClusterNumber[i]=PointClusterNumberIndex
return
Data=GenerateData()
fig = plt.figure()
ax1=fig.add_subplot(2,1,1) #row, column, figure number
ax1.scatter(Data[:,0],Data[:,1], alpha =  0.5 ) 
Epsilon=300
MinumumPoints=50
result =DBSCAN(Data,Epsilon,MinumumPoints) 
print result     
plt.show()

错误信息:

Traceback (most recent call last):
File "<ipython-input-8-20458e6efb7c>", line 1, in <module>
runfile('C:/Users/Ji Min/Downloads/oprek.py', wdir='C:/Users/Ji Min/Downloads')
File "C:UsersJi MinAnaconda2libsite-packagesspyderutilssitesitecustomize.py", line 705, in runfile
execfile(filename, namespace)
File "C:UsersJi MinAnaconda2libsite-packagesspyderutilssitesitecustomize.py", line 87, in execfile
exec(compile(scripttext, filename, 'exec'), glob, loc)
File "C:/Users/Ji Min/Downloads/oprek.py", line 95, in <module>
result =DBSCAN(Data,Epsilon,MinumumPoints)
File "C:/Users/Ji Min/Downloads/oprek.py", line 44, in DBSCAN
DistanceMatrix = scipy.spatial.distance.squareform(scipy.spatial.distance.pdist(Dataset, DistanceMethod))
File "C:UsersJi MinAnaconda2libsite-packagesscipyspatialdistance.py", line 1652, in pdist
dm = np.empty((m * (m - 1)) // 2, dtype=np.double)
MemoryError

关键是不计算距离矩阵

距离矩阵需要太多内存。

但无论如何,该数据集是无用的。您计算的距离毫无意义,所以不要指望聚类会比这好得多......

最新更新