我正在尝试创建一个模拟蛋白质的GNN。然而,我在GraphConv
中遇到了一个错误(在GCNConv
中也遇到了同样的错误(。我不明白为什么在形状应该可以相乘的情况下会出现这个错误。我认为这个错误一定与我创建的自定义数据集有关,但我不能100%确定。如果你有类似的问题或知道如何解决,请告诉我。非常感谢。
编辑:即使我将embedding_size
更改为1479
,我仍然得到:RuntimeError: mat1 and mat2 shapes cannot be multiplied (1479x1 and 1479x1479)
。
自定义数据集:
class ProteinDataset(geom_data.Dataset):
def __init__(self, root, transform=None, pre_transform=None):
# root = where data set is stored
super(ProteinDataset, self).__init__(root, transform, pre_transform)
self.root = root
@property
def raw_file_names(self):
return os.listdir(f'{self.root}/raw')
@property
def processed_file_names(self):
inxs = []
for pdb in self.raw_paths:
inxs.append(pdb.split('/')[-1].split('.p')[0])
return [f'{i}.pt' for i in inxs]
def download(self):
pass
def process(self):
for pdb in self.raw_paths:
try:
mol_obj = Chem.rdmolfiles.MolFromPDBFile(pdb)
except AttributeError:
os.remove(pdb)
continue
# Get node features
node_feats = self._get_node_features(mol_obj).reshape([-1,1])
# Get edge features
edge_feats = self._get_edge_features(mol_obj).reshape([-1,1])
# Get adjacency info
edge_index = self._get_adjacency_info(mol_obj)
label = self._get_labels(pdb)
# Create Data object
data = geom_data.Data(x=node_feats,
edge_index=edge_index,
edge_attr=edge_feats,
y=label)
i = pdb.split('/')[-1].split('.p')[0]
torch.save(data, os.path.join(self.processed_dir,f'{i}.pt'))
def _get_node_features(self, mol):
all_node_feats = []
for atom in mol.GetAtoms():
all_node_feats.append(atom.GetMass())
all_node_feats = np.asarray(all_node_feats)
return torch.tensor(all_node_feats, dtype=torch.float)
def _get_edge_features(self, mol):
all_edge_feats = []
dists = Chem.rdmolops.Get3DDistanceMatrix(mol)
# CA-CA Distances
for bond in mol.GetBonds():
begin = bond.GetBeginAtomIdx()
end = bond.GetEndAtomIdx()
all_edge_feats.append(dists[begin,end])
all_edge_feats = np.asarray(all_edge_feats)
return torch.tensor(all_edge_feats, dtype=torch.float)
def _get_adjacency_info(self, mol):
adj_matrix = Chem.rdmolops.GetAdjacencyMatrix(mol)
row, col = np.where(adj_matrix)
coo = np.array(list(zip(row, col)))
coo = np.reshape(coo, (2, -1))
return torch.tensor(coo, dtype=torch.long)
def _get_labels(self, fn):
with open(fn, 'r') as f:
label = float(f.readline())
f.close()
label = np.asarray([label])
return torch.tensor(label, dtype=torch.float)
def len(self):
return len(self.raw_paths)
def get(self, inx):
data = torch.load(self.processed_paths[inx])
return data
型号:
class GNN(torch.nn.Module):
def __init__(self, feature_size):
super(GNN, self).__init__()
embedding_size = 1024
# GNN Layers
self.conv1 = GraphConv(feature_size, embedding_size)
self.head1 = Linear(embedding_size*3, embedding_size)
self.pool1 = TopKPooling(embedding_size, ratio=0.8)
self.conv2 = GraphConv(embedding_size, embedding_size)
self.head2 = Linear(embedding_size*3, embedding_size)
self.pool2 = TopKPooling(embedding_size, ratio=0.5)
self.conv3 = GraphConv(embedding_size, embedding_size)
self.head3 = Linear(embedding_size*3, embedding_size)
self.pool3 = TopKPooling(embedding_size, ratio=0.2)
# Linear Layers
self.fc1 = Linear(embedding_size*2, 1024)
self.fc2 = Linear(1024, 128)
self.fc3 = Linear(128, 1)
def forward(self, x, edge_attr, edge_index, batch_index):
# First block
x = self.conv1(x, edge_index).relu()
x = self.head1(x)
x, edge_index, edge_attr, batch_index, _, _ = self.pool1(x,
edge_index,
None,
batch_index)
x1 = torch.cat([gmp(x, batch_index), gap(x, batch_index)], dim=1)
# Second block
x = self.conv2(x, edge_index).relu()
x = self.head2(x)
x, edge_index, edge_attr, batch_index, _, _ = self.pool2(x,
edge_index,
None,
batch_index)
x2 = torch.cat([gmp(x, batch_index), gap(x, batch_index)], dim=1)
# Third block
x = self.conv3(x, edge_index).relu()
x = self.head3(x)
x, edge_index, edge_attr, batch_index, _, _ = self.pool3(x,
edge_index,
None,
batch_index)
x3 = torch.cat([gmp(x, batch_index), gap(x, batch_index)], dim=1)
# Concat pooled vectors
x = x1 + x2 + x3
# Apply Linear Layers
x = self.fc1(x).relu()
x = self.fc2(x).relu()
x = self.fc3(x)
return x
培训:
device = torch.device('cuda')
def count_parameters(model):
return sum(p.numel() for p in model.parameters() if p.requires_grad)
# Loading the dataset
train_set = ProteinDataset(root='data/lys50_2/train')
test_set = ProteinDataset(root='data/lys50_2/test')
print('Shape of input:', train_set[0].x.shape[0])
# Loading the model
model = GNN(feature_size=train_set[0].x.shape[0])
model = model.to(device)
print(f'Number of parameters: {count_parameters(model)}')
print(model)
# Loss and Optimizer
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.00005)
print(optimizer)
# Prepare for training
train_loader = DataLoader(train_set, batch_size=1, shuffle=True)
test_loader = DataLoader(test_set, batch_size=1, shuffle=False)
def train(m,opt):
loss_sum = 0.0
for _, batch in enumerate(train_loader):
# Use GPU
batch.to(device)
# Reset grad
opt.zero_grad()
# Pass node features and connections
pred = m(batch.x.float(),
batch.edge_attr.float(),
batch.edge_index,
batch.batch)
# Calculate loss and gradients
loss = loss_fn(pred, batch.y)
loss.backward()
loss_sum += loss.item()
# Update using the gradients
opt.step()
return loss_sum / len(train_loader)
def validate(m):
loss_sum = 0.0
for _, batch in enumerate(test_loader):
for _, batch in enumerate(test_loader):
# Use GPU
batch.to(device)
# No grad
with torch.no_grad():
pred = m(batch.x.float(),
batch.edge_attr.float(),
batch.edge_index,
batch.batch)
# Calculate loss and gradients
loss = loss_fn(pred, batch.y)
loss_sum += loss.item()
return loss_sum / len(test_loader)
model.zero_grad()
optimizer.zero_grad()
# Loop for training
for i in range(101):
loss = train(model,optimizer)
if (i%10==0):
loss_v = validate(model)
print(i, loss, loss_v)
else:
print(i, loss)
运行训练时出错:
Traceback (most recent call last):
File "/home/spencer/sh3/gnn/./train.py", line 79, in <module>
loss = train(model,optimizer)
File "/home/spencer/sh3/gnn/./train.py", line 44, in train
pred = m(batch.x.float(),
File "/home/spencer/miniconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/feig/s1/spencer/sh3/gnn/model2.py", line 32, in forward
x = self.conv1(x, edge_index).relu()
File "/home/spencer/miniconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/home/spencer/miniconda3/lib/python3.9/site-packages/torch_geometric/nn/conv/graph_conv.py", line 71, in forward
out = self.lin_rel(out)
File "/home/spencer/miniconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1102, in _call_impl
return forward_call(*input, **kwargs)
File "/home/spencer/miniconda3/lib/python3.9/site-packages/torch_geometric/nn/dense/linear.py", line 109, in forward
return F.linear(x, self.weight, self.bias)
File "/home/spencer/miniconda3/lib/python3.9/site-packages/torch/nn/functional.py", line 1848, in linear
return torch._C._nn.linear(input, weight, bias)
RuntimeError: mat1 and mat2 shapes cannot be multiplied (1479x1 and 1479x1024)
错误告诉您输入形状不匹配。
你可以像这样用正向方法重塑输入:x = x.view(1, 1479)
,但要确保这是你需要的——这个错误通常表明数据集的形状错误或传递了错误的输入。