我正在Linux服务器上工作,但使用g ++运行此c ++脚本。编程相当新。我正在尝试创建一个基因型vector<string> Geno
字符串的载体。我从中获取数据的文件如下所示:
chr38 12632 C T 0/0:42,0:42:PASS:98 0/1:27,12:39:PASS:99 0/0:49,0:49:PASS:99
chr38 13825 G T 0/1:37,13:50:PASS:99 0/1:28,9:37:PASS:99 0/0:46,0:46:PASS:99
chr38 17160 C T 0/0:23,0:23:PASS:43 0/0:13,0:13:PASS:42 0/0:11,0:11:PASS:41
chr38 17451 G A 0/0:22,0:22:PASS:61 0/1:13,12:25:PASS:99 0/0:9,0:9:PASS:28
chr38 19444 G A 0/0:8,0:8:PASS:22 0/1:8,9:17:PASS:99 0/0:20,0:20:PASS:61
列是染色体,位置,然后是参考和替代等位基因。以下 3 列是这些位置的狗基因组。我创建一个Map.file
:
chr38 12632 C T
chr38 13825 G T
chr38 17160 C T
chr38 17451 G A
chr38 19444 G A
这留下了剩余的文件:
0/0:42,0:42:PASS:98 0/1:27,12:39:PASS:99 0/0:49,0:49:PASS:99
0/1:37,13:50:PASS:99 0/1:28,9:37:PASS:99 0/0:46,0:46:PASS:99
0/0:23,0:23:PASS:43 0/0:13,0:13:PASS:42 0/0:11,0:11:PASS:41
0/0:22,0:22:PASS:61 0/1:13,12:25:PASS:99 0/0:9,0:9:PASS:28
0/0:8,0:8:PASS:22 0/1:8,9:17:PASS:99 0/0:20,0:20:PASS:61
所有当前代码都包含 Bob 的第一次编辑:
#include <iostream>
#include <stdio.h>
#include <fstream>
#include <cstring>
#include <string>
#include <vector>
#include <cstdlib>
#include <sstream>
#include <iomanip>
using namespace std;
//function to get genotype for one SNP
char con_hap2geno(const string & str)
{
char hap1 = str[0];
char hap2 = str[2];
if(hap1 =='0' && hap2 =='0')
return '0' ;
else
if(hap1 == '0' & hap2 =='1')
return '1';
else
if(hap1 =='1' & hap2 =='0')
return '1';
else
if(hap1== '1' & hap2 == '1')
return '2';
else
return '5';
}//end of function
struct raw_data{vector<string> s_geno;};//place to store dog columns
int main()
{
fstream checkmarkermap; checkmarkermap.open("Mapfile", std::fstream::out | std::fstream::trunc); checkmarkermap.close();
string line;
ifstream infile;
infile.open("Chr38_3d_5snp.vfc");
if(infile.fail()) {cout << "Error Opening Filen";}
vector<raw_data>gen_file;
while(getline(infile,line))
{
size_t pos = line.find("t", 0);
string Chrom = line.substr(0,pos);
line.erase(0, pos +1);
pos = line.find("t",0);
string position = line.substr(0,pos);
line.erase(0,pos+1);
pos = line.find("t",0);
string REF = line.substr(0,pos);
line.erase(0,pos+1);
pos = line.find("t",0);
string ALT = line.substr(0,pos);
line.erase(0,pos+1);
stringstream ssline{line};
raw_data gData;
string temp;
while(ssline>>temp)
{
gData.s_geno.push_back(temp);
}
gen_file.push_back(gData);
ofstream output2("Mapfile", std::ios_base::app | std::ios_base::out);//app appends, creates file
output2 << Chrom << " " << position << " " << REF << " " << ALT << endl; //map file
}//end of getline while
int n_dogs= gen_file[0].s_geno.size();
vector<string>f_genos(n_dogs);
for (int i=0; i<n_dogs; ++i)
{
string genome;
//"if need to reserve memory space". genome.reserve("size needed")
for (auto & line_of_data; gen_file)
{
genome += con_hap2geno(line_of_data.s_geno[i]);
}
f_genos[i] = genome;
}//end n_dogs file
//create final genome file
ofstream dogfile("Chr38_3d_5snp_genome");
for (auto & i : f_genos){
dogfile<<i<<'n'; }
return 0
}
我让它工作到(第 100 行(int n_dogs
后的第二个 for 语句。我收到错误declaration of auto & line_of_file has no intializer.
我认为这是因为line_of_file
不是声明的变量,但不确定将其更改为什么。
我正在尝试访问如下所示的文件:
01000
11011
00000
这将对应于每只狗的基因型。我想一旦我克服了这一点,我就会有更多的问题。目前在同一行我有错误could not convert gen_file to bool.
但我认为这与第一个错误有关。
编辑 2
只是关于您发布的代码的几件事。
如果您知道stringstream
并且将其用于最后一列,为什么不将其用于整行?我认为使用这样的代码提取数据:
size_t pos = line.find("t", 0);
string Chrom = line.substr(0,pos);
line.erase(0, pos +1);
当您可以使用stringstrem
更轻松地做到这一点时,这是一个无用的复杂功能。
您管理文件的方式并不十分清楚。这条线的目的是什么?
fstream checkmarkermap; checkmarkermap.open("Mapfile", std::fstream::out | std::fstream::trunc); checkmarkermap.close();
您打开和关闭该文件而不使用它...
然后,当您将数据写入 MapFile 时,您选择打开它(实际上重新打开而不关闭它(,在每次迭代时向文件附加新行。我认为这不是一个好主意,也不是对磁盘非常友好。我们谈论的是相对"小"的文件(最多 ~30000 行(和许多数据块很少的 IO 操作,最好读取整个输入文件,将数据存储在某个结构中,详细说明它(在 RAM 中(,然后写入输出文件。
请阅读并尝试以下程序,并告诉我是否有问题或您不明白的地方。
编辑 1
感谢您的澄清。我重写了我的代码,现在应该可以按您的预期工作。
我删除了所有 c++11 功能,因此它应该可以很好地使用 g++ 4.4.7 编译。
#include <iostream>
#include <string>
#include <vector>
#include <fstream>
#include <sstream>
#include <iomanip>
using std::cout;
using std::cerr;
using std::vector;
using std::string;
using std::stringstream;
// use a function to extract the first and third char from a string and then return
// a char that represents some genetic information
char get_geno_char( const string &str ) {
// to get the first and third chars in a string:
char hap1 = str[0];
char hap2 = str[2];
// chose the right value...
if (hap1 =='0' && hap2 =='0')
return '0';
else if (hap1 == '0' && hap2 =='1')
return '3';
// to be completed...
else
return '5';
}
// I'll use this class to store data read from a line of the input file
struct GenData {
string chr;
string pos;
string ref;
string alt;
// here I will store the chars related to dogs genome
vector<char> gen_dogs;
};
int main() {
vector<GenData> geno_data;
// If you are sure that there are ~30000 lines it may be better to reserve
// enough space to avoid unnecessary vector expansions
geno_data.reserve(40000);
char fname_in[] = "Chr38_3d_5snp.vfc";
std::ifstream infile(fname_in);
if ( infile.fail() ) {
cerr << "Error Opening File " << fname_in << 'n';
exit(-1);
}
// Read all the lines in the file and store the values in a vector of
// structs. I don't know if your actual input file has headers for every
// column like the snippet you showed earlier. In that case you can skip
// the first line or simply read it to find out the number of dogs
string line;
while ( std::getline(infile,line) ) {
// skip empty lines if there are any
if ( line.empty() ) continue;
GenData gd_temp;
// extract data from the line
// I'm not sure if you actually need those data or you can skip
// them to go the columns with dogs genome
stringstream ssline(line);
ssline >> gd_temp.chr >> gd_temp.pos >> gd_temp.ref >> gd_temp.alt;
// the input file may be wrong formatted or unreadable
if ( ssline.fail() ) break;
// extract dogs genome data from remaining columns
string temp;
while ( ssline >> temp ) {
// Instead of storing an entire string read from the input file
// like "0/0:42,0:42:PASS:98", we'll use the return value of
// get_geno_char() function to store the needed char
gd_temp.gen_dogs.push_back(get_geno_char(temp));
}
// the number of dogs is geno_dogs.size(). To be extra paranoid you
// can check if every line has the exact same amount of columns/dogs
geno_data.push_back(gd_temp);
}
infile.close();
// vectors (and strings) know their sizes... Please note that the actual
// size() is different (smaller) from the space reserved
size_t n_lines = geno_data.size();
if ( !n_lines ) {
// No line of data was read. Better to exit
cerr << "Insufficient data.";
exit(-1);
}
size_t n_dogs = geno_data[0].gen_dogs.size();
cout << n_lines << " lines of data was succesfully read from file: "
<< fname_in << "nFound genome data of " << n_dogs << " dogs.n";
// write Map.file
char fname_map[] = "Map.file";
std::ofstream mapfile(fname_map);
if ( mapfile.fail() ) {
cerr << "Error Opening File " << fname_map << 'n';
}
for ( size_t i = 0; i < n_lines; ++i ) {
mapfile << geno_data[i].chr << ' ' << geno_data[i].pos << ' '
<< geno_data[i].ref << ' ' << geno_data[i].alt << 'n';
}
mapfile.close();
// Now write the dogs genome
vector<string> dogs_genome;
// We know exactly how big those strings will be, so let's size them
// properly using resize(), not reserve() because we will not append
// chars to the strings, but insert them in the right position
dogs_genome.resize(n_dogs);
for ( size_t i = 0; i < n_dogs; ++i ) {
dogs_genome[i].resize(n_lines);
}
// leaving the lines loop outside and the dogs loop inside may be
// more cache friendly (faster), but I'm not really sure
for ( size_t i = 0; i < n_lines; ++i ) {
for ( size_t j = 0; j < n_dogs; ++j ) {
// geno_data[i] stores line i of data read
// gen_dogs[j] is the char needed in the i position of
// dog genome string j
dogs_genome[j][i] = geno_data[i].gen_dogs[j];
}
}
// write file Dogs.dat
char fname_dogs[] = "Dogs.dat";
std::ofstream dogfile(fname_dogs);
if ( mapfile.fail() ) {
cerr << "Error Opening File " << fname_dogs << 'n';
}
// you added an ID in the previous version of the question
for ( size_t i = 0; i < n_dogs; ++i ) {
// this will add an ID like D01 (you may have more then 9 dogs)
dogfile << 'D' << std::setfill('0') << std::setw(2) << ( i + 1 )
<< ' ' << dogs_genome[i] << 'n';
}
return 0;
}
运行该代码,我得到了两个输出文件,第一个"Map.file"包含:
chr38 12632 C T
chr38 13825 G T
chr38 17160 C T
chr38 17451 G A
chr38 19444 G A
第二个,"狗.dat"(希望(是你需要的:
D01 03000
D02 33033
D03 00000