Commit d2fb1494 authored by Carsten Kemena

changing to stockholm input

parent b9f2bf63
Subproject commit eb77e85ca54c9c06fbc50f004667404e5fd0a763
Subproject commit cc31689acbf2de5870ac62757f4f0141ce247e83
......@@ -14,7 +14,7 @@
// BSDL header
#include "../libs/BioSeqDataLib/src/external/Input.hpp"
#include "../libs/BioSeqDataLib/src/external/Output.hpp"
#include "../libs/BioSeqDataLib/src/utility/stringHelpers.hpp"
#include "common.hpp"
#include "version.hpp"
......@@ -206,6 +206,48 @@ turnFile2db(fs::path &inFile, D &database, bool reverse)
{
std::vector<std::pair<size_t,size_t> > families;
BSDL::SequenceSet<BSDL::Sequence<> > seqSet;
//Stockholm
//#=GF AC PF10417.8
//A0A0L0HRL4_SPIPN/171-210 ...........SLQLGDRR..KV..ATPADWtGK.................GDEV.IIHN.G.V.S.N.E.E.A..A------rlfpg.....................
AlgorithmPack::Input sequences(inFile);
std::string line;
std::smatch m;
std::regex re ("PF([0-9]{5})\\.");
std::string accession = "";
std::string gap_chars = ".-";
size_t start = 0;
while (getline(sequences, line))
{
//std::cout << line << std::endl;
// line is alignment
if ((line[0] != '#') && (line[0] != '/'))
{
auto tokens = BSDL::split(line, " ");
for (auto gap_char : gap_chars)
tokens[1].erase (std::remove(tokens[1].begin(), tokens[1].end(), gap_char), tokens[1].end());
std::transform(tokens[1].begin(), tokens[1].end(),tokens[1].begin(), ::toupper);
seqSet.emplace_back(accession, tokens[1], "", "");
}
else
{
if (line.compare(0, 7, "#=GF AC") == 0)
{
std::regex_search (line, m, re);
accession = m[1].str();
if (seqSet.size() != 0)
{
families.emplace_back(start, seqSet.size());
start = seqSet.size();
}
}
}
}
families.emplace_back(start, seqSet.size());
/*
seqSet.read(inFile);
std::smatch m;
......@@ -228,7 +270,7 @@ turnFile2db(fs::path &inFile, D &database, bool reverse)
start = i;
}
}
families.emplace_back(start, seqSet.size());
families.emplace_back(start, seqSet.size());*/
std::cout << "Number of families: " << families.size() << std::endl;
......
......@@ -386,10 +386,11 @@ main(int argc, char const *argv[])
//fs::path tmp = outFile;
// tmp += to_string(max_dist) + "_" + to_string(min_count) + ".txt";
//
map<unsigned short, int> domain2match, domain2match2;
/* map<unsigned short, int> domain2match, domain2match2;
std::ifstream file(m.string());
unsigned short p_id;
int threshold;
while(!file.eof())
{
file >> p_id >> threshold; // extracts an unsigned short id and an int threshold separated by whitespace
......@@ -411,7 +412,12 @@ main(int argc, char const *argv[])
for (size_t i = 0; i< seqSet.size(); ++i)
{
auto &assignment = assignments[i];
auto da = words2arrangement(assignment, domain2match, 10);
auto da = words2arrangement(assignment, 10, 12);
if (da.size() == 0)
da = words2arrangement(assignment, 5, 7);
//auto &assignment = assignments[i];
//auto da = words2arrangement(assignment, domain2match, 10);
//if (da.size() == 0)
// da = words2arrangement(assignment, domain2match2, 5);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment