Commit 2ecac0fb authored by Carsten Kemena's avatar Carsten Kemena

sorting fasta files

parent 4c1d106e
......@@ -32,11 +32,12 @@ RadiantDB::read(const fs::path &path)
fin.close();
}
/*
void
RadiantDB::build(const fs::path &inPath, bool forward, const fs::path &outPath)
{
/*std::ofstream fout(outPath, std::ios::out | std::ios::binary);
std::ofstream fout(outPath, std::ios::out | std::ios::binary);
size_t val= database.size();
fout.write((char*)&val, sizeof(size_t));
for (auto it=database.begin(); it!=database.end(); ++it)
......@@ -50,11 +51,11 @@ RadiantDB::build(const fs::path &inPath, bool forward, const fs::path &outPath)
fout.write((char*)&(it2->second), sizeof(unsigned short));
}
}
fout.close();*/
}
fout.close();
}*/
short
char_diff(SuffixType val1, SuffixType val2)
RadiantDB::char_diff_(SuffixType val1, SuffixType val2)
{
SuffixType val = val1^val2;
short counter = 0;
......@@ -104,8 +105,8 @@ RadiantDB::getDomID(const PrefixType &prefix, const CodedSuffix &suffix, bool &p
else
{
short threshold = 2;
auto diff1 = char_diff(x, suffix.suffix);
auto diff2 = char_diff(it2->suffix.suffix, suffix.suffix);
auto diff1 = char_diff_(x, suffix.suffix);
auto diff2 = char_diff_(it2->suffix.suffix, suffix.suffix);
if ((diff1 < diff2) && (diff1 <= threshold))
return tmp;
else
......
......@@ -34,6 +34,9 @@ private:
void
close_();
short
static char_diff_(SuffixType val1, SuffixType val2);
public:
RadiantDB()
......@@ -53,8 +56,9 @@ public:
read(const fs::path &path);
void
/*void
build(const fs::path &inPath, bool forward, const fs::path &outPath);
*/
short
......
......@@ -92,7 +92,7 @@ cleanSuffixe(D &db)
auto currIt = nextIt++;
while (nextIt != endIt)
{
if (((currIt->second==0) && (prevIt->second==0)) || ((currIt->second == prevIt->second) && (currIt->second == nextIt->second))) //|| ((currIt->second != prevIt->second) && (currIt->second != nextIt->second) && (prevIt->second != nextIt->second)))
if (((currIt->second==0) && (prevIt->second==0)) || ((currIt->second == prevIt->second) && (currIt->second == nextIt->second)) || ((currIt->second != prevIt->second) && (currIt->second != nextIt->second) && (prevIt->second != nextIt->second)))
currIt = db.erase(currIt);
else
{
......@@ -103,7 +103,7 @@ cleanSuffixe(D &db)
}
}
/*
// remove first/last suffix if differnt from next/previous
if (db.size() > 1)
{
......@@ -124,7 +124,7 @@ cleanSuffixe(D &db)
// if only one element left its most liklely not very useful and can be removed
if (db.size() == 1)
db.clear();
*/
// test cleaning
if (db.size() > 1)
{
......@@ -211,7 +211,7 @@ turnFile2db(fs::path &inFile, D &database, bool reverse)
//#=GF AC PF10417.8
//A0A0L0HRL4_SPIPN/171-210 ...........SLQLGDRR..KV..ATPADWtGK.................GDEV.IIHN.G.V.S.N.E.E.A..A------rlfpg.....................
/*
AlgorithmPack::Input sequences(inFile);
std::string line;
std::smatch m;
......@@ -246,34 +246,47 @@ turnFile2db(fs::path &inFile, D &database, bool reverse)
}
}
families.emplace_back(start, seqSet.size());
/*
seqSet.read(inFile);
families.emplace_back(start, seqSet.size());*/
// *** BEGIN FASTA ***///
// rename sequences
std::smatch m;
std::regex re ("PF([0-9]{5})\\.");
std::string comment = seqSet[0].comment();
std::regex_search (comment,m,re);
std::string accession = m[1].str();
seqSet[0].name(accession);
size_t start = 0;
for (size_t i=1; i<seqSet.size(); ++i)
seqSet.read(inFile);
for (size_t i=0; i<seqSet.size(); ++i)
{
std::string comment = seqSet[i].comment();
std::regex_search (comment,m,re);
std::string tmp = m[1].str();
seqSet[i].name(tmp);
if (tmp != accession)
}
// sort sequnces just in case (PfamA.fasta should be sorted)
sort(seqSet.begin(), seqSet.end(),
[](const BSDL::Sequence<> & a, const BSDL::Sequence<> & b)
{
return a.name() < b.name();
}
);
std::string accession = seqSet[0].name();
size_t start = 0;
for (size_t i=1; i<seqSet.size(); ++i)
{
//std::string comment = seqSet[i].comment();
//std::regex_search (comment,m,re);
//std::string tmp = m[1].str();
//seqSet[i].name(tmp);
if (seqSet[i].name() != accession)
{
accession = std::move(tmp);
accession = seqSet[i].name();
families.emplace_back(start, i);
start = i;
}
}
families.emplace_back(start, seqSet.size());*/
families.emplace_back(start, seqSet.size());
// *** END FASTA ***///
std::cout << "Number of families: " << families.size() << std::endl;
const int windowSize = 18;
size_t famNumber = 0;
std::vector<D> threadDBs;
......
......@@ -291,8 +291,8 @@ main(int argc, char const *argv[])
//fs::path m, m2;
hiddenO.add_options()
("detailed", po::value<fs::path>(&detailedFile), "The output file for the detailed results")
("window_size", po::value<int>(&w), "The window size")
("match", po::value<int>(&m), "The number of matches required")
// ("window_size", po::value<int>(&w), "The window size")
// ("match", po::value<int>(&m), "The number of matches required")
// ("match2", po::value<fs::path>(&m2), "The number of matches required")
;
......@@ -412,10 +412,10 @@ main(int argc, char const *argv[])
for (size_t i = 0; i< seqSet.size(); ++i)
{
auto &assignment = assignments[i];
//auto da = words2arrangement(assignment, 10, 12);
auto da = words2arrangement(assignment, m, w);
//if (da.size() == 0)
// da = words2arrangement(assignment, 5, 7);
auto da = words2arrangement(assignment, 10, 12);
//auto da = words2arrangement(assignment, m, w);
if (da.size() == 0)
da = words2arrangement(assignment, 5, 7);
//auto &assignment = assignments[i];
//auto da = words2arrangement(assignment, domain2match, 10);
......
......@@ -14,7 +14,7 @@ BOOST_AUTO_TEST_SUITE(Encode_Test)
BOOST_AUTO_TEST_CASE(test)
{
auto alphabet = alphabet2bit;
//auto alphabet = alphabet2bit20;
std::string name = "";
std::string seqq = "ATSPGNDEQKRHYWFMLIVCg";
BioSeqDataLib::Sequence<> seq(name, seqq, "", "");
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment