Commit f4e3e912 authored by Carsten Kemena's avatar Carsten Kemena

improved abstract representation

parent 77b3f825
......@@ -96,7 +96,7 @@ set(phylogenyCPP PhylogeneticTree.cpp fitch.cpp dollo.cpp)
PREPEND(phylogenyCPP "${CMAKE_CURRENT_SOURCE_DIR}/src/phylogeny" ${phylogenyCPP})
# utility module
set(utilityCPP TwoValues.cpp IdentityMatrix.cpp DSM.cpp stringHelpers.cpp properties.cpp utility.cpp Settings.cpp)
set(utilityCPP algorithm.cpp TwoValues.cpp IdentityMatrix.cpp DSM.cpp stringHelpers.cpp properties.cpp utility.cpp Settings.cpp)
PREPEND(utilityCPP "${CMAKE_CURRENT_SOURCE_DIR}/src/utility" ${utilityCPP})
#external module
......
......@@ -7,6 +7,7 @@
#include "DomainArrangement.hpp"
#include "../utility/algorithm.hpp"
namespace BioSeqDataLib
{
......@@ -16,24 +17,40 @@ template<>
std::vector<AbstractDomainRepeat>
DomainArrangement<Domain>::abstractRepresentation()
{
auto repList = listRepeats();
auto repList = this->findAllRepeats_();
sort(repList.begin(), repList.end(), [](const RepeatElement & a, const RepeatElement & b) -> bool
{
return (a.start+a.repeatTimes*a.length) < (b.start+b.repeatTimes*b.length);
});
std::vector<Interval> intervals;
for (auto const &elem : repList)
{
intervals.emplace_back(elem.start, elem.start+elem.repeatTimes*elem.length-1, elem.repeatTimes*elem.length);
}
auto subset = max_weight_subset(intervals);
std::vector<AbstractDomainRepeat> result;
unsigned int start = 0;
if (!repList.empty())
if (!subset.empty())
{
for (auto elem : repList)
for (auto id : subset)
{
for (auto i = start; i<elem.start; ++i)
for (auto i = start; i<repList[id].start; ++i)
{
result.emplace_back(domains_[i].accession());
}
result.emplace_back(AbstractDomainRepeat());
result.back().times = elem.repeatTimes;
for (auto j=elem.start; j<elem.start+elem.length; ++j)
result.back().times = repList[id].repeatTimes;
for (auto j=repList[id].start; j<repList[id].start+repList[id].length; ++j)
{
result.back().accessions.push_back(domains_[j].accession());
}
start = elem.start + elem.length*elem.repeatTimes;
start = repList[id].start + repList[id].length*repList[id].repeatTimes;
}
}
......@@ -41,6 +58,7 @@ DomainArrangement<Domain>::abstractRepresentation()
{
result.emplace_back(domains_[i].accession());
}
return result;
}
......
......@@ -42,6 +42,7 @@
#include "PfamDomain.hpp"
// BioSeqDataLib header
#include "../utility/algorithm.hpp"
#include "../utility/DSM.hpp"
#include "DomainExt.hpp"
......@@ -113,7 +114,7 @@ private:
}
bool
consistsOfSubRpeats_(size_t start, size_t length)
consistsOfSubRepeats_(size_t start, size_t length)
{
for (unsigned int l = length/2; l>0; --l)
{
......@@ -137,67 +138,46 @@ private:
}
std::vector<RepeatElement>
findRepeats_(unsigned int start, unsigned int end)
findAllRepeats_()
{
std::stack<std::pair<unsigned int, unsigned int> > intervals;
intervals.emplace(start, end);
std::vector<RepeatElement> representation;
while (!intervals.empty())
std::vector<RepeatElement> repeatList;
auto n_domains = domains_.size();
for (size_t start =0; start<n_domains; ++start)
{
auto x = intervals.top();
unsigned int length = (x.second - x.first)/2;
intervals.pop();
bool stop = false;
while (length > 0)
auto max_length = (n_domains-start)/2;
for (size_t length=1; length<=max_length; ++length)
{
auto stop_pos = x.second-2*length;
unsigned int i = x.first;
while (i <= stop_pos)
if (isTandemRepeat_(start, length))
{
if (isTandemRepeat_(i, length))
if (!consistsOfSubRepeats_(start, length))
{
if ((length == 1) || (!consistsOfSubRpeats_(i, length)))
repeatList.emplace_back(2, start, length);
unsigned int tmp = start+length;
unsigned int repNumber = 2;
while (tmp +2*length <= n_domains)
{
unsigned int tmp = i+length;
unsigned int repNumber = 2;
while (tmp +2*length <=x.second)
{
if (isTandemRepeat_(tmp, length))
{
++repNumber;
tmp += length;
}
else
break;
}
representation.emplace_back(repNumber, i, length);
if (i-x.first > 1)
if (isTandemRepeat_(tmp, length))
{
intervals.emplace(x.first, i);
++repNumber;
repeatList.emplace_back(repNumber, start, length);
tmp += length;
}
if (x.second-(i+length*repNumber) > 1)
{
intervals.emplace(i+length*repNumber, x.second);
}
stop = true;
break;
else
break;
}
}
++i;
}
if (stop)
break;
--length;
}
}
return representation;
return repeatList;
}
public:
public:
using iterator = typename Domain_t::iterator;
using const_iterator = typename Domain_t::const_iterator;
using reverse_iterator = typename Domain_t::reverse_iterator;
......@@ -608,7 +588,7 @@ public:
*
* @return std::vector<RepeatElement> The repeats found
*/
std::vector<RepeatElement>
/*std::vector<RepeatElement>
listRepeats()
{
std::vector<RepeatElement> representation = findRepeats_(0, domains_.size());
......@@ -621,7 +601,7 @@ public:
});
}
return representation;
}
}*/
......@@ -818,29 +798,45 @@ template<>
std::vector<AbstractDomainRepeat>
DomainArrangement<Domain>::abstractRepresentation();
template<typename D>
std::vector<AbstractDomainRepeat>
DomainArrangement<D>::abstractRepresentation()
{
auto repList = listRepeats();
auto repList = this->findAllRepeats_();
sort(repList.begin(), repList.end(), [](const RepeatElement & a, const RepeatElement & b) -> bool
{
return (a.start+a.repeatTimes*a.length) < (b.start+b.repeatTimes*b.length);
});
std::vector<Interval> intervals;
for (auto const &elem : repList)
{
intervals.emplace_back(elem.start, elem.start+elem.repeatTimes*elem.length-1, elem.repeatTimes*elem.length);
}
auto subset = max_weight_subset(intervals);
std::vector<AbstractDomainRepeat> result;
unsigned int start = 0;
if (!repList.empty())
if (!subset.empty())
{
for (auto elem : repList)
for (auto id : subset)
{
for (auto i = start; i<elem.start; ++i)
for (auto i = start; i<repList[id].start; ++i)
{
result.emplace_back(domains_[i].accession(), domains_[i].name());
result.emplace_back(domains_[i].accession());
}
result.emplace_back(AbstractDomainRepeat());
result.back().times = elem.repeatTimes;
for (auto j=elem.start; j<elem.start+elem.length; ++j)
result.back().times = repList[id].repeatTimes;
for (auto j=repList[id].start; j<repList[id].start+repList[id].length; ++j)
{
result.back().accessions.push_back(domains_[j].accession());
result.back().names.push_back(domains_[j].name());
}
start = elem.start + elem.length*elem.repeatTimes;
start = repList[id].start + repList[id].length*repList[id].repeatTimes;
}
}
......
......@@ -284,11 +284,11 @@ BOOST_AUTO_TEST_CASE(short_rep_test)
da.emplace_back("C", 11, 20, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("A", 100, 101, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("C", 1000, 3000, 0.4, BioSeqDataLib::DomainDB::pfam);
auto x = da.listRepeats();
/*auto x = da.listRepeats();
BOOST_CHECK_EQUAL(x.size(), 1);
BOOST_CHECK_EQUAL(x[0].start, 0);
BOOST_CHECK_EQUAL(x[0].length, 2);
BOOST_CHECK_EQUAL(x[0].repeatTimes, 2);
BOOST_CHECK_EQUAL(x[0].repeatTimes, 2);*/
auto y = da.abstractRepresentation();
BOOST_CHECK_EQUAL(y.size(), 1);
BOOST_CHECK_EQUAL(y[0].times, 2);
......@@ -297,19 +297,19 @@ BOOST_AUTO_TEST_CASE(short_rep_test)
da.emplace_back("A", 100, 101, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("C", 1000, 3000, 0.4, BioSeqDataLib::DomainDB::pfam);
x = da.listRepeats();
/*x = da.listRepeats();
BOOST_CHECK_EQUAL(x.size(), 1);
BOOST_CHECK_EQUAL(x[0].start, 0);
BOOST_CHECK_EQUAL(x[0].length, 2);
BOOST_CHECK_EQUAL(x[0].repeatTimes, 3);
BOOST_CHECK_EQUAL(x[0].repeatTimes, 3);*/
da.emplace_back("A", 100, 101, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("C", 1000, 3000, 0.4, BioSeqDataLib::DomainDB::pfam);
x = da.listRepeats();
/*x = da.listRepeats();
BOOST_CHECK_EQUAL(x.size(), 1);
BOOST_CHECK_EQUAL(x[0].start, 0);
BOOST_CHECK_EQUAL(x[0].length, 2);
BOOST_CHECK_EQUAL(x[0].repeatTimes, 4);
BOOST_CHECK_EQUAL(x[0].repeatTimes, 4);*/
da.clear();
da.emplace_back("A", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
......@@ -319,14 +319,14 @@ BOOST_AUTO_TEST_CASE(short_rep_test)
da.emplace_back("C", 1000, 3000, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("A", 100, 101, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("C", 1000, 3000, 0.4, BioSeqDataLib::DomainDB::pfam);
x = da.listRepeats();
/*x = da.listRepeats();
BOOST_CHECK_EQUAL(x.size(), 2);
BOOST_CHECK_EQUAL(x[0].start, 0);
BOOST_CHECK_EQUAL(x[0].length, 1);
BOOST_CHECK_EQUAL(x[0].repeatTimes, 3);
BOOST_CHECK_EQUAL(x[1].start, 3);
BOOST_CHECK_EQUAL(x[1].length, 2);
BOOST_CHECK_EQUAL(x[1].repeatTimes, 2);
BOOST_CHECK_EQUAL(x[1].repeatTimes, 2);*/
y = da.abstractRepresentation();
BOOST_CHECK_EQUAL(y.size(), 2);
BOOST_CHECK_EQUAL(y[0].times, 3);
......@@ -350,11 +350,11 @@ BOOST_AUTO_TEST_CASE(short_rep_test)
da.emplace_back("T", 100, 101, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("A", 100, 101, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("C", 100, 101, 0.4, BioSeqDataLib::DomainDB::pfam);
x = da.listRepeats();
/*x = da.listRepeats();
BOOST_CHECK_EQUAL(x.size(), 1);
BOOST_CHECK_EQUAL(x[0].start, 5);
BOOST_CHECK_EQUAL(x[0].length, 4);
BOOST_CHECK_EQUAL(x[0].repeatTimes, 2);
BOST_CHECK_EQUAL(x[0].repeatTimes, 2);*/
da.clear();
da.emplace_back("A", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
......@@ -369,8 +369,10 @@ BOOST_AUTO_TEST_CASE(short_rep_test)
da.emplace_back("G", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("A", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("K", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
x = da.listRepeats();
BOOST_CHECK_EQUAL(x.size(), 2);
//da.findAllRepeats_();
/*x = da.listRepeats();
BOOST_CHECK_EQUAL(x.size(), 2);*/
y = da.abstractRepresentation();
BOOST_CHECK_EQUAL(y.size(), 6);
BOOST_CHECK_EQUAL(y[0].times, 1);
......@@ -389,6 +391,37 @@ BOOST_AUTO_TEST_CASE(short_rep_test)
BOOST_CHECK_EQUAL(y[5].accessions[0], "K");
da.clear();
da.emplace_back("A", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("A", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("A", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("A", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("B", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("A", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("B", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
y = da.abstractRepresentation();
BOOST_CHECK_EQUAL(y.size(), 2);
BOOST_CHECK_EQUAL(y[0].times, 3);
BOOST_CHECK_EQUAL(y[0].accessions[0], "A");
BOOST_CHECK_EQUAL(y[1].times, 2);
BOOST_CHECK_EQUAL(y[1].accessions[0], "A");
BOOST_CHECK_EQUAL(y[1].accessions[1], "B");
da.emplace_back("A", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
y = da.abstractRepresentation();
BOOST_CHECK_EQUAL(y.size(), 2);
BOOST_CHECK_EQUAL(y[0].times, 4);
BOOST_CHECK_EQUAL(y[0].accessions[0], "A");
BOOST_CHECK_EQUAL(y[1].times, 2);
BOOST_CHECK_EQUAL(y[1].accessions[0], "B");
BOOST_CHECK_EQUAL(y[1].accessions[1], "A");
//da.emplace_back("G", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
//da.emplace_back("F", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
//da.emplace_back("G", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
//da.emplace_back("A", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
//da.emplace_back("K", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
}
......
......@@ -29,6 +29,7 @@
#include <boost/test/unit_test.hpp>
#include "algorithm_Test.hpp"
#include "Matrix_Test.hpp"
#include "MatrixStack_Test.hpp"
#include "SimilarityMatrix_Test.hpp"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment