Commit 777a0cdb authored by Carsten Kemena's avatar Carsten Kemena

[DomainArrangment] added abstract representation function

This collapses repeats favouring larger repeats over shorter ones in case of conflict. Repeats inside Repeats are not allowed: ABABABAB will be 4*AB not 2*ABAB
parent da247ecf
......@@ -11,6 +11,40 @@
namespace BioSeqDataLib
{
template<>
std::vector<AbstractDomainRepeat>
DomainArrangement<Domain>::abstractRepresentation()
{
auto repList = listRepeats();
std::vector<AbstractDomainRepeat> result;
unsigned int start = 0;
if (!repList.empty())
{
for (auto elem : repList)
{
for (auto i = start; i<elem.start; ++i)
{
result.emplace_back(domains_[i].accession());
}
result.emplace_back(AbstractDomainRepeat());
result.back().times = elem.repeatTimes;
for (auto j=elem.start; j<elem.start+elem.length; ++j)
{
result.back().accessions.push_back(domains_[j].accession());
}
start = elem.start + elem.length*elem.repeatTimes;
}
}
for (auto i = start; i<domains_.size(); ++i)
{
result.emplace_back(domains_[i].accession());
}
return result;
}
std::map<std::string, int>
clanCounts(const DomainArrangement<PfamDomain> &daSet, bool useDomain)
{
......
......@@ -37,6 +37,7 @@
#include <vector>
#include <iostream>
#include <math.h>
#include <stack>
#include "PfamDomain.hpp"
......@@ -51,6 +52,33 @@ namespace BioSeqDataLib
* @{
*/
struct RepeatElement
{
unsigned int repeatTimes;
unsigned int start;
unsigned int length;
RepeatElement(size_t i, unsigned int s, unsigned int l) : repeatTimes(i), start(s), length(l)
{
}
};
struct AbstractDomainRepeat
{
int times;
std::vector<std::string> accessions;
std::vector<std::string> names;
AbstractDomainRepeat() : times(1)
{}
AbstractDomainRepeat(std::string acc) : times(1), accessions({acc})
{}
AbstractDomainRepeat(std::string acc, std::string name) : times(1), accessions({acc}), names({name})
{}
};
/**
* \brief Class to store a domain arrangement.
......@@ -66,6 +94,109 @@ private:
std::map<unsigned int, DomainType> collapsedDoms_;
using Domain_t = std::vector<DomainType>;
bool
isTandemRepeat_(size_t start, size_t length)
{
assert(start > 0);
assert(length > 0);
assert(start*2*length < domains_.size());
auto stop = start+length;
for (size_t i = start; i< stop; ++i)
{
if (domains_[i].accession() != domains_[i+length].accession())
{
return false;
}
}
return true;
}
bool
consistsOfSubRpeats_(size_t start, size_t length)
{
for (unsigned int l = length/2; l>0; --l)
{
if (length % l == 0)
{
bool isRepeat = true;
unsigned int n = length/l;
for (unsigned int j = 0; j<n; ++j)
{
if (!isTandemRepeat_(start+j*l, l))
{
isRepeat=false;
break;
}
}
if (isRepeat)
return isRepeat;
}
}
return false;
}
std::vector<RepeatElement>
findRepeats_(unsigned int start, unsigned int end)
{
std::stack<std::pair<unsigned int, unsigned int> > intervals;
intervals.emplace(start, end);
std::vector<RepeatElement> representation;
while (!intervals.empty())
{
auto x = intervals.top();
unsigned int length = (x.second - x.first)/2;
intervals.pop();
bool stop = false;
while (length > 0)
{
auto stop_pos = x.second-2*length;
unsigned int i = x.first;
while (i <= stop_pos)
{
if (isTandemRepeat_(i, length))
{
if ((length == 1) || (!consistsOfSubRpeats_(i, length)))
{
unsigned int tmp = i+length;
unsigned int repNumber = 2;
while (tmp +2*length <=x.second)
{
if (isTandemRepeat_(tmp, length))
{
++repNumber;
tmp += length;
}
else
break;
}
representation.emplace_back(repNumber, i, length);
if (i-x.first > 1)
{
intervals.emplace(x.first, i);
}
if (x.second-(i+length*repNumber) > 1)
{
intervals.emplace(i+length*repNumber, x.second);
}
stop = true;
break;
}
}
++i;
}
if (stop)
break;
--length;
}
}
return representation;
}
public:
using iterator = typename Domain_t::iterator;
using const_iterator = typename Domain_t::const_iterator;
......@@ -470,6 +601,34 @@ public:
}
}
/**
* @brief Produces a set of non-overlapping Repeats.
*
* @return std::vector<RepeatElement> The repeats found
*/
std::vector<RepeatElement>
listRepeats()
{
std::vector<RepeatElement> representation = findRepeats_(0, domains_.size());
if (!representation.empty())
{
sort(representation.begin(), representation.end(),
[](const RepeatElement & a, const RepeatElement & b)
{
return a.start < b.start;
});
}
return representation;
}
std::vector<AbstractDomainRepeat>
abstractRepresentation();
/**
* \brief Reconstructs the original domain arrangement.
* This works only if the collapse function has been called with reversable set to true.
......@@ -655,6 +814,44 @@ public:
template<>
std::vector<AbstractDomainRepeat>
DomainArrangement<Domain>::abstractRepresentation();
template<typename D>
std::vector<AbstractDomainRepeat>
DomainArrangement<D>::abstractRepresentation()
{
auto repList = listRepeats();
std::vector<AbstractDomainRepeat> result;
unsigned int start = 0;
if (!repList.empty())
{
for (auto elem : repList)
{
for (auto i = start; i<elem.start; ++i)
{
result.emplace_back(domains_[i].accession(), domains_[i].name());
}
result.emplace_back(AbstractDomainRepeat());
result.back().times = elem.repeatTimes;
for (auto j=elem.start; j<elem.start+elem.length; ++j)
{
result.back().accessions.push_back(domains_[j].accession());
result.back().names.push_back(domains_[j].name());
}
start = elem.start + elem.length*elem.repeatTimes;
}
}
for (auto i = start; i<domains_.size(); ++i)
{
result.emplace_back(domains_[i].accession(), domains_[i].name());
}
return result;
}
/**
*
* \brief Counts the occurrence of each clan an a domain set.
......
......@@ -264,6 +264,131 @@ BOOST_AUTO_TEST_CASE(Collapse_check)
BOOST_CHECK_EQUAL(da3[1].accession(), "PF00008");
BOOST_CHECK_EQUAL(da3.size(), 4);
BOOST_CHECK_EQUAL(da3.str(), "PF00007-PF00008-PF00009-PF00010");
da3.reconstruct();
BOOST_CHECK_EQUAL(da3.size(), 6);
BOOST_CHECK_EQUAL(da3[0].accession(), "PF00007");
BOOST_CHECK_EQUAL(da3[1].accession(), "PF00008");
BOOST_CHECK_EQUAL(da3[1].start(), 100);
BOOST_CHECK_EQUAL(da3[2].accession(), "PF00008");
BOOST_CHECK_EQUAL(da3[3].accession(), "PF00009");
BOOST_CHECK_EQUAL(da3[4].accession(), "PF00010");
BOOST_CHECK_EQUAL(da3[5].accession(), "PF00010");
BOOST_CHECK_EQUAL(da3[5].start(), 300000);
}
BOOST_AUTO_TEST_CASE(short_rep_test)
{
BioSeqDataLib::DomainArrangement<BioSeqDataLib::Domain> da;
da.emplace_back("A", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("C", 11, 20, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("A", 100, 101, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("C", 1000, 3000, 0.4, BioSeqDataLib::DomainDB::pfam);
auto x = da.listRepeats();
BOOST_CHECK_EQUAL(x.size(), 1);
BOOST_CHECK_EQUAL(x[0].start, 0);
BOOST_CHECK_EQUAL(x[0].length, 2);
BOOST_CHECK_EQUAL(x[0].repeatTimes, 2);
auto y = da.abstractRepresentation();
BOOST_CHECK_EQUAL(y.size(), 1);
BOOST_CHECK_EQUAL(y[0].times, 2);
BOOST_CHECK_EQUAL(y[0].accessions[0], "A");
BOOST_CHECK_EQUAL(y[0].accessions[1], "C");
da.emplace_back("A", 100, 101, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("C", 1000, 3000, 0.4, BioSeqDataLib::DomainDB::pfam);
x = da.listRepeats();
BOOST_CHECK_EQUAL(x.size(), 1);
BOOST_CHECK_EQUAL(x[0].start, 0);
BOOST_CHECK_EQUAL(x[0].length, 2);
BOOST_CHECK_EQUAL(x[0].repeatTimes, 3);
da.emplace_back("A", 100, 101, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("C", 1000, 3000, 0.4, BioSeqDataLib::DomainDB::pfam);
x = da.listRepeats();
BOOST_CHECK_EQUAL(x.size(), 1);
BOOST_CHECK_EQUAL(x[0].start, 0);
BOOST_CHECK_EQUAL(x[0].length, 2);
BOOST_CHECK_EQUAL(x[0].repeatTimes, 4);
da.clear();
da.emplace_back("A", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("A", 11, 20, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("A", 100, 101, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("A", 100, 101, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("C", 1000, 3000, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("A", 100, 101, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("C", 1000, 3000, 0.4, BioSeqDataLib::DomainDB::pfam);
x = da.listRepeats();
BOOST_CHECK_EQUAL(x.size(), 2);
BOOST_CHECK_EQUAL(x[0].start, 0);
BOOST_CHECK_EQUAL(x[0].length, 1);
BOOST_CHECK_EQUAL(x[0].repeatTimes, 3);
BOOST_CHECK_EQUAL(x[1].start, 3);
BOOST_CHECK_EQUAL(x[1].length, 2);
BOOST_CHECK_EQUAL(x[1].repeatTimes, 2);
y = da.abstractRepresentation();
BOOST_CHECK_EQUAL(y.size(), 2);
BOOST_CHECK_EQUAL(y[0].times, 3);
BOOST_CHECK_EQUAL(y[0].accessions[0], "A");
BOOST_CHECK_EQUAL(y[1].times, 2);
BOOST_CHECK_EQUAL(y[1].accessions[0], "A");
BOOST_CHECK_EQUAL(y[1].accessions[1], "C");
da.clear();
da.emplace_back("A", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("B", 11, 20, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("C", 100, 101, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("A", 200, 201, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("B", 1000, 3000, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("C", 10000, 101, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("T", 10000, 101, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("T", 100, 101, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("A", 100, 101, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("C", 100, 101, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("T", 100, 101, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("T", 100, 101, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("A", 100, 101, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("C", 100, 101, 0.4, BioSeqDataLib::DomainDB::pfam);
x = da.listRepeats();
BOOST_CHECK_EQUAL(x.size(), 1);
BOOST_CHECK_EQUAL(x[0].start, 5);
BOOST_CHECK_EQUAL(x[0].length, 4);
BOOST_CHECK_EQUAL(x[0].repeatTimes, 2);
da.clear();
da.emplace_back("A", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("B", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("C", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("B", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("C", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("E", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("F", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("G", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("F", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("G", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("A", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
da.emplace_back("K", 1, 19, 0.4, BioSeqDataLib::DomainDB::pfam);
x = da.listRepeats();
BOOST_CHECK_EQUAL(x.size(), 2);
y = da.abstractRepresentation();
BOOST_CHECK_EQUAL(y.size(), 6);
BOOST_CHECK_EQUAL(y[0].times, 1);
BOOST_CHECK_EQUAL(y[0].accessions[0], "A");
BOOST_CHECK_EQUAL(y[1].times, 2);
BOOST_CHECK_EQUAL(y[1].accessions[0], "B");
BOOST_CHECK_EQUAL(y[1].accessions[1], "C");
BOOST_CHECK_EQUAL(y[2].times, 1);
BOOST_CHECK_EQUAL(y[2].accessions[0], "E");
BOOST_CHECK_EQUAL(y[3].times, 2);
BOOST_CHECK_EQUAL(y[3].accessions[0], "F");
BOOST_CHECK_EQUAL(y[3].accessions[1], "G");
BOOST_CHECK_EQUAL(y[4].times, 1);
BOOST_CHECK_EQUAL(y[4].accessions[0], "A");
BOOST_CHECK_EQUAL(y[5].times, 1);
BOOST_CHECK_EQUAL(y[5].accessions[0], "K");
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment