Commit b19337ba authored by Carsten Kemena's avatar Carsten Kemena

fixed sorting my collapse state

parent 45c6d419
......@@ -51,6 +51,38 @@ void DBAccess::open(const fs::path &prefix)
arrangementDB_.open(dbFile);
}
void
DBAccess::readSequences_(RadsQueryResult &results, const std::vector<std::string> &domainNames)
{
size_t nDomains = domainNames.size();
std::string line;
while(getline(this->arrangementDB_, line))
{
if (line[0] =='>')
break;
const char *c_line = line.c_str();
size_t pos = line.find(' ');
string name(line.begin(), line.begin()+pos);
char *tmp;
c_line += pos;
results.targets.back().targetSequences.emplace_back(std::move(name), strtoul(c_line, &tmp, 10));
auto &da = results.targets.back().targetSequences.back().da;
size_t start, end;
double eval;
c_line = tmp;
for (size_t i=0; i<nDomains; ++i)
{
start = strtoul(c_line, &tmp, 10);
c_line = tmp;
end = strtoul(c_line, &tmp, 10);
c_line = tmp;
eval = strtod(c_line, &tmp);
c_line = tmp;
da.emplace_back(domainNames[i], start, end, eval);
}
}
}
void
DBAccess::search(BSDL::AlignmentMatrix<int, BSDL::DSM> &matrix, bool all, bool collapse, int scoreThres,
......@@ -81,7 +113,6 @@ DBAccess::search(BSDL::AlignmentMatrix<int, BSDL::DSM> &matrix, bool all, bool c
this->arrangementDB_.seekg(pos, std::ios_base::beg);
getline(this->arrangementDB_, line);
auto domains = BSDL::split(line, ">;");
size_t nDomains = domains.size();
BSDL::DomainArrangement<BSDL::Domain> targetDA;
for (auto &domain : domains)
targetDA.emplace_back(domain, 0, 0, 0);
......@@ -105,54 +136,26 @@ DBAccess::search(BSDL::AlignmentMatrix<int, BSDL::DSM> &matrix, bool all, bool c
}
// calculate score for the domain arrangement
int order = targetDA.size();
int priority = targetDA.size();
if (collapse)
targetDA.collapse(true);
size_t targetLength = targetDA.size();
order -= targetLength;
matrix.gotoh(queryDA, targetDA);
int score = matrix.score();
if (score < scoreThres)
continue;
size_t targetLength = targetDA.size();
priority -= targetLength;
// Store information in the Results
int minScore = (queryLength+targetLength) * matrix.gep();
double normScore = min((score-minScore)*1.0/(queryLength*100-minScore),(score-minScore)*1.0/(targetLength*100-minScore));
auto aln = matrix.result();
auto alnStrings = aln.alnStrings(queryDA, targetDA);
results.targets.emplace_back(std::move(targetDA), std::move(alnStrings.first), std::move(alnStrings.second), score, normScore);
results.targets.emplace_back(std::move(targetDA), std::move(alnStrings.first), std::move(alnStrings.second), score, normScore, priority);
// add all sequences having the same domain arrangement
while(getline(this->arrangementDB_, line))
{
if (line[0] =='>')
break;
const char *c_line = line.c_str();
size_t pos = line.find(' ');
string name(line.begin(), line.begin()+pos);
char *tmp;
c_line += pos;
results.targets.back().targetSequences.emplace_back(std::move(name), strtoul(c_line, &tmp, 10), order);
auto &da = results.targets.back().targetSequences.back().da;
size_t start, end;
double eval;
c_line = tmp;
for (size_t i=0; i<nDomains; ++i)
{
start = strtoul(c_line, &tmp, 10);
c_line = tmp;
end = strtoul(c_line, &tmp, 10);
c_line = tmp;
eval = strtod(c_line, &tmp);
c_line = tmp;
da.emplace_back(domains[i], start, end, eval);
}
}
std::sort(results.targets.back().targetSequences.begin(), results.targets.back().targetSequences.end(), [](const TargetSequence &a, const TargetSequence &b) -> bool
{
return a.order < b.order;
});
readSequences_(results, domains);
queryDA.reconstruct();
}
results.sort();
......
......@@ -39,26 +39,6 @@
namespace BSDL = BioSeqDataLib;
namespace fs = boost::filesystem;
/*
struct RadsOptions
{
enum Algorithm
{
NW, NW_RASPODOM, GOTOH, GOTOH_RASPODOM
};
// scoring options
BSDL::DSM similarityMatrix;
int gop;
int gep;
// domain content options
bool all;
bool collapse;
Algorithm algo;
};*/
/**
* \brief Represents a single target sequence
*/
......@@ -66,14 +46,13 @@ struct TargetSequence
{
std::string targetName; /// The sequence name
size_t length; /// The sequence length
int order;
BSDL::DomainArrangement<BSDL::Domain> da; /// The domain arrangement
TargetSequence() : targetName(""), length(0), order(0)
TargetSequence() : targetName(""), length(0)
{}
TargetSequence(std::string t, size_t l, int o) : targetName(t), length(l), order(o)
TargetSequence(std::string t, size_t l) : targetName(t), length(l)
{}
};
......@@ -88,16 +67,16 @@ struct RadsHit
int score; /// Score of the alignment
double normalized; /// normalized score of the alignment
int priority; /// Helps sorting the DAs if alignment score is equal. Low value, means less collapsing, listed first
std::vector<TargetSequence> targetSequences; /// The set of sequences having this arrangement
RadsHit(BSDL::DomainArrangement<BSDL::Domain> &&d, std::string &&alnStringQ, std::string &&alnStringT, int s, double n):
da(std::move(d)), alnStringQuery(std::move(alnStringQ)), alnStringTarget(std::move(alnStringT)), score(s), normalized(n)
RadsHit(BSDL::DomainArrangement<BSDL::Domain> &&d, std::string &&alnStringQ, std::string &&alnStringT, int s, double n, int p):
da(std::move(d)), alnStringQuery(std::move(alnStringQ)), alnStringTarget(std::move(alnStringT)), score(s), normalized(n), priority(p)
{}
};
/**
* \brief Complete set of targets for the given query
*/
......@@ -112,13 +91,15 @@ struct RadsQueryResult
{
std::sort(this->targets.begin(), this->targets.end(), [](const RadsHit &a, const RadsHit &b) -> bool
{
return a.score > b.score;
if (a.score != b.score)
return a.score > b.score;
else // if score is equal list non collapsed domain arrangements first
return a.priority < b.priority;
});
}
};
class DBAccess
{
private:
......@@ -126,17 +107,11 @@ class DBAccess
AlgorithmPack::Input arrangementDB_; // The file object containing the domain arrangements
BSDL::DSM similarityMatrix_; // The similarity matrix to be used.
void
readSequences_(RadsQueryResult &results, const std::vector<std::string> &vectorNames);
public:
/* struct comparePairs {
bool operator()(const std::pair<int, int>& lhs, const std::pair<int, int>& rhs)
{
if (lhs.first != rhs.first)
return lhs.first > rhs.first;
else
return lhs.second < rhs.second;
}
};*/
/**
* @brief Construct a new DBAccess object
*
......
# RADS version 2.3.0
# RADS Output v1
# run at Thu Jun 28 17:15:52 2018
# run at Fri Jun 29 09:02:21 2018
#
# query file: -
# database: /local/home/ckeme_01/projects/domainWorld/RADS/tests/integrationTests/annotation
......@@ -17,8 +17,8 @@ Domain arrangement: PF02543 PF16861
# score | normalized | SeqID | sequence length | domain arrangement | aln
# -------------------------------------------------------------------
200 1.00 A0A009 530 PF02543 9 62 PF02543 103 311 PF16861 361 523 1
200 1.00 A0A010 530 PF02543 9 62 PF16861 361 523 2
200 1.00 A0A010 530 PF02543 9 62 PF16861 361 523 1
200 1.00 A0A009 530 PF02543 9 62 PF02543 103 311 PF16861 361 523 2
# -------------------------------------------------------------------
......
# RADS version 2.2.0
# RADS version 2.3.0
# RADS Output v1
# run at Fri Jun 22 17:06:20 2018
# run at Fri Jun 29 09:02:21 2018
#
# query file: -
# database: /local/home/ckeme_01/projects/domainWorld/RADS/tests/integrationTests/interPro-test
......@@ -15,11 +15,11 @@
Results for: manual entered query
Domain arrangement: PF00001 PF00002 PF00003
# score | normalized | SeqID | sequence length | domain arrangement | aln
# score | normalized | SeqID | sequence length | domain arrangement
# -------------------------------------------------------------------
300 1.00 test-seq1 530 PF00001 10 63 PF00002 104 312 PF00003 362 524
300 1.00 test-seq2 530 PF00001 10 63 PF00002 104 312 PF00003 362 524
190 0.69 test-seq5 530 PF00001 10 63 PF00002 104 312 PF00002 362 524
190 0.69 test-seq3 530 PF00002 104 312 PF00003 362 524
190 0.69 test-seq5 530 PF00001 10 63 PF00002 104 312 PF00002 362 524
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment