Commit ed735a1d authored by Carsten Kemena's avatar Carsten Kemena

adding alignment information

- changed storing the results in logcial structs
- adapted test to slightly different order
parent ac2e56d3
Subproject commit cada6232b356db742926226965626bb7a567eb37
Subproject commit 9e07b28bc1fa57600877d964f7aea8bc740aedac
......@@ -22,6 +22,7 @@
#include "DBAccess.hpp"
using namespace std;
DBAccess::DBAccess(const fs::path &prefix, const fs::path &matrix)
......@@ -52,9 +53,10 @@ void DBAccess::open(const fs::path &prefix)
void
DBAccess::search(const BSDL::DomainArrangement<BSDL::Domain> &queryDA, BSDL::AlignmentMatrix<int, BSDL::DSM> &matrix, bool all, bool collapse, int scoreThres,
std::multimap<std::pair<int,int>, RadsHit, comparePairs > &results)
DBAccess::search(BSDL::AlignmentMatrix<int, BSDL::DSM> &matrix, bool all, bool collapse, int scoreThres,
RadsQueryResult &results)
{
auto &queryDA = results.queryDA;
string query = "Select position from domain where accession in ('" + queryDA[0].accession() + "'";
size_t nDomains = queryDA.size();
for (size_t i=1; i<nDomains; ++i)
......@@ -102,21 +104,24 @@ DBAccess::search(const BSDL::DomainArrangement<BSDL::Domain> &queryDA, BSDL::Ali
}
// calculate score for the domain arrangement
int score;
double normScore;
size_t targetLength = targetDA.size();
int order = targetDA.size();
if (collapse)
targetDA.collapse(true);
order -= targetDA.size();
matrix.gotoh(queryDA, targetDA);
score = matrix.score();
int score = matrix.score();
if (score < scoreThres)
continue;
// Store information in the Results
int minScore = (queryLength+targetLength) * matrix.gep();
normScore = min((score-minScore)*1.0/(queryLength*100-minScore),(score-minScore)*1.0/(targetLength*100-minScore));
// read all sequences having the same domain arrangement
double normScore = min((score-minScore)*1.0/(queryLength*100-minScore),(score-minScore)*1.0/(targetLength*100-minScore));
auto aln = matrix.result();
auto alnStrings = aln.alnStrings(queryDA, targetDA);
results.targets.emplace_back(std::move(targetDA), std::move(alnStrings.first), std::move(alnStrings.second), score, normScore);
// add all sequences having the same domain arrangement
while(getline(this->arrangementDB_, line))
{
if (line[0] =='>')
......@@ -127,7 +132,8 @@ DBAccess::search(const BSDL::DomainArrangement<BSDL::Domain> &queryDA, BSDL::Ali
string name(line.begin(), line.begin()+pos);
char *tmp;
c_line += pos;
auto hitIt = results.emplace(std::piecewise_construct, std::forward_as_tuple(score, order), std::forward_as_tuple(name, normScore, strtoul(c_line, &tmp, 10)));
results.targets.back().targetSequences.emplace_back(std::move(name), strtoul(c_line, &tmp, 10), order);
auto &da = results.targets.back().targetSequences.back().da;
size_t start, end;
double eval;
c_line = tmp;
......@@ -139,8 +145,13 @@ DBAccess::search(const BSDL::DomainArrangement<BSDL::Domain> &queryDA, BSDL::Ali
c_line = tmp;
eval = strtod(c_line, &tmp);
c_line = tmp;
hitIt->second.da.emplace_back(domains[i], start, end, eval);
da.emplace_back(domains[i], start, end, eval);
}
}
std::sort(results.targets.back().targetSequences.begin(), results.targets.back().targetSequences.end(), [](const TargetSequence &a, const TargetSequence &b) -> bool
{
return a.order < b.order;
});
}
results.sort();
}
......@@ -23,6 +23,8 @@
#ifndef DBACCESS_HPP
#define DBACCESS_HPP
#include <algorithm>
#include <vector>
#include <boost/filesystem.hpp>
// BioSeqDataLib
......@@ -57,28 +59,62 @@ struct RadsOptions
Algorithm algo;
};*/
struct RadsQueryResult
/**
* \brief Represents a single target sequence
*/
struct TargetSequence
{
BSDL::DomainArrangement<BSDL::Domain> queryDA;
std::string alnString1;
std::string alnString2;
std::string targetName; /// The sequence name
size_t length; /// The sequence length
int order;
BSDL::DomainArrangement<BSDL::Domain> da; /// The domain arrangement
TargetSequence() : targetName(""), length(0), order(0)
{}
TargetSequence(std::string t, size_t l, int o) : targetName(t), length(l), order(o)
{}
};
/**
* \brief A RADS hit representing a domain arrangement and all its sequences
*/
struct RadsHit
{
std::string targetName;
double normalized;
size_t length;
BSDL::DomainArrangement<BSDL::Domain> da;
BSDL::DomainArrangement<BSDL::Domain> da; /// The domain arrangement
std::string alnStringQuery; /// Alignment string of the query
std::string alnStringTarget; /// Alignment string og the target
int score; /// Score of the alignment
double normalized; /// normalized score of the alignment
std::vector<TargetSequence> targetSequences; /// The set of sequences having this arrangement
RadsHit() : targetName(""), normalized(0), length(0)
RadsHit(BSDL::DomainArrangement<BSDL::Domain> &&d, std::string &&alnStringQ, std::string &&alnStringT, int s, double n):
da(std::move(d)), alnStringQuery(std::move(alnStringQ)), alnStringTarget(std::move(alnStringT)), score(s), normalized(n)
{}
};
RadsHit(std::string t, double n, size_t l) : targetName(t), normalized(n), length(l)
{}
/**
* \brief Complete set of targets for the given query
*/
struct RadsQueryResult
{
std::string queryName;
BSDL::DomainArrangement<BSDL::Domain> queryDA; /// The query
std::vector<RadsHit> targets;
void
sort()
{
std::sort(this->targets.begin(), this->targets.end(), [](const RadsHit &a, const RadsHit &b) -> bool
{
return a.score > b.score;
});
}
};
......@@ -92,7 +128,7 @@ class DBAccess
public:
struct comparePairs {
/* struct comparePairs {
bool operator()(const std::pair<int, int>& lhs, const std::pair<int, int>& rhs)
{
if (lhs.first != rhs.first)
......@@ -100,7 +136,7 @@ class DBAccess
else
return lhs.second < rhs.second;
}
};
};*/
/**
* @brief Construct a new DBAccess object
*
......@@ -142,8 +178,8 @@ class DBAccess
* @param results The results sorted by score
*/
void
search(const BSDL::DomainArrangement<BSDL::Domain> &query, BSDL::AlignmentMatrix<int, BSDL::DSM> &matrix, bool all, bool collapse, int scoreThres,
std::multimap<std::pair<int,int>, RadsHit, comparePairs > &results);
search(BSDL::AlignmentMatrix<int, BSDL::DSM> &matrix, bool all, bool collapse, int scoreThres,
RadsQueryResult &results);
};
......
......@@ -97,7 +97,7 @@ class DBCreator
* @param sequenceFile The sequence file
*/
void
readAnnotationFile(const fs::path &annotationFile, const fs::path &sequenceFile=0);
readAnnotationFile(const fs::path &annotationFile, const fs::path &sequenceFile="");
/**
* @brief Reads the interpro file.
......
......@@ -67,7 +67,7 @@ main(int argc, char *argv[])
("help,h", "Produces this help message")
("db,d", po::value<fs::path>(&prefix)->required(), "The database prefix")
("out,o", po::value<fs::path>(&outFile)->value_name("FILE"), "The output file")
("alignments,a", po::value<fs::path>(&alignmentFile)->value_name("FILE"), "File to write the alignments into")
("alignments,f", po::value<fs::path>(&alignmentFile)->value_name("FILE"), "File to write the alignments into")
("threads,n", po::value<unsigned short>(&nThreads)->default_value(1), "The number of threads to use")
;
......@@ -192,6 +192,7 @@ main(int argc, char *argv[])
string qPath = querySeqFile.empty() ? "-" : fs::canonical(querySeqFile).string();
string print_matrix_name = (print_filename_only) ? matrixName.stem().string() : fs::canonical(matrixName).string();
string print_db_name = (print_filename_only) ? prefix.stem().string() : fs::canonical(daFile).replace_extension("").string();
// print output header
AP::Output outS(outFile);
outS << "# RADS version " + version + "\n"
......@@ -215,11 +216,12 @@ main(int argc, char *argv[])
auto it=querySet.begin();
for (size_t j=0; j<i; ++j)
++it;
std::multimap<std::pair<int, int>, RadsHit, DBAccess::comparePairs > results;
db.search(it->second, matrices[omp_get_thread_num()], all, collapse, minScore, results);
RadsQueryResult results;
results.queryDA = it->second;
db.search(matrices[omp_get_thread_num()], all, collapse, minScore, results);
// results are written to a buffer first to reduce time in critical section
std::stringstream buf;
std::stringstream buf, alnbuf;
buf << "# -------------------------------------------------------------------\n";
buf << "Results for: " << it->first << "\n";
buf << "Domain arrangement:";
......@@ -227,26 +229,32 @@ main(int argc, char *argv[])
buf << " " << domain.accession();
buf << "\n\n";
buf << "# score | normalized | SeqID | sequence length | domain arrangement\n";
buf << "# score | normalized | SeqID | sequence length | domain arrangement | aln\n";
buf << "# -------------------------------------------------------------------\n";
buf.setf(ios::fixed, ios::floatfield);
buf.precision(2);
for (auto &result : results)
size_t alnNumber = 0;
for (auto &hit : results.targets)
{
const auto &hit = result.second;
buf << result.first.first << "\t" << hit.normalized << "\t" << hit.targetName << "\t" << hit.length << "\t";
const auto &da = hit.da;
size_t len = da.size();
buf << da[0].accession() << " " << da[0].start() << " " << da[0].end();
for (size_t i = 1; i<len; ++i)
buf << " " << da[i].accession() << " " << da[i].start() << " " << da[i].end();
buf << "\n";
++alnNumber;
alnbuf << std::to_string(alnNumber) + "\n" + hit.alnStringQuery + "\n" + hit.alnStringTarget + "\n\n";
for (auto &seq : hit.targetSequences)
{
//const auto &hit = result.;
buf << hit.score << "\t" << hit.normalized << "\t" << seq.targetName << "\t" << seq.length << "\t";
const auto &da = seq.da;
size_t len = da.size();
buf << da[0].accession() << " " << da[0].start() << " " << da[0].end();
for (size_t i = 1; i<len; ++i)
buf << " " << da[i].accession() << " " << da[i].start() << " " << da[i].end();
buf << "\n";
}
}
buf << "\n\n";
#pragma omp critical
outS << buf.rdbuf();
std::cout << alnbuf.rdbuf();
}
outS.close();
......
# RADS version 2.2.0
# RADS Output v1
# run at Fri Apr 20 14:19:09 2018
# run at Fri Jun 22 17:06:20 2018
#
# query file: -
# database: /local/home/ckeme_01/projects/domainWorld/RADS/tests/integrationTests/interPro-test
......@@ -15,11 +15,11 @@
Results for: manual entered query
Domain arrangement: PF00001 PF00002 PF00003
# score | normalized | SeqID | sequence length | domain arrangement
# score | normalized | SeqID | sequence length | domain arrangement | aln
# -------------------------------------------------------------------
300 1.00 test-seq1 530 PF00001 10 63 PF00002 104 312 PF00003 362 524
300 1.00 test-seq2 530 PF00001 10 63 PF00002 104 312 PF00003 362 524
190 0.69 test-seq3 530 PF00002 104 312 PF00003 362 524
190 0.69 test-seq5 530 PF00001 10 63 PF00002 104 312 PF00002 362 524
190 0.69 test-seq3 530 PF00002 104 312 PF00003 362 524
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment