Commit 1e45b525 authored by Carsten Kemena's avatar Carsten Kemena

[RADS] added collapse options

parent 29002f00
......@@ -52,8 +52,8 @@ void DBAccess::open(const fs::path &prefix)
void
DBAccess::search(const BSDL::DomainArrangement<BSDL::Domain> &queryDA, BSDL::AlignmentMatrix<int, BSDL::DSM> &matrix, bool all, int scoreThres,
std::multimap<int, RadsHit, std::greater<int> > &results)
DBAccess::search(const BSDL::DomainArrangement<BSDL::Domain> &queryDA, BSDL::AlignmentMatrix<int, BSDL::DSM> &matrix, bool all, bool collapse, int scoreThres,
std::multimap<std::pair<int,int>, RadsHit, comparePairs > &results)
{
string query = "Select position from domain where accession in ('" + queryDA[0].accession() + "'";
size_t nDomains = queryDA.size();
......@@ -71,7 +71,6 @@ DBAccess::search(const BSDL::DomainArrangement<BSDL::Domain> &queryDA, BSDL::Ali
size_t queryLength = queryDA.size();
// read the arrangements from file
//BSDL::DomainArrangementSet<BSDL::Domain> targetDAs;
string line;
for (auto &pos : positions)
{
......@@ -106,6 +105,10 @@ DBAccess::search(const BSDL::DomainArrangement<BSDL::Domain> &queryDA, BSDL::Ali
int score;
double normScore;
size_t targetLength = targetDA.size();
int order = targetDA.size();
if (collapse)
targetDA.collapse(true);
order -= targetDA.size();
matrix.gotoh(queryDA, targetDA);
score = matrix.score();
if (score < scoreThres)
......@@ -124,9 +127,7 @@ DBAccess::search(const BSDL::DomainArrangement<BSDL::Domain> &queryDA, BSDL::Ali
string name(line.begin(), line.begin()+pos);
char *tmp;
c_line += pos;
//auto daIt = targetDAs.emplace(name, BSDL::DomainArrangement<BSDL::Domain>());
auto hitIt = results.emplace(std::piecewise_construct, std::forward_as_tuple(score), std::forward_as_tuple(name, normScore, strtoul(c_line, &tmp, 10)));
auto hitIt = results.emplace(std::piecewise_construct, std::forward_as_tuple(score, order), std::forward_as_tuple(name, normScore, strtoul(c_line, &tmp, 10)));
size_t start, end;
double eval;
c_line = tmp;
......@@ -142,6 +143,4 @@ DBAccess::search(const BSDL::DomainArrangement<BSDL::Domain> &queryDA, BSDL::Ali
}
}
}
/*RadsHit x;
x.target_name;*/
}
......@@ -20,6 +20,9 @@
* along with RADS. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef DBACCESS_HPP
#define DBACCESS_HPP
#include <boost/filesystem.hpp>
// BioSeqDataLib
......@@ -34,10 +37,29 @@
namespace BSDL = BioSeqDataLib;
namespace fs = boost::filesystem;
/*
struct RadsOptions
{
enum Algorithm
{
NW, NW_RASPODOM, GOTOH, GOTOH_RASPODOM
};
// scoring options
BSDL::DSM similarityMatrix;
int gop;
int gep;
// domain content options
bool all;
bool collapse;
Algorithm algo;
};*/
struct RadsHit
{
std::string targetName;
double normalized;
size_t length;
......@@ -51,6 +73,7 @@ struct RadsHit
};
class DBAccess
{
private:
......@@ -59,6 +82,16 @@ class DBAccess
BSDL::DSM similarityMatrix_; // The similarity matrix to be used.
public:
struct comparePairs {
bool operator()(const std::pair<int, int>& lhs, const std::pair<int, int>& rhs)
{
if (lhs.first != rhs.first)
return lhs.first > rhs.first;
else
return lhs.second < rhs.second;
}
};
/**
* @brief Construct a new DBAccess object
*
......@@ -100,6 +133,10 @@ class DBAccess
* @param results The results sorted by score
*/
void
search(const BSDL::DomainArrangement<BSDL::Domain> &query, BSDL::AlignmentMatrix<int, BSDL::DSM> &matrix, bool all, int scoreThres,
std::multimap<int, RadsHit, std::greater<int> > &results);
search(const BSDL::DomainArrangement<BSDL::Domain> &query, BSDL::AlignmentMatrix<int, BSDL::DSM> &matrix, bool all, bool collapse, int scoreThres,
std::multimap<std::pair<int,int>, RadsHit, comparePairs > &results);
};
#endif
......@@ -20,6 +20,10 @@
* along with RADS. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef DBCREATOR_HPP
#define DBCREATOR_HPP
#include <map>
#include <set>
#include <string>
......@@ -115,3 +119,5 @@ class DBCreator
write(const fs::path &prefix, const std::map<std::string, std::string> &info);
};
#endif
......@@ -29,7 +29,6 @@
#include <thread>
#include <ctime>
#include <chrono>
#include <omp.h>
// Boost
......@@ -89,10 +88,12 @@ main(int argc, char *argv[])
;
bool all;
bool collapse;
int minScore;
po::options_description filterOpts("Result filtering options");
filterOpts.add_options()
("all,a", po::value<bool>(&all)->default_value(false)->zero_tokens(), "All domain types need to occur")
("collapse,c", po::value<bool>(&collapse)->default_value(false)->zero_tokens(), "Collapse consecutive identical domains")
("min-score,M", po::value<int>(&minScore)->default_value(0), "The minimum alignment score to list")
;
......@@ -147,6 +148,7 @@ main(int argc, char *argv[])
try
{
simMat.read(matrixName);
simMat.useNegative(true);
}
catch (std::exception &e)
{
......@@ -201,6 +203,8 @@ main(int argc, char *argv[])
<< "# gap open penalty " << std::to_string(gop) << "\n"
<< "# gap extension penalty " << std::to_string(gep) << "\n"
<< "# matrix: " << print_matrix_name << "\n"
<< "# all: " << (all ? "true" : "false") << "\n"
<< "# collapse: " << (collapse ? "true" : "false") << "\n"
<< "# ******************************************************************\n"
<< "\n";
......@@ -212,8 +216,8 @@ main(int argc, char *argv[])
auto it=querySet.begin();
for (size_t j=0; j<i; ++j)
++it;
std::multimap<int, RadsHit, std::greater<int> > results;
db.search(it->second, matrices[omp_get_thread_num()], all, minScore, results);
std::multimap<std::pair<int, int>, RadsHit, DBAccess::comparePairs > results;
db.search(it->second, matrices[omp_get_thread_num()], all, collapse, minScore, results);
// results are written to a buffer first to reduce time in critical section
std::stringstream buf;
......@@ -231,7 +235,7 @@ main(int argc, char *argv[])
for (auto &result : results)
{
const auto &hit = result.second;
buf << result.first << "\t" << hit.normalized << "\t" << hit.targetName << "\t" << hit.length << "\t";
buf << result.first.first << "\t" << hit.normalized << "\t" << hit.targetName << "\t" << hit.length << "\t";
const auto &da = hit.da;
size_t len = da.size();
buf << da[0].accession() << " " << da[0].start() << " " << da[0].end();
......
......@@ -52,7 +52,7 @@
<ipr id="IPR031730" name="Carbamoyltransferase, C-terminal" type="Domain" />
<lcn start="362" end="524" score="1.9E-44" />
</match>
<match id="PF00033s" name="Carbam_trans_C" dbname="PFAMdgf" status="T" evd="HMMPfam">
<match id="PF00033" name="Carbam_trans_C" dbname="PFAMdgf" status="T" evd="HMMPfam">
<ipr id="IPR031730" name="Carbamoyltransferase, C-terminal" type="Domain" />
<lcn start="362" end="524" score="1.9E-44" />
</match>
......@@ -94,4 +94,19 @@
<ipr id="IPR031730" name="Carbamoyltransferase, C-terminal" type="Domain" />
<lcn start="362" end="524" score="1.9E-44" />
</match>
</protein>
<protein id="test-seq5" name="test-seq1" length="530" crc64="AA3D2FB934A14063">
<match id="PF00001" name="Carbam_trans_N" dbname="PFAM" status="T" evd="HMMPfam">
<ipr id="IPR003696" name="Carbamoyltransferase" type="Domain" />
<lcn start="10" end="63" score="2.6E-8" />
</match>
<match id="PF00002" name="Carbam_trans_N" dbname="PFAM" status="T" evd="HMMPfam">
<ipr id="IPR003696" name="Carbamoyltransferase" type="Domain" />
<lcn start="104" end="312" score="4.8E-17" />
</match>
<match id="PF00002" name="Carbam_trans_C" dbname="PFAM" status="T" evd="HMMPfam">
<ipr id="IPR031730" name="Carbamoyltransferase, C-terminal" type="Domain" />
<lcn start="362" end="524" score="1.9E-44" />
</match>
</protein>
\ No newline at end of file
# RADS version 2.1.2
# RADS version 2.2.0
# RADS Output v1
# run at Thu Mar 1 17:11:58 2018
# run at Fri Apr 20 13:50:57 2018
#
# query file: -
# database: /local/home/ckeme_01/projects/domainWorld/RADS/tests/integrationTests/interPro-test
# gap open penalty -50
# gap extension penalty -10
# matrix: /local/home/ckeme_01/.domainWorld/dsm/pfam-31.dsm
# all: false
# collapse: false
# ******************************************************************
# -------------------------------------------------------------------
......@@ -16,5 +20,6 @@ Domain arrangement: PF00001 PF00002 PF00003
300 1.00 test-seq1 530 PF00001 10 63 PF00002 104 312 PF00003 362 524
300 1.00 test-seq2 530 PF00001 10 63 PF00002 104 312 PF00003 362 524
190 0.69 test-seq3 530 PF00002 104 312 PF00003 362 524
170 0.64 test-seq5 530 PF00001 10 63 PF00002 104 312 PF00002 362 524
# RADS version 2.2.0
# RADS Output v1
# run at Fri Apr 20 14:19:09 2018
#
# query file: -
# database: /local/home/ckeme_01/projects/domainWorld/RADS/tests/integrationTests/interPro-test
# gap open penalty -50
# gap extension penalty -10
# matrix: /local/home/ckeme_01/.domainWorld/dsm/pfam-31.dsm
# all: false
# collapse: true
# ******************************************************************
# -------------------------------------------------------------------
Results for: manual entered query
Domain arrangement: PF00001 PF00002 PF00003
# score | normalized | SeqID | sequence length | domain arrangement
# -------------------------------------------------------------------
300 1.00 test-seq1 530 PF00001 10 63 PF00002 104 312 PF00003 362 524
300 1.00 test-seq2 530 PF00001 10 63 PF00002 104 312 PF00003 362 524
190 0.69 test-seq3 530 PF00002 104 312 PF00003 362 524
190 0.69 test-seq5 530 PF00001 10 63 PF00002 104 312 PF00002 362 524
......@@ -55,7 +55,13 @@
run diff <(grep -v '#' test3Res.txt) <(grep -v '#' results/test3Res.txt)
[ $status == 0 ]
run ../../build/rads -D PF00001 PF00002 PF00003 -m pfam-31.dsm -d interPro-test -o test4Res.txt -c
[ $status == 0 ]
run diff <(grep -v '#' test4Res.txt) <(grep -v '#' results/test4Res.txt)
[ $status == 0 ]
rm interPro-test.db interPro-test.da
rm test3Res.txt test2Res.txt
rm test3Res.txt test2Res.txt test4Res.txt
}
......@@ -32,6 +32,11 @@
BOOST_AUTO_TEST_SUITE(DB_Test)
namespace BSDL=BioSeqDataLib;
BOOST_AUTO_TEST_CASE( cleanDA_TEST)
{
}
/*
BOOST_AUTO_TEST_CASE( cleanDA_TEST)
{
......
......@@ -29,5 +29,5 @@
#include <boost/test/unit_test.hpp>
#include "db_Test.hpp"
#include "DBAccess_Test.hpp"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment