Commit b1944f4a authored by Carsten Kemena's avatar Carsten Kemena

internal code improvements

-moved rads non option parsing code to DBAccess class
-removed now unsupported option of makedb to add several interpro databases to rads database
parent f1797fc0
v. 2.2.0
- internal code improvements
v. 2.1.3
- added gop/gep information to output file
- added commandline and rads version to databse file in additional table
......
......@@ -3,8 +3,8 @@ cmake_minimum_required(VERSION 2.6)
project (RADS C CXX)
SET(MAJOR_VERSION 2)
SET(MINOR_VERSION 1)
SET(PATCH_VERSION 3)
SET(MINOR_VERSION 2)
SET(PATCH_VERSION 0)
SET(CMAKE_CXX_FLAGS_COVERAGE
......@@ -97,14 +97,14 @@ ${BSDL_PATH}utility/DSM.cpp ${BSDL_PATH}utility/stringHelpers.cpp
)
SET(rads_src ${PROJECT_SOURCE_DIR}/src/rads.cpp ${PROJECT_SOURCE_DIR}/src/external/SQLiteDB.cpp ${PROJECT_SOURCE_DIR}/src/db.cpp ${BSDL_src} ${BSDL_PATH}/external_interfaces/domainProgs.cpp)
SET(rads_src ${PROJECT_SOURCE_DIR}/src/rads.cpp ${PROJECT_SOURCE_DIR}/src/DBAccess.cpp ${PROJECT_SOURCE_DIR}/src/external/SQLiteDB.cpp ${BSDL_src} ${BSDL_PATH}/external_interfaces/domainProgs.cpp)
SET(rads_exe rads )
ADD_EXECUTABLE(${rads_exe} ${rads_src})
target_link_libraries(${rads_exe}
${Boost_LIBRARIES} ${SQLITE3_LIBRARY} ${ZLIB_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}
)
SET(makeDB_src ${PROJECT_SOURCE_DIR}/src/makeRadsDB.cpp ${PROJECT_SOURCE_DIR}/src/external/SQLiteDB.cpp ${PROJECT_SOURCE_DIR}/src/db.cpp ${BSDL_src})
SET(makeDB_src ${PROJECT_SOURCE_DIR}/src/makeRadsDB.cpp ${PROJECT_SOURCE_DIR}/src/external/SQLiteDB.cpp ${PROJECT_SOURCE_DIR}/src/DBCreator.cpp ${BSDL_src})
SET(makeDB_exe makeRadsDB)
ADD_EXECUTABLE(${makeDB_exe} ${makeDB_src})
target_link_libraries(${makeDB_exe}
......
/*
* DBAccess.cpp
* @Author : Carsten Kemena (c.kemena@uni-muenster.de)
* @Link :
* @Date : 4/17/2018, 8:50:02 AM
*
* This file is part of RADS.
*
* RADS is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* RADS is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with RADS. If not, see <http://www.gnu.org/licenses/>.
*/
#include "DBAccess.hpp"
using namespace std;
DBAccess::DBAccess(const fs::path &prefix, const fs::path &matrix)
{
open(prefix);
setMatrix(matrix);
}
void
DBAccess::setMatrix(const fs::path &matrix)
{
similarityMatrix_.read(matrix);
similarityMatrix_.useNegative(true);
}
void DBAccess::open(const fs::path &prefix)
{
// open connection to database
fs::path dbFile = prefix;
dbFile.replace_extension(".db");
index_.open(dbFile.string(), SQLITE_OPEN_READONLY);
dbFile.replace_extension(".da");
arrangementDB_.open(dbFile);
}
void
DBAccess::search(const BSDL::DomainArrangement<BSDL::Domain> &queryDA, BSDL::AlignmentMatrix<int, BSDL::DSM> &matrix, bool all, int scoreThres,
std::multimap<int, RadsHit, std::greater<int> > &results)
{
string query = "Select position from domain where accession in ('" + queryDA[0].accession() + "'";
size_t nDomains = queryDA.size();
for (size_t i=1; i<nDomains; ++i)
query += ",'" + queryDA[i].accession() + "'";
query += ")";
std::set<std::streampos> positions;
auto storePositions = std::bind([](sqlite3_stmt *stmt, std::set<std::streampos> &positions){positions.emplace(stoul(reinterpret_cast<const char*>(sqlite3_column_text(stmt, 0))));}, std::placeholders::_1, std::ref(positions));
this->index_.exec(query, storePositions);
std::set<string> queryDomSet;
for (const auto &domain : queryDA)
queryDomSet.insert(domain.accession());
size_t queryLength = queryDA.size();
// read the arrangements from file
//BSDL::DomainArrangementSet<BSDL::Domain> targetDAs;
string line;
for (auto &pos : positions)
{
// get the domain arrangement at the given position
this->arrangementDB_.seekg(pos, std::ios_base::beg);
getline(this->arrangementDB_, line);
auto domains = BSDL::split(line, ">;");
size_t nDomains = domains.size();
BSDL::DomainArrangement<BSDL::Domain> targetDA;
for (auto &domain : domains)
targetDA.emplace_back(domain, 0, 0, 0);
// if all domains are required check for that
if (all)
{
bool containsAll = true;
auto counts = BSDL::domainCounts(targetDA);
for (auto &dom: queryDomSet)
{
if (counts.count(dom)==0)
{
containsAll = false;
break;
}
}
// if not all domains are found, go on to next domain arrangement
if (!containsAll)
continue;
}
// calculate score for the domain arrangement
int score;
double normScore;
size_t targetLength = targetDA.size();
matrix.gotoh(queryDA, targetDA);
score = matrix.score();
if (score < scoreThres)
continue;
int minScore = (queryLength+targetLength) * matrix.gep();
normScore = min((score-minScore)*1.0/(queryLength*100-minScore),(score-minScore)*1.0/(targetLength*100-minScore));
// read all sequences having the same domain arrangement
while(getline(this->arrangementDB_, line))
{
if (line[0] =='>')
break;
const char *c_line = line.c_str();
size_t pos = line.find(' ');
string name(line.begin(), line.begin()+pos);
char *tmp;
c_line += pos;
//auto daIt = targetDAs.emplace(name, BSDL::DomainArrangement<BSDL::Domain>());
auto hitIt = results.emplace(std::piecewise_construct, std::forward_as_tuple(score), std::forward_as_tuple(name, normScore, strtoul(c_line, &tmp, 10)));
size_t start, end;
double eval;
c_line = tmp;
for (size_t i=0; i<nDomains; ++i)
{
start = strtoul(c_line, &tmp, 10);
c_line = tmp;
end = strtoul(c_line, &tmp, 10);
c_line = tmp;
eval = strtod(c_line, &tmp);
c_line = tmp;
hitIt->second.da.emplace_back(domains[i], start, end, eval);
}
}
}
/*RadsHit x;
x.target_name;*/
}
/*
* DBAccess.hpp
* @Author : Carsten Kemena (c.kemena@uni-muenster.de)
* @Link :
* @Date : 4/17/2018, 8:50:02 AM
*
* This file is part of RADS.
*
* RADS is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* RADS is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with RADS. If not, see <http://www.gnu.org/licenses/>.
*/
#include <boost/filesystem.hpp>
// BioSeqDataLib
#include "../libs/BioSeqDataLib/src/DomainModule.hpp"
#include "../libs/BioSeqDataLib/src/align/AlignmentMatrix.hpp"
#include "../libs/BioSeqDataLib/src/external/Input.hpp"
#include "external/SQLiteDB.hpp"
namespace BSDL = BioSeqDataLib;
namespace fs = boost::filesystem;
struct RadsHit
{
std::string targetName;
double normalized;
size_t length;
BSDL::DomainArrangement<BSDL::Domain> da;
RadsHit() : targetName(""), normalized(0), length(0)
{}
RadsHit(std::string t, double n, size_t l) : targetName(t), normalized(n), length(l)
{}
};
class DBAccess
{
private:
SQLiteDB index_;
AlgorithmPack::Input arrangementDB_;
BSDL::DSM similarityMatrix_;
public:
DBAccess()
{}
DBAccess(const fs::path &prefix, const fs::path&matrix);
void
open(const fs::path &prefix);
void
setMatrix(const fs::path &matrix);
void
search(const BSDL::DomainArrangement<BSDL::Domain> &query, BSDL::AlignmentMatrix<int, BSDL::DSM> &matrix, bool all, int scoreThres,
std::multimap<int, RadsHit, std::greater<int> > &results);
};
......@@ -3,6 +3,21 @@
* @Author : Carsten Kemena (c.kemena@uni-muenster.de)
* @Link :
* @Date : 4/16/2018, 9:26:11 AM
*
* This file is part of RADS.
*
* RADS is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* RADS is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with RADS. If not, see <http://www.gnu.org/licenses/>.
*/
#include "DBCreator.hpp"
......@@ -31,9 +46,9 @@ DBCreator::shortenEvalue_(std::string &evalue)
}
}
/*
void
DBCreator::cleanDA_(BSDL::DomainArrangement<BSDL::Domain> &da, const std::set<string> &querySet)
DBCreator::cleanDA_(BSDL::DomainArrangement<BSDL::Domain> &da)
{
// in case of large overlaps of domains only one domain is kept. The importance by database is: Pfam, Superfamily, Smart, Prosite, Gene3dD
std::sort (da.begin(), da.end(), [](const BSDL::Domain &a, const BSDL::Domain &b){return a.start() < b.start();} );
......@@ -99,17 +114,17 @@ DBCreator::cleanDA_(BSDL::DomainArrangement<BSDL::Domain> &da, const std::set<st
}
}
}
*/
void
DBCreator::addDomainArrangement_(BSDL::DomainArrangement<BSDL::Domain> &da, const std::string &proteinID, const std::string &proteinLength)
{
sort(da.begin(), da.end());
if (this->performFiltering_)
/*if (this->performFiltering_)
{
std::set<string> empty;
this->cleanDA_(da, empty);
}
}*/
string id = da[0].accession();
stringstream stream;
stream << std::scientific << da[0].evalue();
......@@ -133,9 +148,9 @@ DBCreator::addDomainArrangement_(BSDL::DomainArrangement<BSDL::Domain> &da, cons
void
DBCreator::readInterPro(const fs::path &interProFile, const std::set<std::string> &dbNames)
DBCreator::readInterPro(const fs::path &interProFile, const std::string &includeDB)
{
auto itDBEnd = dbNames.end();
//auto itDBEnd = dbNames.end();
AlgorithmPack::Input inS;
inS.open(interProFile);
......@@ -169,7 +184,7 @@ DBCreator::readInterPro(const fs::path &interProFile, const std::set<std::string
{
string dom(m[1].first, m[1].second);
string database(m[2].first, m[2].second);
if (dbNames.find(database) != itDBEnd)
if (includeDB == database)
{
domainAcc = dom;
dbName = database;
......
......@@ -3,6 +3,21 @@
* @Author : Carsten Kemena (c.kemena@uni-muenster.de)
* @Link :
* @Date : 4/16/2018, 9:26:11 AM
*
* This file is part of RADS.
*
* RADS is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* RADS is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with RADS. If not, see <http://www.gnu.org/licenses/>.
*/
#include <map>
......@@ -21,8 +36,9 @@ namespace fs = boost::filesystem;
class DBCreator
{
private:
bool performFiltering_;
int filter_threshold_;
/*bool performFiltering_;
int filter_threshold_;*/
// all DAs need to be stored first as they need to be sorted according to the domain arrangement
// The key is a ';'seperated list of domain accessions, the value a string with of the form
// protein_id length start end eval
......@@ -37,8 +53,8 @@ class DBCreator
void static
shortenEvalue_(std::string &evalue);
void
cleanDA_(BSDL::DomainArrangement<BSDL::Domain> &da, const std::set<std::string> &querySet);
//void
//cleanDA_(BSDL::DomainArrangement<BSDL::Domain> &da, const std::set<std::string> &querySet);
/**
* @brief Adds a single domain Arrangement to the database
......@@ -53,11 +69,11 @@ class DBCreator
public:
DBCreator()
{
performFiltering_ = false;
filter_threshold_ = 10;
// performFiltering_ = false;
//filter_threshold_ = 10;
}
void
/* void
filter(bool doFilter)
{
performFiltering_ = doFilter;
......@@ -67,7 +83,7 @@ class DBCreator
threshold(int t)
{
filter_threshold_ = t;
}
}*/
/**
* @brief Reads a standard domain annotation file.
......@@ -86,7 +102,7 @@ class DBCreator
* @param dbNames The database to include
*/
void
readInterPro(const fs::path &interProFile, const std::set<std::string> &dbNames);
readInterPro(const fs::path &interProFile, const std::string &dbName);
/**
* @brief Writes the whole database to file
......
/*
* db.cpp
*
* Created on: Jan 25, 2016
* Author: ckeme_01
* Copyright: 2016
*
* This file is part of RADS.
*
* RADS is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* RADS is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with RADS. If not, see <http://www.gnu.org/licenses/>.
*/
#include "db.hpp"
using namespace std;
namespace BSDL = BioSeqDataLib;
void
cleanDA(BSDL::DomainArrangement<BSDL::Domain> &da, const std::set<string> &querySet, unsigned int threshold)
{
// in case of large overlaps of domains only one domain is kept. The importance by database is: Pfam, Superfamily, Smart, Prosite, Gene3dD
std::sort (da.begin(), da.end(), [](const BSDL::Domain &a, const BSDL::Domain &b){return a.start() < b.start();} );
for (const string &keepName : querySet)
{
set<size_t> toDelete;
bool changed = true;
while (changed)
{
size_t nDomains = da.size();
for (size_t i=1; i<nDomains; ++i)
{
BSDL::Domain &dom1 = da[i-1];
BSDL::Domain &dom2 = da[i];
if ((dom1.end() > dom2.start()+threshold) && ((dom1.accession() == keepName) || (dom2.accession() == keepName)))
{
if (dom1.accession() != dom2.accession())
toDelete.insert(dom1.accession() == keepName ? i : i-1);
}
}
for (auto it=toDelete.rbegin(); it!=toDelete.rend(); ++it)
da.erase(da.begin() + (*it));
if (toDelete.empty())
changed = false;
toDelete.clear();
}
}
vector<string> importance = {"PF", "SS", "SM", "PS", "G3"};
for (const string &prefix : importance)
{
vector<size_t> toDelete;
bool changed = true;
while (changed)
{
size_t nDomains = da.size();
for (size_t i=1; i<nDomains; ++i)
{
BSDL::Domain &dom1 = da[i-1];
BSDL::Domain &dom2 = da[i];
string pre1 = dom1.accession().substr(0,2);
string pre2 = dom2.accession().substr(0,2);
if ((dom1.end() > dom2.start()+10) && ((pre1 == prefix) || (pre2 == prefix)))
{
if (pre1 == pre2)
{
if ((dom1.end() - dom1.start()) < (dom2.end() - dom2.start()))
toDelete.push_back(i);
else
toDelete.push_back(i-1);
}
else
toDelete.push_back(pre1 == prefix ? i : i-1);
}
}
for (int i=toDelete.size()-1; i>=0; --i)
da.erase(da.begin() + toDelete[i]);
if (toDelete.empty())
changed = false;
toDelete.clear();
}
}
}
/*
* db.hpp
*
* Created on: Jan 25, 2016
* Author: ckeme_01
* Copyright: 2016
*
* This file is part of RADS.
*
* RADS is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* RADS is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with RADS. If not, see <http://www.gnu.org/licenses/>.
*/
#ifndef SRC_EXTERNAL_DB_HPP_
#define SRC_EXTERNAL_DB_HPP_
// C++
#include <map>
#include <set>
#include <string>
#include <vector>
#include <fstream>
#include <iostream>
// Boost
#include <boost/program_options.hpp>
#include <boost/filesystem.hpp>
// BSDL header
#include "../libs/BioSeqDataLib/src/DomainModule.hpp"
// other
#include "external/SQLiteDB.hpp"
namespace BSDL = BioSeqDataLib;
void
cleanDA(BSDL::DomainArrangement<BSDL::Domain> &da, const std::set<std::string> &querySet, unsigned int threshold);
#endif /* SRC_EXTERNAL_DB_HPP_ */
This diff is collapsed.
This diff is collapsed.
......@@ -13,7 +13,7 @@ FUNCTION(PREPEND var prefix)
SET(${var} "${listVar}" PARENT_SCOPE)
ENDFUNCTION(PREPEND)
SET(tests_src ./unitTests/tests.cpp ../src/external/SQLiteDB.cpp ../src/db.cpp ${BSDL_src})
SET(tests_src ./unitTests/tests.cpp ../src/external/SQLiteDB.cpp ${BSDL_src})
SET(tests_exe tests)
ADD_EXECUTABLE(${tests_exe} ${tests_src})
target_link_libraries(${tests_exe}
......
......@@ -39,6 +39,24 @@
<lcn start="362" end="524" score="1.9E-44" />
</match>
</protein>
<protein id="test-seq45" name="test-seq13" length="530" crc64="AA3D2FB934A14063">
<match id="PF00031" name="Carbam_trans_N" dbname="PFAM" status="T" evd="HMMPfam">
<ipr id="IPR003696" name="Carbamoyltransferase" type="Domain" />
<lcn start="10" end="63" score="2.6E-8" />
</match>
<match id="PF00032" name="Carbam_trans_N" dbname="PFAM" status="T" evd="HMMPfam">
<ipr id="IPR003696" name="Carbamoyltransferase" type="Domain" />
<lcn start="104" end="312" score="4.8E-17" />
</match>
<match id="PF00033" name="Carbam_trans_C" dbname="PFAM" status="T" evd="HMMPfam">
<ipr id="IPR031730" name="Carbamoyltransferase, C-terminal" type="Domain" />
<lcn start="362" end="524" score="1.9E-44" />
</match>
<match id="PF00033s" name="Carbam_trans_C" dbname="PFAMdgf" status="T" evd="HMMPfam">
<ipr id="IPR031730" name="Carbamoyltransferase, C-terminal" type="Domain" />
<lcn start="362" end="524" score="1.9E-44" />
</match>
</protein>
<protein id="test-seq2" name="test-seq2" length="530" crc64="AA3D2FB934A14063">
<match id="PF00002" name="Carbam_trans_N" dbname="PFAM" status="T" evd="HMMPfam">
......@@ -55,7 +73,7 @@
</match>
</protein>
<protein id="test-seq3" name="test-seq3" length="530" crc64="AA3D2FB934A14063">
<protein id="test-seq3" name="test-seq3" length="530" crc64="AA3D2FB934A14063">
<match id="PF00002" name="Carbam_trans_N" dbname="PFAM" status="T" evd="HMMPfam">
<ipr id="IPR003696" name="Carbamoyltransferase" type="Domain" />
<lcn start="104" end="312" score="4.8E-17" />
......@@ -64,4 +82,16 @@
<ipr id="IPR031730" name="Carbamoyltransferase, C-terminal" type="Domain" />
<lcn start="362" end="524" score="1.9E-44" />
</match>
</protein>
<protein id="test-seqnotin" name="test-seq3" length="530" crc64="AA3D2FB934A14063">
<match id="PF00002" name="Carbam_trans_N" dbname="PFAMS" status="T" evd="HMMPfam">
<ipr id="IPR003696" name="Carbamoyltransferase" type="Domain" />
<lcn start="104" end="312" score="4.8E-17" />
</match>
<match id="PF00003" name="Carbam_trans_C" dbname="PFAMS" status="T" evd="HMMPfam">
<ipr id="IPR031730" name="Carbamoyltransferase, C-terminal" type="Domain" />
<lcn start="362" end="524" score="1.9E-44" />
</match>
</protein>
\ No newline at end of file
......@@ -55,5 +55,5 @@
run diff <(grep -v '#' test3Res.txt) <(grep -v '#' results/test3Res.txt)
[ $status == 0 ]
rm interPro-test.db interPro-test.da
#rm interPro-test.db interPro-test.da
}
......@@ -23,7 +23,7 @@
#include <boost/test/unit_test.hpp>
#include "../../src/db.hpp"
//#include "../../src/db.hpp"
#include "../../libs/BioSeqDataLib/src/DomainModule.hpp"
#ifndef DB_TEST_HPP_
......@@ -32,7 +32,7 @@
BOOST_AUTO_TEST_SUITE(DB_Test)
namespace BSDL=BioSeqDataLib;
/*
BOOST_AUTO_TEST_CASE( cleanDA_TEST)
{
BSDL::DomainArrangement<BSDL::Domain> da;
......@@ -57,7 +57,7 @@ BOOST_AUTO_TEST_CASE( cleanDA_TEST)
BOOST_CHECK_EQUAL(da2[0].accession(), "SSF56235");
BOOST_CHECK_EQUAL(da2[1].accession(), "PF00733");
}
}*/
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment