Commit f1797fc0 authored by Carsten Kemena's avatar Carsten Kemena

improving Code Structure

[makeRadsDB]
-using a class (DBCreator) to contain all the functionality to make the database
parent 44e4d60a
/*
* DBCreator.cpp
* @Author : Carsten Kemena (c.kemena@uni-muenster.de)
* @Link :
* @Date : 4/16/2018, 9:26:11 AM
*/
#include "DBCreator.hpp"
#include <boost/regex.hpp>
#include "../libs/BioSeqDataLib/src/sequence/SequenceSet.hpp"
#include "external/Output.hpp"
#include "external/SQLiteDB.hpp"
using namespace std;
// static
void
DBCreator::shortenEvalue_(std::string &evalue)
{
auto end = evalue.find('e');
auto start = end;
if (end != std::string::npos)
{
while (evalue[--start] == '0');
if (evalue[start] != '.')
++start;
if (start != end)
evalue.erase(evalue.begin()+start, evalue.begin()+end);
}
}
void
DBCreator::cleanDA_(BSDL::DomainArrangement<BSDL::Domain> &da, const std::set<string> &querySet)
{
// in case of large overlaps of domains only one domain is kept. The importance by database is: Pfam, Superfamily, Smart, Prosite, Gene3dD
std::sort (da.begin(), da.end(), [](const BSDL::Domain &a, const BSDL::Domain &b){return a.start() < b.start();} );
for (const string &keepName : querySet)
{
set<size_t> toDelete;
bool changed = true;
while (changed)
{
size_t nDomains = da.size();
for (size_t i=1; i<nDomains; ++i)
{
BSDL::Domain &dom1 = da[i-1];
BSDL::Domain &dom2 = da[i];
if ((dom1.end() > dom2.start()+this->filter_threshold_) && ((dom1.accession() == keepName) || (dom2.accession() == keepName)))
{
if (dom1.accession() != dom2.accession())
toDelete.insert(dom1.accession() == keepName ? i : i-1);
}
}
for (auto it=toDelete.rbegin(); it!=toDelete.rend(); ++it)
da.erase(da.begin() + (*it));
if (toDelete.empty())
changed = false;
toDelete.clear();
}
}
vector<string> importance = {"PF", "SS", "SM", "PS", "G3"};
for (const string &prefix : importance)
{
vector<size_t> toDelete;
bool changed = true;
while (changed)
{
size_t nDomains = da.size();
for (size_t i=1; i<nDomains; ++i)
{
BSDL::Domain &dom1 = da[i-1];
BSDL::Domain &dom2 = da[i];
string pre1 = dom1.accession().substr(0,2);
string pre2 = dom2.accession().substr(0,2);
if ((dom1.end() > dom2.start()+10) && ((pre1 == prefix) || (pre2 == prefix)))
{
if (pre1 == pre2)
{
if ((dom1.end() - dom1.start()) < (dom2.end() - dom2.start()))
toDelete.push_back(i);
else
toDelete.push_back(i-1);
}
else
toDelete.push_back(pre1 == prefix ? i : i-1);
}
}
for (int i=toDelete.size()-1; i>=0; --i)
da.erase(da.begin() + toDelete[i]);
if (toDelete.empty())
changed = false;
toDelete.clear();
}
}
}
void
DBCreator::addDomainArrangement_(BSDL::DomainArrangement<BSDL::Domain> &da, const std::string &proteinID, const std::string &proteinLength)
{
sort(da.begin(), da.end());
if (this->performFiltering_)
{
std::set<string> empty;
this->cleanDA_(da, empty);
}
string id = da[0].accession();
stringstream stream;
stream << std::scientific << da[0].evalue();
string eval = stream.str();
this->shortenEvalue_(eval);
string positions = proteinID + " " + proteinLength + " " + to_string(da[0].start()) + " " + to_string(da[0].end()) + " " + eval;
size_t nDomains = da.size();
for (size_t i=1; i <nDomains; ++i)
{
id += ";" + da[i].accession();
stream.str(std::string());
stream.clear();
stream << da[i].evalue();
string eval = stream.str();
this->shortenEvalue_(eval);
positions += " " + to_string(da[i].start()) + " " + to_string(da[i].end()) + " " + eval;
}
dbContent_[id].emplace_back(positions);
da.clear();
}
void
DBCreator::readInterPro(const fs::path &interProFile, const std::set<std::string> &dbNames)
{
auto itDBEnd = dbNames.end();
AlgorithmPack::Input inS;
inS.open(interProFile);
string line;
bool insertThisDomain = false;
string protein, length;
BSDL::DomainArrangement<BSDL::Domain> da;
string domainAcc;
string dbName;
// parse interpro annotation of uniprot
boost::regex protIDre("protein id=\"([^\"]+)\".*length=\"([0-9]+)\"");
boost::regex dbRe("<match id=\"([^\"]+)\".*dbname=\"([^\"]+)\"");
boost::regex positionsRe("start=\"([0-9]+)\" end=\"([0-9]+)\" score=\"([0-9eE.-]+)\"");
boost::match_results<std::string::const_iterator> m;
while (getline(inS, line))
{
// identify protein
if (boost::regex_search(line, m, protIDre))
{
protein = string(m[1].first, m[1].second);
length = string(m[2].first, m[2].second);
}
else
{
// identify domain
if (boost::regex_search (line, m, dbRe))
{
string dom(m[1].first, m[1].second);
string database(m[2].first, m[2].second);
if (dbNames.find(database) != itDBEnd)
{
domainAcc = dom;
dbName = database;
insertThisDomain = true;
}
else
insertThisDomain = false;
}
else
{
// get positions of domain
if (insertThisDomain)
{
if (boost::regex_search (line, m, positionsRe))
{
string start(m[1].first, m[1].second);
string end(m[2].first, m[2].second);
string score(m[3].first, m[3].second);
da.emplace_back(domainAcc, stoul(start), stoul(end), stod(score));
}
}
// end of protein, store in dbContent
if ((line.find("</protein>") != string::npos) && (!da.empty()))
this->addDomainArrangement_(da, protein, length);
}
}
}
inS.close();
}
void
DBCreator::readAnnotationFile(const fs::path &annotationFile, const fs::path &sequenceFile)
{
BSDL::DomainArrangementSet<BSDL::Domain> domSet;
domSet.read(annotationFile);
// sequences are only needed for the protein length.
BSDL::SequenceSet<BSDL::Sequence<>> seqSet;
if (!sequenceFile.empty())
seqSet.read(sequenceFile);
std::map<std::string, size_t> seqLengths;
for (auto &seq : seqSet)
seqLengths[seq.name()] = seq.size();
// read each sequence seperately
for (auto pair : domSet)
{
string seqLength = sequenceFile.empty() ? "0" : to_string(seqLengths[pair.first]);
this->addDomainArrangement_(pair.second, pair.first, seqLength);
}
}
std::pair<size_t, size_t>
DBCreator::write(const fs::path &prefix, const std::map<std::string, std::string> &info)
{
AlgorithmPack::Output out;
fs::path dbFile = prefix;
dbFile.replace_extension(".da");
out.open(dbFile);
SQLiteDB db;
dbFile.replace_extension(".db");
db.open(dbFile);
// Create tables
db.exec("CREATE TABLE IF NOT EXISTS domain('id' INT PRIMARY KEY, 'accession' VARCHAR(45) NULL,'position' INT NULL)");
db.exec("CREATE TABLE IF NOT EXISTS info('id' INT PRIMARY KEY, 'argument' VARCHAR(20), 'value' VARCHAR(200) NULL)");
db.startTransaction();
size_t nSeqs = 0;
for (const auto pair : info)
db.exec("INSERT INTO info VALUES(NULL,'"+ pair.first + "','" + pair.second + "')");
for (auto &arrangements : dbContent_)
{
std::streampos filePos = out.tellp();
auto tokens = BSDL::split(arrangements.first, ";");
set <string> doms;
for (auto &token : tokens)
doms.insert(token);
for (auto &token : doms)
db.exec("INSERT INTO domain VALUES(NULL,'"+ token + "','" + to_string(filePos) + "')");
nSeqs += arrangements.second.size();
out << ">" << arrangements.first << "\n";
for (auto arrangement : arrangements.second)
out << arrangement << "\n";
}
out.close();
db.endTransaction();
db.exec("CREATE INDEX IF NOT EXISTS domIdx ON domain(accession)");
return std::pair<size_t, size_t>(nSeqs, dbContent_.size());
}
/*
* DBCreator.hpp
* @Author : Carsten Kemena (c.kemena@uni-muenster.de)
* @Link :
* @Date : 4/16/2018, 9:26:11 AM
*/
#include <map>
#include <set>
#include <string>
#include <vector>
#include <utility>
#include <boost/filesystem.hpp>
#include "../libs/BioSeqDataLib/src/DomainModule.hpp"
namespace BSDL = BioSeqDataLib;
namespace fs = boost::filesystem;
class DBCreator
{
private:
bool performFiltering_;
int filter_threshold_;
// all DAs need to be stored first as they need to be sorted according to the domain arrangement
// The key is a ';'seperated list of domain accessions, the value a string with of the form
// protein_id length start end eval
// the start end eval are repeated for as many domains as the arrangement contains
std::map<std::string, std::vector<std::string> > dbContent_;
/**
* @brief Remove trainling zeros of an evalue.
*
* @param evalue The evalue to shorten.
*/
void static
shortenEvalue_(std::string &evalue);
void
cleanDA_(BSDL::DomainArrangement<BSDL::Domain> &da, const std::set<std::string> &querySet);
/**
* @brief Adds a single domain Arrangement to the database
*
* @param da The DA to add
* @param proteinID The id of the protein
* @param proteinLength the length of the protein
*
*/
void addDomainArrangement_(BSDL::DomainArrangement<BSDL::Domain> &da, const std::string &proteinID, const std::string &proteinLength);
public:
DBCreator()
{
performFiltering_ = false;
filter_threshold_ = 10;
}
void
filter(bool doFilter)
{
performFiltering_ = doFilter;
}
void
threshold(int t)
{
filter_threshold_ = t;
}
/**
* @brief Reads a standard domain annotation file.
*
* The sequence file is needed to properly set the sequence length in the database. If none is provided the sequence length will be zero.
* @param annotationFile The annotation file
* @param sequenceFile The sequence file
*/
void
readAnnotationFile(const fs::path &annotationFile, const fs::path &sequenceFile=0);
/**
* @brief Reads the interpro file.
*
* @param inputFile The InterPro file path
* @param dbNames The database to include
*/
void
readInterPro(const fs::path &interProFile, const std::set<std::string> &dbNames);
/**
* @brief Writes the whole database to file
*
* @param prefix The prefix used to create the database files.
* @param info Additional information that is added to the info table.
* @return
*/
std::pair<size_t, size_t>
write(const fs::path &prefix, const std::map<std::string, std::string> &info);
};
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment