Commit 2951cbd5 authored by Carsten Kemena's avatar Carsten Kemena

reduced backward

parent fdd8d6e4
......@@ -5,49 +5,60 @@ namespace fs = boost::filesystem;
namespace BSDL = BioSeqDataLib;
void
RadiantDB::open(const fs::path &path)
RadiantDB::read(const fs::path &path)
{
ifstream in(path.string()+".index", ios::in | ios::binary);
PrefixType prefix;
std::streampos pos;
while (in.read((char*)&prefix, sizeof(PrefixType)))
database_.clear();
ifstream fin(path.string(), ios::in | ios::binary);
if (!fin.is_open())
throw std::runtime_error("Error! Could not open database file: " + path.string() + "!");
size_t dbLength = 0;
fin.read((char*)&dbLength, sizeof(size_t));
database_.reserve(dbLength);
for (size_t i=0; i<dbLength; ++i)
{
in.read((char*)&pos, sizeof(std::streampos));
index_[prefix] = pos;
// read prefix
PrefixType prefix;
fin.read((char*)&prefix, sizeof(PrefixType));
auto it = database_.emplace(std::move(prefix), vector<SuffixAcc>()).first;
// read suffixe and accession from file
size_t numSuffixes;
fin.read((char*)&numSuffixes, sizeof(size_t));
it->second.resize(numSuffixes);
fin.read((char*)&(it->second[0]), numSuffixes*(sizeof(SuffixAcc)));
}
dbFile_.open(path.string(), ios::in | ios::binary);
fin.close();
}
void
RadiantDB::build(const fs::path &path, bool forward)
RadiantDB::build(const fs::path &inPath, bool forward, const fs::path &outPath)
{
/*std::ofstream fout(outPath, std::ios::out | std::ios::binary);
size_t val= database.size();
fout.write((char*)&val, sizeof(size_t));
for (auto it=database.begin(); it!=database.end(); ++it)
{
size_t val= it->second.size();
fout.write((char*)&it->first, sizeof(PrefixType));
fout.write((char*)&val, sizeof(size_t));
for (auto it2=it->second.begin(); it2!=it->second.end(); ++it2)
{
fout.write((char*)&(it2->first), sizeof(CodedSuffix));
fout.write((char*)&(it2->second), sizeof(unsigned short));
}
}
fout.close();*/
}
int
RadiantDB::getDomID(const PrefixType &prefix, const CodedSuffix &suffix) const
short
RadiantDB::getDomID(const PrefixType &prefix, const CodedSuffix &suffix, bool &position) const
{
auto it = database_.find(prefix);
// read new suffixes from file if necessary
if (it == database_.end())
{
auto y = index_.find(prefix);
if (y != index_.end())
{
it = database_.emplace(prefix, std::vector<SuffixAcc>()).first;
dbFile_.seekg(y->second);
// read suffixe and accession from file
size_t numSuffixes;
dbFile_.read((char*)&numSuffixes, sizeof(size_t));
it->second.resize(numSuffixes);
dbFile_.read((char*)&(it->second[0]), numSuffixes*(sizeof(SuffixAcc)));
}
}
// perform search if necessary
if (it != database_.end())
{
auto it2 = lower_bound(it->second.begin(), it->second.end(), suffix);
......@@ -55,10 +66,8 @@ RadiantDB::getDomID(const PrefixType &prefix, const CodedSuffix &suffix) const
{
if (it2->suffix.suffix == suffix.suffix)
{
position = it2->suffix.position;
return it2->accession;
//auto tmp = it2->accession;
//if (tmp != 0)
// return tmp
}
else
{
......@@ -67,8 +76,11 @@ RadiantDB::getDomID(const PrefixType &prefix, const CodedSuffix &suffix) const
{
--it2;
if (it2->accession == tmp)
{
position = it2->suffix.position;
return tmp;
//assignment.emplace(l-(multiplyer*j), std::pair<unsigned short, bool>(tmp, +it2->suffix.position));
}
}
}
}
......
......@@ -17,6 +17,9 @@
namespace fs = boost::filesystem;
class RadiantDB
{
private:
......@@ -29,6 +32,8 @@ private:
void
write_(const fs::path &path, const std::unordered_map<PrefixType, std::map<CodedSuffix, unsigned short > > &database) const;
void
close_();
public:
......@@ -37,22 +42,24 @@ public:
RadiantDB(const fs::path &path)
{
open(path);
read(path);
}
~RadiantDB()
{}
{
dbFile_.close();
}
void
open(const fs::path &path);
read(const fs::path &path);
void
build(const fs::path &path, bool forward=true);
build(const fs::path &inPath, bool forward, const fs::path &outPath);
int
getDomID(const PrefixType &prefix, const CodedSuffix &suffix) const;
short
getDomID(const PrefixType &prefix, const CodedSuffix &suffix, bool &position) const;
};
......
......@@ -20,23 +20,7 @@ namespace fs = boost::filesystem;
void
write2file(std::unordered_map<PrefixType, std::map<CodedSuffix, unsigned short > > &database, const std::string &outFile)
{
/*std::unordered_map<PrefixType, vector<SuffixAcc> > database_f;
for (auto it=database.begin(); it!=database.end(); ++it)
{
database_f[it->first] = vector<SuffixAcc>();
auto &vec = database_f[it->first];
for (const auto &elem : it->second)
vec.emplace_back(std::move(elem.first), std::move(elem.second));
it->second.clear();
}
std::ofstream ofs(outFile);
boost::archive::binary_oarchive oarch(ofs);
oarch << database_f;
*/
/*
std::ofstream fout(outFile, std::ios::out | std::ios::binary);
size_t val= database.size();
fout.write((char*)&val, sizeof(size_t));
......@@ -52,8 +36,8 @@ write2file(std::unordered_map<PrefixType, std::map<CodedSuffix, unsigned short >
}
}
fout.close();
*/
/*
std::ofstream fout(outFile, std::ios::out | std::ios::binary);
std::ofstream fout_index(outFile+".index", std::ios::out | std::ios::binary);
size_t val= database.size();
......@@ -73,7 +57,7 @@ write2file(std::unordered_map<PrefixType, std::map<CodedSuffix, unsigned short >
}
}
fout.close();
fout_index.close();
fout_index.close();*/
}
......
......@@ -155,6 +155,8 @@ void splitSequence(S &seq, D &db, unsigned int windowSize, bool reverse)
bool last = false;
size_t k = 0;
size_t limit = (seq.size() >= windowSize) ? seq.size() - windowSize + 1: 0 ;
if (reverse && (limit > (windowSize*2)))
limit = windowSize * 2;
while (k < limit)
{
if (last)
......
......@@ -23,16 +23,9 @@
#include "../libs/BioSeqDataLib/src/external_interfaces/domainProgs.hpp"
#include "common.hpp"
#include "RadiantDB.hpp"
#include "version.hpp"
#include <boost/archive/binary_iarchive.hpp>
#include <boost/serialization/serialization.hpp>
#include <boost/serialization/map.hpp>
#include <boost/serialization/unordered_map.hpp>
#include <boost/serialization/string.hpp>
#include <boost/serialization/vector.hpp>
using namespace std;
using std::chrono::system_clock;
namespace po = boost::program_options;
......@@ -90,7 +83,68 @@ words2arrangement(multiset<pair<size_t, std::pair<unsigned short, bool> > > &wor
return arrangement;
}
BSDL::DomainArrangement<BSDL::Domain>
words2arrangement2(multiset<pair<size_t, std::pair<unsigned short, bool> > > &wordList, int max_dist, float min_count)
{
struct Helper
{
size_t start;
size_t end;
float score;
Helper(size_t s, size_t e, size_t v) : start(s), end(e), score(v)
{}
};
BSDL::DomainArrangement<BSDL::Domain> arrangement;
std::map<unsigned short, Helper> counter;
auto itEnd = wordList.end();
for (auto it = wordList.begin(); it != itEnd; ++it)
{
auto helper = counter.find(it->second.first);
if (helper != counter.end())
{
// the last hit to this domain was too far away.
if ((it->first - helper->second.end) > max_dist)
{
if ((helper->second.score*100.0/(helper->second.end-helper->second.start)) >= min_count)
//if (helper->second.score >= min_count)
{
string acc = to_string(it->second.first);
while (acc.size() < 5)
acc = "0" + acc;
arrangement.emplace_back("PF" + acc, helper->second.start, helper->second.end, (helper->second.end-helper->second.start)/helper->second.score);
}
helper->second.start=helper->second.end = it->first;
helper->second.score = 1;
}
else
{
if (it->first != helper->second.end)
{
++helper->second.score;
helper->second.end = it->first;
}
}
}
else
counter.emplace(std::piecewise_construct, std::forward_as_tuple(it->second.first), std::forward_as_tuple(it->first, it->first, 1));
}
std::sort(arrangement.begin(), arrangement.end());
std::set<size_t> toDelete;
for (auto i = 1; i<arrangement.size(); ++i)
{
if ((arrangement[i-1].end() > arrangement[i].start()) && ((arrangement[i-1].end() - arrangement[i].start()) > 10))
{
if (arrangement[i-1].evalue() < arrangement[i].evalue())
arrangement.erase(arrangement.begin() + (i-1));
else
arrangement.erase(arrangement.begin() + i);
--i;
}
}
return arrangement;
}
/**
* \brief Reads RADIANT database.
......@@ -100,13 +154,6 @@ words2arrangement(multiset<pair<size_t, std::pair<unsigned short, bool> > > &wor
void
readDatabase(const fs::path &inFile, std::unordered_map<PrefixType, vector<SuffixAcc> > &database)
{
/*
database.clear();
std::ifstream ifs(inFile.string());
boost::archive::binary_iarchive iarch(ifs);
iarch >> database;
*/
database.clear();
ifstream fin(inFile.string(), ios::in | ios::binary);
if (!fin.is_open())
......@@ -132,27 +179,28 @@ readDatabase(const fs::path &inFile, std::unordered_map<PrefixType, vector<Suffi
fin.read((char*)&(it->second[0]), numSuffixes*(sizeof(SuffixAcc)));
}
fin.close();
cout << "DB-LENGTH" << database.size() << "\n";
}
void
assignWords(const fs::path &inFile, BSDL::SequenceSet<BSDL::Sequence<> > &seqSet, vector<multiset<pair<size_t, pair<unsigned short, bool> > > > &assignments, bool reverse)
{
std::unordered_map<PrefixType, vector<SuffixAcc> > database;
readDatabase(inFile, database);
RadiantDB database;
database.read(inFile);
size_t nSeqs = seqSet.size();
assignments.resize(nSeqs);
const size_t wordSize = 18;
PrefixType prefix;
CodedSuffix suffix;
size_t l = 0;
int multiplyer = reverse ? 1 : -1;
set<PrefixType> all;
// #pragma omp parallel for schedule(static,100) private(prefix, suffix)
for (size_t i=0; i<nSeqs; ++i)
{
auto &seq = seqSet[i];
size_t l = 0;
if (reverse)
{
std::reverse(seq.begin(), seq.end());
......@@ -163,7 +211,10 @@ assignWords(const fs::path &inFile, BSDL::SequenceSet<BSDL::Sequence<> > &seqSet
auto &assignment = assignments[i];
size_t length = seq.size() - wordSize;
if (reverse && (length > (wordSize*2)))
length = wordSize * 2;
bool last = false;
bool position;
for (size_t j=0; j<length; ++j)
{
if (last)
......@@ -172,36 +223,15 @@ assignWords(const fs::path &inFile, BSDL::SequenceSet<BSDL::Sequence<> > &seqSet
last = getCompletePrefixSuffix(seq, j, prefix, suffix);
if (last)
{
all.insert(prefix);
auto it = database.find(prefix);
if (it != database.end())
auto acc = database.getDomID(prefix, suffix, position);
if (acc != 0)
{
auto it2 = lower_bound(it->second.begin(), it->second.end(), suffix);
if (it2 != it->second.end())
{
if (it2->suffix.suffix == suffix.suffix)
{
auto tmp = it2->accession;
if (tmp != 0)
assignment.emplace(l-(multiplyer*j), std::pair<unsigned short, bool>(tmp, +it2->suffix.position));
}
else
{
auto tmp = it2->accession;
if (it2 != it->second.begin())
{
--it2;
if (it2->accession == tmp)
assignment.emplace(l-(multiplyer*j), std::pair<unsigned short, bool>(tmp, +it2->suffix.position));
}
}
}
assignment.emplace(l-(multiplyer*j), std::pair<unsigned short, bool>(acc, position));
}
}
}
}
}
cout << all.size() << endl;
}
......@@ -244,12 +274,14 @@ main(int argc, char const *argv[])
{
fs::path inFile, databaseFile;
//int nThreads;
po::options_description allOpts("radiant " + radiantVersion + " (C) 2017 Carsten Kemena\nThis program comes with ABSOLUTELY NO WARRANTY;\n\nAllowed options are displayed below.");
po::options_description general("General options");
general.add_options()
("help,h", "Produces this help message")
("in,i", po::value<fs::path>(&inFile)->required(), "The input file")
("database,d", po::value<fs::path>(&databaseFile)->required(), "The database to use")
//("nThreads,t", po::value<int>(&nThreads)->default_value(1), "Number of threads to use")
;
fs::path outFile;
......@@ -297,10 +329,10 @@ main(int argc, char const *argv[])
return EXIT_FAILURE;
}
//omp_set_num_threads(nThreads);
BSDL::SequenceSet<BSDL::Sequence<> > seqSet;
seqSet.read(inFile);
size_t nameLength = 0;
for (auto &seq : seqSet)
{
......@@ -327,6 +359,7 @@ main(int argc, char const *argv[])
}
vector<multiset<pair<size_t, pair<unsigned short, bool> > > >assignments;
assignWords(databaseFile.string() + "_fwd.db", seqSet, assignments, false);
assignWords(databaseFile.string() + "_rev.db", seqSet, assignments, true);
......@@ -355,10 +388,16 @@ main(int argc, char const *argv[])
exit(1);
}
AlgorithmPack::Output out(outFile);
/*for (size_t max_dist = 3; max_dist < 30; ++max_dist)
for (float min_count = 0; min_count <= 0.5; min_count+=0.05 )
{*/
//fs::path tmp = outFile;
// tmp += to_string(max_dist) + "_" + to_string(min_count) + ".txt";
AlgorithmPack::Output out(outFile);
if (!noHeader)
printHeader(radiantVersion, pfamLike, translate, inFile, databaseFile, out);
for (size_t i = 0; i< seqSet.size(); ++i)
{
auto &assignment = assignments[i];
......@@ -386,6 +425,12 @@ main(int argc, char const *argv[])
}
}
}
//}*/
}
catch (std::bad_alloc& ba)
{
std::cerr << "Error! A memory allocation was caught: Please check if you have enough RAM on this machine\n";
return 1;
}
catch (const std::exception &e)
{
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment