Commit fdd8d6e4 authored by Carsten Kemena's avatar Carsten Kemena

implementing an index based DB

parent 76d1f8d1
......@@ -3,6 +3,7 @@ test library:
only:
- master
script:
- export PATH=/global/group/programs/gcovr/gcovr-3.3/scripts/:$PATH
- git submodule init
- git submodule update
- mkdir build
......@@ -11,4 +12,4 @@ test library:
- make -j 2
- make test
- cd ..
- gcovr -r . -e "libs" -e tests
- gcovr -r $(pwd) -e "libs" -e tests
......@@ -55,9 +55,9 @@ endif()
# boost
if (WITH_UNIT_TEST)
FIND_PACKAGE(Boost 1.54 COMPONENTS system program_options iostreams filesystem unit_test_framework REQUIRED)
FIND_PACKAGE(Boost 1.60 COMPONENTS system program_options iostreams filesystem unit_test_framework serialization REQUIRED)
else (WITH_UNIT_TEST)
FIND_PACKAGE(Boost 1.54 COMPONENTS system program_options iostreams filesystem REQUIRED)
FIND_PACKAGE(Boost 1.60 COMPONENTS system program_options iostreams filesystem serialization REQUIRED)
endif (WITH_UNIT_TEST)
INCLUDE_DIRECTORIES(SYSTEM ${Boost_INCLUDE_DIR})
......@@ -93,7 +93,7 @@ target_link_libraries(${radiantDB_exe}
${Boost_LIBRARIES} ${SQLITE3_LIBRARY} ${ZLIB_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}
)
SET(radiant_src ${PROJECT_SOURCE_DIR}/src/radiant.cpp ${BSDL_src} ${BSDL_PATH}/external_interfaces/domainProgs.cpp ${BSDL_PATH}utility/stringHelpers.cpp
SET(radiant_src ${PROJECT_SOURCE_DIR}/src/radiant.cpp ${PROJECT_SOURCE_DIR}/src/RadiantDB.cpp ${BSDL_src} ${BSDL_PATH}/external_interfaces/domainProgs.cpp ${BSDL_PATH}utility/stringHelpers.cpp
${BSDL_PATH}domain/Domain.cpp ${BSDL_PATH}domain/DomainExt.cpp ${BSDL_PATH}domain/PfamDomain.cpp ${BSDL_PATH}domain/DomainArrangement.cpp
${BSDL_PATH}utility/utility.cpp ${BSDL_PATH}external_interfaces/domainProgs.cpp
)
......
#include "RadiantDB.hpp"
using namespace std;
namespace fs = boost::filesystem;
namespace BSDL = BioSeqDataLib;
/**
 * \brief Opens a database file together with its index file.
 *
 * Loads every (prefix, stream position) record from "<path>.index" into the
 * in-memory index and opens the data file at \p path so that suffix buckets
 * can be read on demand. Any state left over from a previously opened
 * database (index, cached buckets, open stream) is discarded first.
 *
 * The index records are read as raw bytes (sizeof(PrefixType) followed by
 * sizeof(std::streampos)), so the files are only readable on a platform with
 * the same object layout as the one that wrote them.
 *
 * @param path Path of the database file; the index is expected at path + ".index".
 * @throws std::ios_base::failure if the index file or the database file cannot be opened.
 */
void
RadiantDB::open(const fs::path &path)
{
    // forget everything that belongs to a previously opened database
    index_.clear();
    database_.clear();
    if (dbFile_.is_open())
        dbFile_.close();
    dbFile_.clear();

    const std::string dbName = path.string();
    const std::string indexName = dbName + ".index";

    std::ifstream in(indexName, std::ios::in | std::ios::binary);
    if (!in)
        throw std::ios_base::failure("Could not open index file: " + indexName);

    dbFile_.open(dbName, std::ios::in | std::ios::binary);
    if (!dbFile_)
        throw std::ios_base::failure("Could not open database file: " + dbName);

    PrefixType prefix;
    std::streampos pos;
    // A record is only accepted when both of its fields were read completely;
    // a truncated trailing record is dropped instead of storing a garbage position.
    while (in.read(reinterpret_cast<char*>(&prefix), sizeof(PrefixType)) &&
           in.read(reinterpret_cast<char*>(&pos), sizeof(std::streampos)))
    {
        index_[prefix] = pos;
    }
}
/**
 * \brief Placeholder for building a database; currently does nothing.
 *
 * @param path    Not used yet.
 * @param forward Not used yet.
 */
void RadiantDB::build(const fs::path &path, bool forward)
{
    // intentionally empty: neither argument is consumed so far
    (void)path;
    (void)forward;
}
int
RadiantDB::getDomID(const PrefixType &prefix, const CodedSuffix &suffix) const
{
auto it = database_.find(prefix);
// read new suffixes from file if necessary
if (it == database_.end())
{
auto y = index_.find(prefix);
if (y != index_.end())
{
it = database_.emplace(prefix, std::vector<SuffixAcc>()).first;
dbFile_.seekg(y->second);
// read suffixe and accession from file
size_t numSuffixes;
dbFile_.read((char*)&numSuffixes, sizeof(size_t));
it->second.resize(numSuffixes);
dbFile_.read((char*)&(it->second[0]), numSuffixes*(sizeof(SuffixAcc)));
}
}
// perform search if necessary
if (it != database_.end())
{
auto it2 = lower_bound(it->second.begin(), it->second.end(), suffix);
if (it2 != it->second.end())
{
if (it2->suffix.suffix == suffix.suffix)
{
return it2->accession;
//auto tmp = it2->accession;
//if (tmp != 0)
// return tmp
}
else
{
auto tmp = it2->accession;
if (it2 != it->second.begin())
{
--it2;
if (it2->accession == tmp)
return tmp;
//assignment.emplace(l-(multiplyer*j), std::pair<unsigned short, bool>(tmp, +it2->suffix.position));
}
}
}
}
return 0;
}
/**
 * \brief Writes a database and the matching index to disk.
 *
 * The data file at \p path starts with the number of prefixes. For every
 * prefix it then holds the number of suffixes followed by the
 * (CodedSuffix, unsigned short) pairs in the iteration order of the map.
 * The index file at path + ".index" receives one record per prefix: the
 * prefix itself and the stream position at which its bucket starts in the
 * data file. All values are written as raw bytes.
 *
 * @param path     Path of the data file to create.
 * @param database The database content to write.
 */
void
RadiantDB::write_(const fs::path &path, const std::unordered_map<PrefixType, std::map<CodedSuffix, unsigned short > > &database) const
{
    const std::string dbName = path.string();
    std::ofstream dataOut(dbName, std::ios::out | std::ios::binary);
    std::ofstream indexOut(dbName + ".index", std::ios::out | std::ios::binary);

    // file header: total number of prefixes
    const size_t numPrefixes = database.size();
    dataOut.write(reinterpret_cast<const char*>(&numPrefixes), sizeof(size_t));

    for (const auto &bucket : database)
    {
        // index record: prefix plus the offset at which its bucket begins
        indexOut.write(reinterpret_cast<const char*>(&bucket.first), sizeof(PrefixType));
        const auto bucketStart = dataOut.tellp();
        indexOut.write(reinterpret_cast<const char*>(&bucketStart), sizeof(std::streampos));

        // bucket: element count followed by the suffix/accession pairs
        const size_t numSuffixes = bucket.second.size();
        dataOut.write(reinterpret_cast<const char*>(&numSuffixes), sizeof(size_t));
        for (const auto &entry : bucket.second)
        {
            dataOut.write(reinterpret_cast<const char*>(&entry.first), sizeof(CodedSuffix));
            dataOut.write(reinterpret_cast<const char*>(&entry.second), sizeof(unsigned short));
        }
    }

    dataOut.close();
    indexOut.close();
}
#ifndef RADIANTDB_HPP
#define RADIANTDB_HPP
// standard header
#include <iostream> // for operator<<, cout, ostream, etc
#include <vector> // for vector
#include <unordered_map>
// Boost header
#include <boost/filesystem.hpp>
// BSDL header
#include "../libs/BioSeqDataLib/src/external/Input.hpp"
#include "../libs/BioSeqDataLib/src/external/Output.hpp"
#include "common.hpp"
namespace fs = boost::filesystem;
/**
 * \brief Suffix database that is backed by a data file and an index file.
 *
 * The index (prefix -> position in the data file) is held in memory; the
 * suffix bucket of a prefix is read from the data file when it is first
 * requested and cached from then on.
 */
class RadiantDB
{
public:
    /// Creates a database object that is not connected to any file.
    RadiantDB() {}

    /// Creates a database object and opens the files belonging to \p path.
    RadiantDB(const fs::path &path)
    {
        open(path);
    }

    ~RadiantDB() = default;

    /// Loads the index stored next to \p path and opens the data file.
    void open(const fs::path &path);

    /// Declared for building a database; see the definition for its current state.
    void build(const fs::path &path, bool forward=true);

    /// Returns the accession assigned to the prefix/suffix pair, or 0 if there is none.
    int getDomID(const PrefixType &prefix, const CodedSuffix &suffix) const;

private:
    fs::path prefix_;                                                           ///< not referenced by the member functions of this class
    std::unordered_map<PrefixType, std::streampos> index_;                      ///< prefix -> start of its bucket in the data file
    mutable std::unordered_map<PrefixType, std::vector<SuffixAcc> > database_;  ///< buckets already read from file (filled by getDomID)
    SuffixAcc noSuffix_;                                                        ///< not referenced by the member functions of this class
    mutable std::ifstream dbFile_;                                              ///< data file the buckets are read from

    /// Writes \p database to \p path and its index to path + ".index".
    void write_(const fs::path &path, const std::unordered_map<PrefixType, std::map<CodedSuffix, unsigned short > > &database) const;
};
#endif
......@@ -40,6 +40,13 @@ struct CodedSuffix {
CodedSuffix (SuffixType s, bool p): suffix(s), position(p)
{}
/// Serialization hook: hands the suffix and then the position to the archive.
template<class Archive>
void serialize(Archive &ar, __attribute__ ((unused)) const unsigned int version)
{
    ar & suffix & position;
}
} __attribute__((packed));
......@@ -55,16 +62,23 @@ struct SuffixAcc
SuffixAcc() : suffix(0, 0), accession(0)
{}
/// Serialization hook: hands the coded suffix and then the accession to the archive.
template<class Archive>
void serialize(Archive &ar, __attribute__ ((unused)) const unsigned int version)
{
    ar & suffix & accession;
}
} __attribute__((packed));
bool operator<(const CodedSuffix &l, const CodedSuffix &r)
/// Orders two coded suffixes by their suffix member only; the position member is not compared.
/// Declared inline, which allows the definition to appear in several translation units.
inline bool operator<(const CodedSuffix &l, const CodedSuffix &r)
{
return l.suffix < r.suffix;
}
bool operator<(const SuffixAcc &l, CodedSuffix r) {
/// Compares the suffix stored in a SuffixAcc entry with a bare coded suffix.
/// Only the suffix values take part in the comparison; accession and position are ignored.
inline bool operator<(const SuffixAcc &l, CodedSuffix r) {
return l.suffix.suffix < r.suffix;
}
......
......@@ -4,6 +4,13 @@ using namespace std;
namespace po = boost::program_options;
namespace fs = boost::filesystem;
#include <boost/archive/binary_iarchive.hpp>
#include <boost/archive/binary_oarchive.hpp>
#include <boost/serialization/serialization.hpp>
#include <boost/serialization/map.hpp>
#include <boost/serialization/unordered_map.hpp>
#include <boost/serialization/string.hpp>
#include <boost/serialization/vector.hpp>
/**
* \brief Write database to file
......@@ -11,8 +18,25 @@ namespace fs = boost::filesystem;
* @param outFile The file to write the database into
*/
void
write2file(const std::unordered_map<PrefixType, std::map<CodedSuffix, unsigned short > > &database, const std::string &outFile)
write2file(std::unordered_map<PrefixType, std::map<CodedSuffix, unsigned short > > &database, const std::string &outFile)
{
/*std::unordered_map<PrefixType, vector<SuffixAcc> > database_f;
for (auto it=database.begin(); it!=database.end(); ++it)
{
database_f[it->first] = vector<SuffixAcc>();
auto &vec = database_f[it->first];
for (const auto &elem : it->second)
vec.emplace_back(std::move(elem.first), std::move(elem.second));
it->second.clear();
}
std::ofstream ofs(outFile);
boost::archive::binary_oarchive oarch(ofs);
oarch << database_f;
*/
/*
std::ofstream fout(outFile, std::ios::out | std::ios::binary);
size_t val= database.size();
fout.write((char*)&val, sizeof(size_t));
......@@ -28,6 +52,28 @@ write2file(const std::unordered_map<PrefixType, std::map<CodedSuffix, unsigned s
}
}
fout.close();
*/
std::ofstream fout(outFile, std::ios::out | std::ios::binary);
std::ofstream fout_index(outFile+".index", std::ios::out | std::ios::binary);
size_t val= database.size();
fout.write((char*)&val, sizeof(size_t));
for (auto it=database.begin(); it!=database.end(); ++it)
{
size_t val= it->second.size();
fout_index.write((char*)&it->first, sizeof(PrefixType));
auto x = fout.tellp();
fout_index.write((char*)&x, sizeof(std::streampos));
fout.write((char*)&val, sizeof(size_t));
for (auto it2=it->second.begin(); it2!=it->second.end(); ++it2)
{
fout.write((char*)&(it2->first), sizeof(CodedSuffix));
fout.write((char*)&(it2->second), sizeof(unsigned short));
}
}
fout.close();
fout_index.close();
}
......
......@@ -120,9 +120,29 @@ cleanSuffixe(D &db)
db.erase(prevIt);
}
// if only one element is left, it is most likely not very useful and can be removed
if (db.size() == 1)
db.clear();
// test cleaning
if (db.size() > 1)
{
auto prevIt = db.begin();
auto nextIt = ++db.begin();
auto currIt = nextIt++;
while (nextIt != endIt)
{
if ((nextIt->second == prevIt->second) && (currIt->second == 0))
currIt = db.erase(currIt);
else
{
++currIt;
++prevIt;
}
++nextIt;
}
}
}
template<typename S, typename D>
......@@ -296,61 +316,6 @@ turnFile2db(fs::path &inFile, D &database, bool reverse)
#pragma omp task firstprivate (it)
{
cleanSuffixe(it->second);
/*
auto it2 = it->second.begin();
auto it2End = it->second.end();
// remove multiple domain matching words
while (it2 != it2End)
{
if (it2->second == 0)
it2 = it->second.erase( it2 );
else
++it2;
}
if (it->second.size() > 1)
{
auto itCurr = it->second.begin();
auto itPrev = itCurr++;
while (itCurr != it2End)
{
if (itPrev->second != itCurr->second)
{
itPrev = it->second.erase(itPrev);
++itCurr;
}
else
break;
}
}
if (it->second.size() > 2)
{
auto itPrev = it->second.begin();
auto itCurr = itPrev;
++itCurr;
auto itNext = itCurr;
++itNext;
while (itNext != it2End)
{
if (((itCurr->second != itPrev->second) && (itPrev->second != itNext->second) && (itCurr->second != itNext->second)) || ((itCurr->second == itPrev->second) && (itCurr->second == itNext->second)))
itCurr = it->second.erase(itCurr);
else
{
++itCurr;
++itPrev;
}
++itNext;
}
//if (itPrev->second != itCurr->second)
// it->second.erase(itCurr);
}
if (it->second.size() == 1)
it->second.erase(it->second.begin());
*/
}
}
}
......@@ -370,6 +335,7 @@ turnFile2db(fs::path &inFile, D &database, bool reverse)
}
}
}
}
#endif
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment