Commit c6d4b13a authored by Carsten Kemena's avatar Carsten Kemena

Merge branch 'develop' into 'master'

Develop

See merge request !1
parents 370d5e57 aaf64d96
Pipeline #669 failed with stage
in 1 minute and 11 seconds
......@@ -2,6 +2,8 @@ GPATH
GRTAGS
GTAGS
/src/version.hpp
# Compiled Object files
*.slo
*.lo
......
......@@ -3,6 +3,7 @@ test library:
only:
- master
script:
- export PATH=/global/group/programs/gcovr/gcovr-3.3/scripts/:$PATH
- git submodule init
- git submodule update
- mkdir build
......@@ -11,4 +12,4 @@ test library:
- make -j 2
- make test
- cd ..
- gcovr -r . -e "libs" -e tests
- gcovr -r $(pwd) -e "libs" -e tests
Version 0.9.1-beta
- simpler handling of domain information file
- change of database construction and thereby much smaller databases
- simpler handling of databases
Version 0.9.0-beta
- first public release
\ No newline at end of file
- first public release
......@@ -2,9 +2,9 @@ cmake_minimum_required(VERSION 2.6)
project (RADIANT C CXX)
SET(MAJOR_VERSION 1)
SET(MINOR_VERSION 0)
SET(PATCH_VERSION 0)
SET(MAJOR_VERSION 0)
SET(MINOR_VERSION 9)
SET(PATCH_VERSION 1-beta)
SET(CMAKE_CXX_FLAGS_COVERAGE
"-g -O0 --coverage -fprofile-arcs -ftest-coverage -fno-inline -fno-inline-small-functions -fno-default-inline"
......@@ -40,6 +40,7 @@ if( NOT CMAKE_BUILD_TYPE )
"Choose the type of build, options are: Debug Coverage Release RelWithDebInfo MinSizeRel." FORCE )
endif()
cmake_policy(SET CMP0012 NEW)
find_package(OpenMP)
if (OPENMP_FOUND)
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
......@@ -55,9 +56,9 @@ endif()
# boost
if (WITH_UNIT_TEST)
FIND_PACKAGE(Boost 1.54 COMPONENTS system program_options iostreams filesystem unit_test_framework REQUIRED)
FIND_PACKAGE(Boost 1.54 COMPONENTS system program_options iostreams filesystem unit_test_framework serialization REQUIRED)
else (WITH_UNIT_TEST)
FIND_PACKAGE(Boost 1.54 COMPONENTS system program_options iostreams filesystem REQUIRED)
FIND_PACKAGE(Boost 1.54 COMPONENTS system program_options iostreams filesystem serialization REQUIRED)
endif (WITH_UNIT_TEST)
INCLUDE_DIRECTORIES(SYSTEM ${Boost_INCLUDE_DIR})
......@@ -72,8 +73,8 @@ ADD_DEFINITIONS( "-DHAS_BOOST" )
find_package (Threads REQUIRED)
configure_file (
"${PROJECT_SOURCE_DIR}/src/Version.hpp.in"
"${PROJECT_SOURCE_DIR}/src/Version.hpp"
"${PROJECT_SOURCE_DIR}/src/version.hpp.in"
"${PROJECT_SOURCE_DIR}/src/version.hpp"
)
################################################
......@@ -93,7 +94,7 @@ target_link_libraries(${radiantDB_exe}
${Boost_LIBRARIES} ${SQLITE3_LIBRARY} ${ZLIB_LIBRARIES} ${CMAKE_THREAD_LIBS_INIT}
)
SET(radiant_src ${PROJECT_SOURCE_DIR}/src/radiant.cpp ${BSDL_src} ${BSDL_PATH}/external_interfaces/domainProgs.cpp ${BSDL_PATH}utility/stringHelpers.cpp
SET(radiant_src ${PROJECT_SOURCE_DIR}/src/radiant.cpp ${PROJECT_SOURCE_DIR}/src/RadiantDB.cpp ${BSDL_src} ${BSDL_PATH}/external_interfaces/domainProgs.cpp ${BSDL_PATH}utility/stringHelpers.cpp
${BSDL_PATH}domain/Domain.cpp ${BSDL_PATH}domain/DomainExt.cpp ${BSDL_PATH}domain/PfamDomain.cpp ${BSDL_PATH}domain/DomainArrangement.cpp
${BSDL_PATH}utility/utility.cpp ${BSDL_PATH}external_interfaces/domainProgs.cpp
)
......
Subproject commit eb77e85ca54c9c06fbc50f004667404e5fd0a763
Subproject commit 342217c4ec244ee94fdc2df8a46b197f592e1952
#include "RadiantDB.hpp"
#include <bitset>
using namespace std;
namespace fs = boost::filesystem;
namespace BSDL = BioSeqDataLib;
void
RadiantDB::read(const fs::path &path)
{
database_.clear();
ifstream fin(path.string(), ios::in | ios::binary);
if (!fin.is_open())
throw std::runtime_error("Error! Could not open database file: " + path.string() + "!");
size_t dbLength = 0;
fin.read((char*)&dbLength, sizeof(size_t));
database_.reserve(dbLength);
for (size_t i=0; i<dbLength; ++i)
{
// read prefix
PrefixType prefix;
fin.read((char*)&prefix, sizeof(PrefixType));
auto it = database_.emplace(std::move(prefix), vector<SuffixAcc>()).first;
// read suffixe and accession from file
size_t numSuffixes;
fin.read((char*)&numSuffixes, sizeof(size_t));
it->second.resize(numSuffixes);
fin.read((char*)&(it->second[0]), numSuffixes*(sizeof(SuffixAcc)));
}
fin.close();
}
/*
void
RadiantDB::build(const fs::path &inPath, bool forward, const fs::path &outPath)
{
std::ofstream fout(outPath, std::ios::out | std::ios::binary);
size_t val= database.size();
fout.write((char*)&val, sizeof(size_t));
for (auto it=database.begin(); it!=database.end(); ++it)
{
size_t val= it->second.size();
fout.write((char*)&it->first, sizeof(PrefixType));
fout.write((char*)&val, sizeof(size_t));
for (auto it2=it->second.begin(); it2!=it->second.end(); ++it2)
{
fout.write((char*)&(it2->first), sizeof(CodedSuffix));
fout.write((char*)&(it2->second), sizeof(unsigned short));
}
}
fout.close();
}*/
short
RadiantDB::char_diff_(SuffixType val1, SuffixType val2)
{
SuffixType val = val1^val2;
short counter = 0;
for (short i=0; i< 10; ++i)
{
for (short j=0; j<3; ++j)
{
short n = i*3+j;
if (val & (1<<n))
{
++counter;
break;
}
}
}
return counter;
}
short
RadiantDB::getDomID(const PrefixType &prefix, const CodedSuffix &suffix, bool &position) const
{
auto it = database_.find(prefix);
// read new suffixes from file if necessary
if (it != database_.end())
{
auto it2 = lower_bound(it->second.begin(), it->second.end(), suffix);
if (it2 != it->second.end())
{
if (it2->suffix.suffix == suffix.suffix)
{
position = it2->suffix.position;
return it2->accession;
}
else
{
auto tmp = it2->accession;
auto x = it2->suffix.suffix;
if (it2 != it->second.begin())
{
--it2;
if (it2->accession == tmp)
{
position = it2->suffix.position;
return tmp;
}
else
{
short threshold = 2;
auto diff1 = char_diff_(x, suffix.suffix);
auto diff2 = char_diff_(it2->suffix.suffix, suffix.suffix);
if ((diff1 < diff2) && (diff1 <= threshold))
return tmp;
else
if (diff2 <= threshold)
return it2->accession;
}
}
}
}
}
return 0;
}
/*
void
RadiantDB::write_(const fs::path &path, const std::unordered_map<PrefixType, std::map<CodedSuffix, unsigned short > > &database) const
{
std::ofstream fout(path.string(), std::ios::out | std::ios::binary);
std::ofstream fout_index(path.string()+".index", std::ios::out | std::ios::binary);
size_t val= database.size();
fout.write((char*)&val, sizeof(size_t));
for (auto it=database.begin(); it!=database.end(); ++it)
{
size_t val= it->second.size();
fout_index.write((char*)&it->first, sizeof(PrefixType));
auto x = fout.tellp();
fout_index.write((char*)&x, sizeof(std::streampos));
fout.write((char*)&val, sizeof(size_t));
for (auto it2=it->second.begin(); it2!=it->second.end(); ++it2)
{
fout.write((char*)&(it2->first), sizeof(CodedSuffix));
fout.write((char*)&(it2->second), sizeof(unsigned short));
}
}
fout.close();
fout_index.close();
}*/
#ifndef RADIANTDB_HPP
#define RADIANTDB_HPP
// standard header
#include <iostream> // for operator<<, cout, ostream, etc
#include <vector> // for vector
#include <unordered_map>
// Boost header
#include <boost/filesystem.hpp>
// BSDL header
#include "../libs/BioSeqDataLib/src/external/Input.hpp"
#include "../libs/BioSeqDataLib/src/external/Output.hpp"
#include "common.hpp"
namespace fs = boost::filesystem;
class RadiantDB
{
private:
fs::path prefix_;
std::unordered_map<PrefixType, std::vector<SuffixAcc> > database_;
SuffixAcc noSuffix_;
std::ifstream dbFile_;
void
write_(const fs::path &path, const std::unordered_map<PrefixType, std::map<CodedSuffix, unsigned short > > &database) const;
void
close_();
short
static char_diff_(SuffixType val1, SuffixType val2);
public:
RadiantDB()
{}
RadiantDB(const fs::path &path)
{
read(path);
}
~RadiantDB()
{
dbFile_.close();
}
void
read(const fs::path &path);
/*void
build(const fs::path &inPath, bool forward, const fs::path &outPath);
*/
short
getDomID(const PrefixType &prefix, const CodedSuffix &suffix, bool &position) const;
};
#endif
......@@ -21,52 +21,14 @@
namespace BSDL = BioSeqDataLib;
typedef uint64_t SuffixType;
typedef uint32_t PrefixType;
const short suffixLength = 12;
const short alphabetBitSize = 5;
constexpr SuffixType suffixShift = suffixLength * alphabetBitSize;
struct CodedSuffix {
SuffixType suffix;
bool position;
CodedSuffix (): suffix(0), position(false)
{}
CodedSuffix (SuffixType s, bool p): suffix(s), position(p)
{}
} __attribute__((packed));
struct SuffixAcc
{
CodedSuffix suffix;
unsigned short accession;
SuffixAcc(const CodedSuffix &s, unsigned short a) : suffix(s), accession(a)
{}
SuffixAcc() : suffix(0, 0), accession(0)
{}
} __attribute__((packed));
bool operator<(const CodedSuffix &l, const CodedSuffix &r)
{
return l.suffix < r.suffix;
}
bool operator<(const SuffixAcc &l, CodedSuffix r) {
return l.suffix.suffix < r.suffix;
}
typedef uint16_t PrefixType;
typedef uint32_t SuffixType;
const short ALPHABET_BIT_NUM = 3;
const short PREFIX_SIZE = 5;
const short SUFFIX_SIZE = 10;
constexpr short WORD_SIZE = PREFIX_SIZE + SUFFIX_SIZE;
constexpr PrefixType PREFIX_SHIFT = PREFIX_SIZE * ALPHABET_BIT_NUM;
constexpr SuffixType SUFFIX_SHIFT = SUFFIX_SIZE * ALPHABET_BIT_NUM;
/**
......@@ -75,7 +37,7 @@ bool operator<(const SuffixAcc &l, CodedSuffix r) {
* ATSPGNDEQKRHYWFMLIVC
*/
// aa 20
static const std::unordered_map<char, PrefixType> alphabet2bit =
static const std::unordered_map<char, PrefixType> alphabet2bit20 =
{
{'A', 0}, {'T', 1}, {'S', 2}, {'P', 3}, {'G', 4}, {'N', 5},
{'D', 6}, {'E', 7}, {'Q', 8}, {'K', 9}, {'R', 10}, {'H', 11},
......@@ -113,7 +75,44 @@ static const std::unordered_map<char, PrefixType> alphabet2bit8 =
{'V', 7}, {'C', 7}
};
static const std::unordered_map<char, PrefixType> ALPHABET_2_BIT = alphabet2bit8;
struct CodedSuffix {
SuffixType suffix;
bool position;
CodedSuffix (): suffix(0), position(false)
{}
CodedSuffix (SuffixType s, bool p): suffix(s), position(p)
{}
} __attribute__((packed));
struct SuffixAcc
{
CodedSuffix suffix;
unsigned short accession;
SuffixAcc(const CodedSuffix &s, unsigned short a) : suffix(s), accession(a)
{}
SuffixAcc() : suffix(0, 0), accession(0)
{}
} __attribute__((packed));
inline bool operator<(const CodedSuffix &l, const CodedSuffix &r)
{
return l.suffix < r.suffix;
}
inline bool operator<(const SuffixAcc &l, CodedSuffix r) {
return l.suffix.suffix < r.suffix;
}
/**
......@@ -131,11 +130,11 @@ getCompletePrefixSuffix(const S &seq, size_t i, PrefixType &prefix, CodedSuffix
{
// get first prefix
prefix = 0;
auto itEnd = alphabet2bit.end();
for (size_t j=i; j<i+6; ++j)
auto itEnd = ALPHABET_2_BIT.end();
for (size_t j=i; j<i+PREFIX_SIZE; ++j)
{
prefix <<= 5;
auto it = alphabet2bit.find(seq[j]);
prefix <<= ALPHABET_BIT_NUM;
auto it = ALPHABET_2_BIT.find(seq[j]);
if (it != itEnd)
prefix |= it->second;
else
......@@ -145,10 +144,10 @@ getCompletePrefixSuffix(const S &seq, size_t i, PrefixType &prefix, CodedSuffix
// get first suffix
suffix.suffix=0;
for (size_t j=i+6; j<i+18; ++j)
for (size_t j=i+PREFIX_SIZE; j<i+WORD_SIZE; ++j)
{
suffix.suffix <<= 5;
auto it = alphabet2bit.find(seq[j]);
suffix.suffix <<= ALPHABET_BIT_NUM;
auto it = ALPHABET_2_BIT.find(seq[j]);
if (it != itEnd)
suffix.suffix |= it->second;
else
......@@ -173,20 +172,19 @@ bool
getNextPrefixSuffix(const S &seq, size_t i, PrefixType &prefix, CodedSuffix &suffix)
{
// check if new character is ok
auto itSuffix = alphabet2bit.find(seq[i+17]);
if (itSuffix == alphabet2bit.end())
auto itSuffix = ALPHABET_2_BIT.find(seq[i+WORD_SIZE-1]);
if (itSuffix == ALPHABET_2_BIT.end())
return false;
static const PrefixType clearPrefix = ~(PrefixType(3) << 30);
static const SuffixType clearSuffix = ~(SuffixType(15) << suffixShift);
static const PrefixType clearPrefix = ~((uint64_t)-1 << PREFIX_SHIFT);
static const SuffixType clearSuffix = ~((uint64_t)-1 << SUFFIX_SHIFT);
// update prefix
auto itPrefix = alphabet2bit.find(seq[i+5]);
prefix <<= 5;
auto itPrefix = ALPHABET_2_BIT.find(seq[i+PREFIX_SIZE-1]);
prefix <<= ALPHABET_BIT_NUM;
prefix &= clearPrefix;
prefix |= itPrefix->second;
// update suffix
suffix.suffix <<= 5;
suffix.suffix <<= ALPHABET_BIT_NUM;
suffix.suffix &= clearSuffix;
suffix.suffix |= itSuffix->second;
return true;
......
......@@ -11,8 +11,9 @@ namespace fs = boost::filesystem;
* @param outFile The file to write the database into
*/
void
write2file(const std::unordered_map<PrefixType, std::map<CodedSuffix, unsigned short > > &database, const std::string &outFile)
write2file(std::unordered_map<PrefixType, std::map<CodedSuffix, unsigned short > > &database, const std::string &outFile)
{
std::ofstream fout(outFile, std::ios::out | std::ios::binary);
size_t val= database.size();
fout.write((char*)&val, sizeof(size_t));
......@@ -28,6 +29,28 @@ write2file(const std::unordered_map<PrefixType, std::map<CodedSuffix, unsigned s
}
}
fout.close();
/*
std::ofstream fout(outFile, std::ios::out | std::ios::binary);
std::ofstream fout_index(outFile+".index", std::ios::out | std::ios::binary);
size_t val= database.size();
fout.write((char*)&val, sizeof(size_t));
for (auto it=database.begin(); it!=database.end(); ++it)
{
size_t val= it->second.size();
fout_index.write((char*)&it->first, sizeof(PrefixType));
auto x = fout.tellp();
fout_index.write((char*)&x, sizeof(std::streampos));
fout.write((char*)&val, sizeof(size_t));
for (auto it2=it->second.begin(); it2!=it->second.end(); ++it2)
{
fout.write((char*)&(it2->first), sizeof(CodedSuffix));
fout.write((char*)&(it2->second), sizeof(unsigned short));
}
}
fout.close();
fout_index.close();*/
}
......@@ -38,7 +61,7 @@ main(int argc, char const *argv[])
{
fs::path inFile, outFile;
size_t nThreads;
std::string radiantDBVersion = "0.9.1-beta";
std::string radiantDBVersion(std::string(STR(MAJOR_VERSION)) + "." + std::string(STR(MINOR_VERSION)) + "." + std::string(STR(PATCH_VERSION)) );
po::options_description allOpts("makeRadiantDB " + radiantDBVersion + " (C) 2017 Carsten Kemena\nThis program comes with ABSOLUTELY NO WARRANTY;\n\nAllowed options are displayed below.");
po::options_description general("General options");
general.add_options()
......
......@@ -14,9 +14,10 @@
// BSDL header
#include "../libs/BioSeqDataLib/src/external/Input.hpp"
#include "../libs/BioSeqDataLib/src/external/Output.hpp"
#include "../libs/BioSeqDataLib/src/utility/stringHelpers.hpp"
#include "common.hpp"
#include "version.hpp"
namespace po = boost::program_options;
namespace fs = boost::filesystem;
......@@ -119,9 +120,29 @@ cleanSuffixe(D &db)
db.erase(prevIt);
}
// if only one element left its most liklely not very useful and can be removed
if (db.size() == 1)
db.clear();
// test cleaning
if (db.size() > 1)
{
auto prevIt = db.begin();
auto nextIt = ++db.begin();
auto currIt = nextIt++;
while (nextIt != endIt)
{
if ((nextIt->second == prevIt->second) && (currIt->second == 0))
currIt = db.erase(currIt);
else
{
++currIt;
++prevIt;
}
++nextIt;
}
}
}
template<typename S, typename D>
......@@ -134,6 +155,8 @@ void splitSequence(S &seq, D &db, unsigned int windowSize, bool reverse)
bool last = false;
size_t k = 0;
size_t limit = (seq.size() >= windowSize) ? seq.size() - windowSize + 1: 0 ;
if (seq.size() > (windowSize*2))
limit = seq.size()/2;
while (k < limit)
{
if (last)
......@@ -183,32 +206,87 @@ turnFile2db(fs::path &inFile, D &database, bool reverse)
{
std::vector<std::pair<size_t,size_t> > families;
BSDL::SequenceSet<BSDL::Sequence<> > seqSet;
seqSet.read(inFile);
//Stockholm
//#=GF AC PF10417.8
//A0A0L0HRL4_SPIPN/171-210 ...........SLQLGDRR..KV..ATPADWtGK.................GDEV.IIHN.G.V.S.N.E.E.A..A------rlfpg.....................
/*
AlgorithmPack::Input sequences(inFile);
std::string line;
std::smatch m;
std::regex re ("PF([0-9]{5})\\.");
std::string comment = seqSet[0].comment();
std::regex_search (comment,m,re);
std::string accession = m[1].str();
seqSet[0].name(accession);
std::string accession = "";
std::string gap_chars = ".-";
size_t start = 0;
for (size_t i=1; i<seqSet.size(); ++i)
while (getline(sequences, line))
{
//std::cout << line << std::endl;
// line is alignment
if ((line[0] != '#') && (line[0] != '/'))
{
auto tokens = BSDL::split(line, " ");
for (auto gap_char : gap_chars)
tokens[1].erase (std::remove(tokens[1].begin(), tokens[1].end(), gap_char), tokens[1].end());
std::transform(tokens[1].begin(), tokens[1].end(),tokens[1].begin(), ::toupper);
seqSet.emplace_back(accession, tokens[1], "", "");
}
else
{
if (line.compare(0, 7, "#=GF AC") == 0)
{
std::regex_search (line, m, re);
accession = m[1].str();
if (seqSet.size() != 0)
{
families.emplace_back(start, seqSet.size());
start = seqSet.size();
}
}
}
}
families.emplace_back(start, seqSet.size());*/
// *** BEGIN FASTA ***///
// rename sequences
std::smatch m;
std::regex re ("PF([0-9]{5})\\.");
seqSet.read(inFile);
for (size_t i=0; i<seqSet.size(); ++i)
{
std::string comment = seqSet[i].comment();
std::regex_search (comment,m,re);
std::string tmp = m[1].str();
seqSet[i].name(tmp);
if (tmp != accession)
}
// sort sequnces just in case (PfamA.fasta should be sorted)
sort(seqSet.begin(), seqSet.end(),
[](const BSDL::Sequence<> & a, const BSDL::Sequence<> & b)
{
return a.name() < b.name();
}
);
std::string accession = seqSet[0].name();
size_t start = 0;
for (size_t i=1; i<seqSet.size(); ++i)
{
//std::string comment = seqSet[i].comment();
//std::regex_search (comment,m,re);
//std::string tmp = m[1].str();
//seqSet[i].name(tmp);
if (seqSet[i].name() != accession)
{
accession = std::move(tmp);
accession = seqSet[i].name();
families.emplace_back(start, i);
start = i;
}
}
families.emplace_back(start, seqSet.size());
// *** END FASTA ***///
std::cout << "Number of families: " << families.size() << std::endl;
const int windowSize = 18;
size_t famNumber = 0;
std::vector<D> threadDBs;
......@@ -295,61 +373,6 @@ turnFile2db(fs::path &inFile, D &database, bool reverse)
#pragma omp task firstprivate (it)
{
cleanSuffixe(it->second);
/*
auto it2 = it->second.begin();
auto it2End = it->second.end();
// remove multiple domain matching words
while (it2 != it2End)
{
if (it2->second == 0)
it2 = it->second.erase( it2 );
else
++it2;
}
if (it->second.size() > 1)
{
auto itCurr = it->second.begin();
auto itPrev = itCurr++;
while (itCurr != it2End)
{
if (itPrev->second != itCurr->second)
{
itPrev = it->second.erase(itPrev);
++itCurr;
}
else
break;
}
}
if (it->second.size() > 2)
{
auto itPrev = it->second.begin();
auto itCurr = itPrev;
++itCurr;
auto itNext = itCurr;
++itNext;
while (itNext != it2End)
{
if (((itCurr->second != itPrev->second) && (itPrev->second != itNext->second) && (itCurr->second != itNext->second)) || ((itCurr->second == itPrev->second) && (itCurr->second == itNext->second)))
itCurr = it->second.erase(itCurr);
else
{
++itCurr;
++itPrev;
}
++itNext;
}
//if (itPrev->second != itCurr->second)
// it->second.erase(itCurr);
}
if (it->second.size() == 1)
it->second.erase(it->second.begin());
*/
}
}
}
......@@ -368,7 +391,8 @@ turnFile2db(fs::path &inFile, D &database, bool reverse)
++it;
}
}
}
}
}
#endif
This diff is collapsed.
......@@ -14,7 +14,7 @@ BOOST_AUTO_TEST_SUITE(Encode_Test)