Commit 9cac7321 authored by Carsten Kemena's avatar Carsten Kemena

change to have better control

parent 2951cbd5
......@@ -24,11 +24,58 @@ namespace BSDL = BioSeqDataLib;
// 64-bit unsigned integer used for the packed suffix code of a word.
typedef uint64_t SuffixType;
// 32-bit unsigned integer used for the packed prefix code of a word; also the
// value type of the alphabet lookup tables.
typedef uint32_t PrefixType;
// Suffix length in letters (same value as SUFFIX_SIZE).
const short suffixLength = 12;
// Bits used per encoded letter (same value as ALPHABET_BIT_NUM).
const short alphabetBitSize = 5;
// Bit width of a packed suffix: suffixLength * alphabetBitSize = 60.
constexpr SuffixType suffixShift = suffixLength * alphabetBitSize;
/**
* Conversion of an amino acid into a binary representation. The binary representation has been chosen
* to reflect the evolutionary alphabet described in UProC:
* ATSPGNDEQKRHYWFMLIVC
*/
// aa 20: full alphabet. Each of the 20 amino-acid letters gets its own code,
// 0..19, assigned in the order ATSPGNDEQKRHYWFMLIVC. Letters not listed here
// are absent from the map, so find() returns end() for them.
static const std::unordered_map<char, PrefixType> alphabet2bit20 =
{
{'A', 0}, {'T', 1}, {'S', 2}, {'P', 3}, {'G', 4}, {'N', 5},
{'D', 6}, {'E', 7}, {'Q', 8}, {'K', 9}, {'R', 10}, {'H', 11},
{'Y', 12}, {'W', 13}, {'F', 14}, {'M', 15}, {'L', 16}, {'I', 17},
{'V', 18}, {'C', 19}
};
// aa 15: reduced alphabet with 15 distinct codes (0..14). Letters sharing a
// code: K/R -> 9, Y/F -> 11, M/L/I/V -> 13. All other letters keep a code of
// their own.
static const std::unordered_map<char, PrefixType> alphabet2bit15 =
{
{'A', 0}, {'T', 1}, {'S', 2}, {'P', 3}, {'G', 4}, {'N', 5},
{'D', 6}, {'E', 7}, {'Q', 8}, {'K', 9}, {'R', 9}, {'H', 10},
{'Y', 11}, {'W', 12}, {'F', 11}, {'M', 13}, {'L', 13}, {'I', 13},
{'V', 13}, {'C', 14}
};
// aa 10: reduced alphabet with 10 distinct codes (0..9). Letters sharing a
// code: T/S -> 1, N/D/E/Q -> 4, K/R -> 5, Y/W/F -> 7, M/L/I/V -> 8.
// A, P, G, H and C each keep a code of their own.
static const std::unordered_map<char, PrefixType> alphabet2bit10 =
{
{'A', 0}, {'T', 1}, {'S', 1}, {'P', 2}, {'G', 3}, {'N', 4},
{'D', 4}, {'E', 4}, {'Q', 4}, {'K', 5}, {'R', 5}, {'H', 6},
{'Y', 7}, {'W', 7}, {'F', 7}, {'M', 8}, {'L', 8}, {'I', 8},
{'V', 8}, {'C', 9}
};
// aa 8: reduced alphabet with 8 distinct codes (0..7). Letters sharing a
// code: A/G -> 0, T/S -> 1, N/D/E/Q -> 3, K/R -> 4, Y/W/F -> 6,
// M/L/I/V/C -> 7. P and H each keep a code of their own.
static const std::unordered_map<char, PrefixType> alphabet2bit8 =
{
{'A', 0}, {'T', 1}, {'S', 1}, {'P', 2}, {'G', 0}, {'N', 3},
{'D', 3}, {'E', 3}, {'Q', 3}, {'K', 4}, {'R', 4}, {'H', 5},
{'Y', 6}, {'W', 6}, {'F', 6}, {'M', 7}, {'L', 7}, {'I', 7},
{'V', 7}, {'C', 7}
};
// Alphabet table used by the prefix/suffix encoding functions. It is
// copy-initialized from alphabet2bit20, so it is a separate map object.
static const std::unordered_map<char, PrefixType> ALPHABET_2_BIT = alphabet2bit20;
// Number of bits each letter code is shifted by when packing a word. 5 bits
// cover codes 0..31, enough for the 20-letter table.
// NOTE(review): this value is not derived from ALPHABET_2_BIT; presumably it
// must be adjusted by hand if a different table is selected -- confirm.
const short ALPHABET_BIT_NUM = 5;
// Number of letters packed into the prefix.
const short PREFIX_SIZE = 6;
// Number of letters packed into the suffix.
const short SUFFIX_SIZE = 12;
// Total word length in letters: PREFIX_SIZE + SUFFIX_SIZE = 18.
constexpr short WORD_SIZE = PREFIX_SIZE + SUFFIX_SIZE;
// Bit width of a packed prefix: 6 * 5 = 30, which fits the 32-bit PrefixType.
constexpr PrefixType PREFIX_SHIFT = PREFIX_SIZE * ALPHABET_BIT_NUM;
// Bit width of a packed suffix: 12 * 5 = 60, which fits the 64-bit SuffixType.
constexpr SuffixType SUFFIX_SHIFT = SUFFIX_SIZE * ALPHABET_BIT_NUM;
struct CodedSuffix {
SuffixType suffix;
......@@ -83,53 +130,6 @@ inline bool operator<(const SuffixAcc &l, CodedSuffix r) {
}
/**
* Conversion of an amino acid into a binary representation. The binary representation has been chosen
* to reflect the evolutionary alphabet described in UProC:
* ATSPGNDEQKRHYWFMLIVC
*/
// aa 20: full alphabet. Each of the 20 amino-acid letters gets its own code,
// 0..19, assigned in the order ATSPGNDEQKRHYWFMLIVC. Letters not listed here
// are absent from the map, so find() returns end() for them.
static const std::unordered_map<char, PrefixType> alphabet2bit =
{
{'A', 0}, {'T', 1}, {'S', 2}, {'P', 3}, {'G', 4}, {'N', 5},
{'D', 6}, {'E', 7}, {'Q', 8}, {'K', 9}, {'R', 10}, {'H', 11},
{'Y', 12}, {'W', 13}, {'F', 14}, {'M', 15}, {'L', 16}, {'I', 17},
{'V', 18}, {'C', 19}
};
// aa 15: reduced alphabet with 15 distinct codes (0..14). Letters sharing a
// code: K/R -> 9, Y/F -> 11, M/L/I/V -> 13. All other letters keep a code of
// their own.
static const std::unordered_map<char, PrefixType> alphabet2bit15 =
{
{'A', 0}, {'T', 1}, {'S', 2}, {'P', 3}, {'G', 4}, {'N', 5},
{'D', 6}, {'E', 7}, {'Q', 8}, {'K', 9}, {'R', 9}, {'H', 10},
{'Y', 11}, {'W', 12}, {'F', 11}, {'M', 13}, {'L', 13}, {'I', 13},
{'V', 13}, {'C', 14}
};
// aa 10: reduced alphabet with 10 distinct codes (0..9). Letters sharing a
// code: T/S -> 1, N/D/E/Q -> 4, K/R -> 5, Y/W/F -> 7, M/L/I/V -> 8.
// A, P, G, H and C each keep a code of their own.
static const std::unordered_map<char, PrefixType> alphabet2bit10 =
{
{'A', 0}, {'T', 1}, {'S', 1}, {'P', 2}, {'G', 3}, {'N', 4},
{'D', 4}, {'E', 4}, {'Q', 4}, {'K', 5}, {'R', 5}, {'H', 6},
{'Y', 7}, {'W', 7}, {'F', 7}, {'M', 8}, {'L', 8}, {'I', 8},
{'V', 8}, {'C', 9}
};
// aa 8: reduced alphabet with 8 distinct codes (0..7). Letters sharing a
// code: A/G -> 0, T/S -> 1, N/D/E/Q -> 3, K/R -> 4, Y/W/F -> 6,
// M/L/I/V/C -> 7. P and H each keep a code of their own.
static const std::unordered_map<char, PrefixType> alphabet2bit8 =
{
{'A', 0}, {'T', 1}, {'S', 1}, {'P', 2}, {'G', 0}, {'N', 3},
{'D', 3}, {'E', 3}, {'Q', 3}, {'K', 4}, {'R', 4}, {'H', 5},
{'Y', 6}, {'W', 6}, {'F', 6}, {'M', 7}, {'L', 7}, {'I', 7},
{'V', 7}, {'C', 7}
};
/**
* @brief Computes the prefix and the suffix for a word in binary representation.
* @details Computes the prefix and suffix from scratch, without relying on previously computed values;
......@@ -145,11 +145,11 @@ getCompletePrefixSuffix(const S &seq, size_t i, PrefixType &prefix, CodedSuffix
{
// get first prefix
prefix = 0;
auto itEnd = alphabet2bit.end();
for (size_t j=i; j<i+6; ++j)
auto itEnd = ALPHABET_2_BIT.end();
for (size_t j=i; j<i+PREFIX_SIZE; ++j)
{
prefix <<= 5;
auto it = alphabet2bit.find(seq[j]);
prefix <<= ALPHABET_BIT_NUM;
auto it = ALPHABET_2_BIT.find(seq[j]);
if (it != itEnd)
prefix |= it->second;
else
......@@ -159,10 +159,10 @@ getCompletePrefixSuffix(const S &seq, size_t i, PrefixType &prefix, CodedSuffix
// get first suffix
suffix.suffix=0;
for (size_t j=i+6; j<i+18; ++j)
for (size_t j=i+PREFIX_SIZE; j<i+WORD_SIZE; ++j)
{
suffix.suffix <<= 5;
auto it = alphabet2bit.find(seq[j]);
suffix.suffix <<= ALPHABET_BIT_NUM;
auto it = ALPHABET_2_BIT.find(seq[j]);
if (it != itEnd)
suffix.suffix |= it->second;
else
......@@ -187,20 +187,19 @@ bool
getNextPrefixSuffix(const S &seq, size_t i, PrefixType &prefix, CodedSuffix &suffix)
{
// check if new character is ok
auto itSuffix = alphabet2bit.find(seq[i+17]);
if (itSuffix == alphabet2bit.end())
auto itSuffix = ALPHABET_2_BIT.find(seq[i+WORD_SIZE-1]);
if (itSuffix == ALPHABET_2_BIT.end())
return false;
static const PrefixType clearPrefix = ~(PrefixType(3) << 30);
static const SuffixType clearSuffix = ~(SuffixType(15) << suffixShift);
static const PrefixType clearPrefix = ((uint64_t)-1 << PREFIX_SHIFT);
static const SuffixType clearSuffix = ((uint64_t)-1 << SUFFIX_SHIFT);
// update prefix
auto itPrefix = alphabet2bit.find(seq[i+5]);
prefix <<= 5;
auto itPrefix = ALPHABET_2_BIT.find(seq[i+PREFIX_SIZE-1]);
prefix <<= ALPHABET_BIT_NUM;
prefix &= clearPrefix;
prefix |= itPrefix->second;
// update suffix
suffix.suffix <<= 5;
suffix.suffix <<= ALPHABET_BIT_NUM;
suffix.suffix &= clearSuffix;
suffix.suffix |= itSuffix->second;
return true;
......
......@@ -232,7 +232,7 @@ turnFile2db(fs::path &inFile, D &database, bool reverse)
std::cout << "Number of families: " << families.size() << std::endl;
const int windowSize = 18;
//const int windowSize = 18;
size_t famNumber = 0;
std::vector<D> threadDBs;
......@@ -251,8 +251,8 @@ turnFile2db(fs::path &inFile, D &database, bool reverse)
{
// split sequence into words and store in database
auto &seq = seqSet[j];
if (seq.size() >= windowSize)
splitSequence(seq, dbTmp, windowSize, reverse);
if (seq.size() >= WORD_SIZE)
splitSequence(seq, dbTmp, WORD_SIZE, reverse);
}
merge2dbs(dbTmp, threadDB);
......
......@@ -190,7 +190,6 @@ assignWords(const fs::path &inFile, BSDL::SequenceSet<BSDL::Sequence<> > &seqSet
size_t nSeqs = seqSet.size();
assignments.resize(nSeqs);
const size_t wordSize = 18;
PrefixType prefix;
CodedSuffix suffix;
......@@ -206,13 +205,13 @@ assignWords(const fs::path &inFile, BSDL::SequenceSet<BSDL::Sequence<> > &seqSet
std::reverse(seq.begin(), seq.end());
l = seq.size() -1;
}
if (seq.size() > wordSize)
if (seq.size() > WORD_SIZE)
{
auto &assignment = assignments[i];
size_t length = seq.size() - wordSize;
if (reverse && (length > (wordSize*2)))
length = wordSize * 2;
size_t length = seq.size() - WORD_SIZE;
/*if (reverse && (length > (WORD_SIZE*2)))
length = WORD_SIZE * 2;*/
bool last = false;
bool position;
for (size_t j=0; j<length; ++j)
......@@ -305,7 +304,6 @@ main(int argc, char const *argv[])
("detailed", po::value<fs::path>(&detailedFile), "The output file for the detailed results")
;
allOpts.add(general).add(outputO).add(translateO).add(hiddenO);
po::options_description visible("radiant " + radiantVersion + " (C) 2017 Carsten Kemena\nThis program comes with ABSOLUTELY NO WARRANTY;\n\nAllowed options are displayed below.");
visible.add(general).add(outputO).add(translateO);
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment