Commit 7d971f8b authored by Carsten Kemena's avatar Carsten Kemena

adding sw to AlignmentMatrix

- added Smith-Waterman algorithm
- added test for Smith-Waterman algorithm
- added test for RASPODOM
parent e2eb35f5
......@@ -71,6 +71,9 @@ private:
void
traceback_raspodom_(std::vector<long int> &editString1, std::vector<long int> &editString2);
void
traceback_sw_(std::vector<long int> &editString1, std::vector<long int> &editString2);
using MatchTracker = std::set<std::pair<size_t, size_t> >;
bool
traceback_raspodom_optimal_path_(MatchTracker &matches_tl, MatchTracker &matches_tr,
......@@ -101,6 +104,11 @@ public:
void
gotoh(const SeqType &seq1, const SeqType &seq2);
template<typename SeqType>
void
sw(const SeqType &seq1, const SeqType &seq2);
template<typename SeqType>
void
raspodom(const SeqType &seq1, const SeqType &seq2);
......@@ -120,6 +128,9 @@ public:
case 2:
traceback_raspodom_(editString1, editString2);
break;
case 3:
traceback_sw_(editString1, editString2);
break;
}
}
......@@ -784,6 +795,137 @@ AlignmentMatrix<DataType, SimMat>::traceback_raspodom_(std::vector<long int> &ed
isCP_ = true;
}
/***************************************************
* SW - Algorithm *
***************************************************/
template<typename DataType, typename SimMat>
template<typename SeqType>
void
AlignmentMatrix<DataType, SimMat>::sw(const SeqType &seq1, const SeqType &seq2)
{
algorithm_ = 3;
dim1_ = seq1.size();
dim2_ = seq2.size();
size_t j;
matrix_.resize(dim1_+1, dim2_+1);
for (size_t i=0; i<dim1_; ++i)
{
for (j=0; j<dim2_; ++j)
matrix_[i+1][j+1].first = simMat_.val(seq1[i],seq2[j]);
}
size_t dim2 = dim2_ + 1;
auto itPrev = matrix_[0].begin();
auto itVert = matrix_[0].begin();
auto it = matrix_[0].begin();
auto itEnd = matrix_[0].begin()+dim2;
char path;
size_t i;
DataType score;
for (; it!=itEnd; ++it)
{
it->first=0;
it->second='o';
}
for (i=1; i <= dim1_; ++i)
{
itPrev = it = matrix_[i].begin();
itEnd = matrix_[i].begin()+dim2;
itVert = matrix_[i-1].begin();
it->first = 0;
it->second = 'o';
++it;
for (; it!=itEnd; ++it, ++itPrev)
{
score=it->first+itVert->first;
path=itVert->second;
++itVert;
// check gaps;
if (itVert->first > itPrev->first)
{
it->first = itVert->first;
it->second = 'i';
}
else
{
it->first = itPrev->first;
it->second = 'j';
}
it->first += gep_;
// check match
if ((score > it->first) || ((score == it->first) && (path=='m')))
{
it->first = score;
it->second = 'm';
}
if (it->first <= 0)
{
it->first = 0;
it->second = 'o';
}
}
}
// set best score
score_ = 0;
for (size_t i = 1; i <= dim1_; ++i)
{
for (size_t j = 1; j <= dim2_; ++j)
{
if (matrix_[i][j].first > score_)
{
score_ = matrix_[i][j].first;
best_score_x_ = i;
best_score_y_ = j;
}
}
}
}
template<typename DataType, typename SimMat>
void
AlignmentMatrix<DataType, SimMat>::traceback_sw_(std::vector<long int> &editString1, std::vector<long int> &editString2)
{
size_t dim1 = best_score_x_;
size_t dim2 = best_score_y_;
editString1.clear();
editString2.clear();
while (matrix_[dim1][dim2].second != 'o')
{
if (matrix_[dim1][dim2].second == 'm')
{
--dim1;
--dim2;
editString1.push_back(dim1);
editString2.push_back(dim2);
}
else
{
if (matrix_[dim1][dim2].second == 'i')
{
--dim1;
editString1.push_back(dim1);
editString2.push_back(-1);
}
else
{
--dim2;
editString1.push_back(-1);
editString2.push_back(dim2);
}
}
}
std::reverse(editString1.begin(), editString1.end());
std::reverse(editString2.begin(), editString2.end());
}
}
#endif /* AlignmentMatrix_hpp */
......@@ -88,7 +88,7 @@ namespace BioSeqDataLib
* @param simMat The similarity matrix.
* @param matrix The dynamic programming matrix.
*/
template<typename SeqType, typename SimMat, typename DataType>
template<typename SeqType, typename SimMat, typename DataType>
void
initNwMatrix(const SeqType &seq1, const SeqType &seq2, const SimMat &simMat, LineMatrix<std::pair<DataType, char> > &matrix)
{
......
......@@ -199,8 +199,28 @@ BOOST_AUTO_TEST_CASE( Gotoh_alignDomain_Test2 )
BOOST_CHECK_EQUAL(mat.score(), 145.0);
}
BOOST_AUTO_TEST_CASE(sw_align_test )
{
BioSeqDataLib::Sequence<> seq1("seq1", "WWWWWWWWWWQGPYELSDTLQAPVLNDEWGTEAVFELLSNAVWWWWWWWWWWWWW", "", "test sequence");
BioSeqDataLib::Sequence<> seq2("seq2", "PPPPPPPPPPPQGPYELSDDTNQAPVLNDEGTEAVFELLSNAVPPPPPPPPPPPP", "", "test sequence");
BioSeqDataLib::SimilarityMatrix<float> simMat("../tests/align/data/BLOSUM62.txt");
BioSeqDataLib::Matrix< std::pair<float,char> > matrix(1,2);
BioSeqDataLib::AlignmentMatrix<float, BioSeqDataLib::SimilarityMatrix<float> > mat(-3, simMat);
mat.sw(seq1, seq2);
std::vector<long int> eS1, eS2;
mat.traceback(eS1, eS2);
std::vector<long int> expected1 { 10, 11, 12, 13, 14, 15, 16, 17, -1, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41};
std::vector<long int> expected2 { 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, -1, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42};
BOOST_CHECK_EQUAL_COLLECTIONS(eS1.begin(), eS1.end(), expected1.begin(), expected1.end());
BOOST_CHECK_EQUAL_COLLECTIONS(eS2.begin(), eS2.end(), expected2.begin(), expected2.end());
}
BOOST_AUTO_TEST_CASE(raspodom_align_domain_test )
{
BioSeqDataLib::Settings settings;
settings.readSettings();
fs::path matrixName = settings["dsm"] / "pfam-31.dsm";
......@@ -219,10 +239,66 @@ BOOST_AUTO_TEST_CASE(raspodom_align_domain_test )
seq2.push_back(dom);
dom.accession("PF00001");
seq2.push_back(dom);
BioSeqDataLib::AlignmentMatrix<int, BioSeqDataLib::DSM> mat(-50, -10, simMat);
mat.raspodom(seq1, seq2);
std::vector<long int> eS1, eS2;
mat.traceback(eS1, eS2);
std::vector<long int> expected1 { 0, 1, 2};
std::vector<long int> expected2 { 2, 0, 1};
BOOST_CHECK_EQUAL_COLLECTIONS(eS1.begin(), eS1.end(), expected1.begin(), expected1.end());
BOOST_CHECK_EQUAL_COLLECTIONS(eS2.begin(), eS2.end(), expected2.begin(), expected2.end());
BOOST_CHECK_EQUAL(mat.isCP(), true);
seq1.clear();
seq2.clear();
dom.accession("PF00001");
seq1.push_back(dom);
dom.accession("PF00002");
seq1.push_back(dom);
seq2.push_back(dom);
dom.accession("PF00003");
seq1.push_back(dom);
seq2.push_back(dom);
dom.accession("PF00004");
seq1.push_back(dom);
dom.accession("PF00001");
seq2.push_back(dom);
mat.raspodom(seq1, seq2);
mat.traceback(eS1, eS2);
expected1 = { 3, 0, 1, 2};
expected2 = { -1, 2, 0, 1};
BOOST_CHECK_EQUAL_COLLECTIONS(eS1.begin(), eS1.end(), expected1.begin(), expected1.end());
BOOST_CHECK_EQUAL_COLLECTIONS(eS2.begin(), eS2.end(), expected2.begin(), expected2.end());
BOOST_CHECK_EQUAL(mat.isCP(), true);
seq1.clear();
seq2.clear();
dom.accession("PF00001");
seq1.push_back(dom);
dom.accession("PF00002");
seq1.push_back(dom);
seq2.push_back(dom);
dom.accession("PF00003");
seq1.push_back(dom);
seq2.push_back(dom);
dom.accession("PF00004");
seq2.push_back(dom);
dom.accession("PF00001");
seq2.push_back(dom);
mat.raspodom(seq1, seq2);
mat.traceback(eS1, eS2);
expected1 = { -1, 0, 1, 2};
expected2 = { 2, 3, 0, 1};
BOOST_CHECK_EQUAL_COLLECTIONS(eS1.begin(), eS1.end(), expected1.begin(), expected1.end());
BOOST_CHECK_EQUAL_COLLECTIONS(eS2.begin(), eS2.end(), expected2.begin(), expected2.end());
BOOST_CHECK_EQUAL(mat.isCP(), true);
// seq2.push_back(dom);
......@@ -248,8 +324,7 @@ BOOST_AUTO_TEST_CASE(raspodom_align_domain_test )
dom.accession("PF00003");
//seq2.push_back(dom);
*/
BioSeqDataLib::AlignmentMatrix<int, BioSeqDataLib::DSM> mat(-50, -10, simMat);
/*BioSeqDataLib::AlignmentMatrix<int, BioSeqDataLib::DSM> mat(-50, -10, simMat);
mat.raspodom(seq1, seq2);
std::vector<long int> eS1, eS2;
mat.traceback(eS1, eS2);
......@@ -261,7 +336,7 @@ BOOST_AUTO_TEST_CASE(raspodom_align_domain_test )
std::cout << elem << " ";
std::cout << "\n";
std::cout << mat.isCP() << "\n";
std::cout << mat.isCP() << "\n";*/
//std::vector<long int> expected1 { 0, 1, 2};
//std::vector<long int> expected2 { -1, 0, 1};
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment