Commit 9b167203 authored by Carsten Kemena's avatar Carsten Kemena

changed DASetInputStrategies to be an object

parent 946aafb7
......@@ -33,11 +33,6 @@
* @copyright Copyright (c) 2019
*
*
* Any output strategy should follow the following pattern:
* - Identify if it can handle the provided format: if not return false
* - if an error occurred while reading, throw a format exception
* - if everything goes through, return true
*
*/
#ifndef BSDL_DOMAIN_DASETINPUTSTRATEGY_HPP
......@@ -50,77 +45,141 @@
namespace BioSeqDataLib
{
/**
* @brief Function to read the pfam_scan
* @brief Interface class for DASetInput strategies
*
* @tparam DomainType The domain type
* @param in The file to read from
* @param daSet The DomainArrangementSet to add the file content to
* This class is an abstract class describing the functions needed by DASetReader.
*
* \relates DASetReader
* @tparam DomainType The domain type.
*/
template<typename DomainType>
class DASetInputStrategy
{
public:
    /**
     * @brief Virtual destructor.
     *
     * Strategies are handled through pointers to this base class, so the
     * destructor has to be virtual to make destroying a derived strategy
     * through a base pointer well defined.
     */
    virtual ~DASetInputStrategy() = default;

    /**
     * @brief Strings denoting the supported format(s)
     *
     * The idea is to use this so that the application can automatically create a list of supported formats.
     *
     * @return std::vector<std::string> The names of the supported formats.
     */
    virtual std::vector<std::string> formats() const = 0;

    /**
     * @brief Check if this strategy can handle the provided file format.
     *
     * @param inFile The file to check.
     * @return true The file format is correct for this strategy.
     * @return false This file format is not supported by the given strategy.
     */
    virtual bool checkFormat(AlgorithmPack::Input &inFile) const = 0;

    /**
     * @brief Reads the provided file.
     *
     * @param inFile The file to read.
     * @param fileName The file name of the inFile parameter. Only to be used for exception throwing.
     * @return DomainArrangementSet<DomainType> The DomainArrangementSet containing all domains from the file.
     */
    virtual DomainArrangementSet<DomainType> readFile(AlgorithmPack::Input &inFile, const std::string &fileName) const = 0;
};
/**
* @brief PfamScan format reader class
*
* @tparam DomainType The domainType to be used.
*/
template<class DomainType>
bool
readPfamFormat(AlgorithmPack::Input &inFile, const std::string &fileName, DomainArrangementSet<DomainType> &daSet)
template<typename DomainType>
class PfamReader final : public DASetInputStrategy<DomainType>
{
std::string line;
getline(inFile, line);
if ((line.find("pfam_scan.pl") == std::string::npos) && (line.find("CICADA") == std::string::npos))
return false;
int line_counter = 0;
std::string last_name = "";
typename DomainArrangementSet<DomainType>::iterator da;
try
public:
std::vector<std::string> formats() const
{
while (getline(inFile, line))
return { "PfamScan" };
}
bool
checkFormat(AlgorithmPack::Input &inFile) const
{
std::string line;
inFile.seekg(0);
getline(inFile, line);
if ((line.find("pfam_scan.pl") == std::string::npos) && (line.find("CICADA") == std::string::npos))
return false;
return true;
}
DomainArrangementSet<DomainType>
readFile(AlgorithmPack::Input &inFile, const std::string &fileName) const
{
DomainArrangementSet<DomainType> daSet;
std::string line;
inFile.seekg(0);
getline(inFile, line);
int line_counter = 0;
std::string last_name = "";
typename DomainArrangementSet<DomainType>::iterator da;
try
{
++line_counter;
if (line.empty() || (line[0] == '#'))
continue;
auto tokens = split(line, " ");
if (tokens[0] != last_name)
while (getline(inFile, line))
{
last_name = tokens[0];
da = daSet.emplace(last_name, DomainArrangement<DomainType>()).first;
}
++line_counter;
if (line.empty() || (line[0] == '#'))
continue;
auto tokens = split(line, " ");
if (tokens[0] != last_name)
{
last_name = tokens[0];
da = daSet.emplace(last_name, DomainArrangement<DomainType>()).first;
}
/*
* pfam_scan.pl format
* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
* # <seq id> <alignment start> <alignment end> <envelope start> <envelope end> <hmm acc> <hmm name> <type> <hmm start> <hmm end> <hmm length> <bit score> <E-value> <significance> <clan>
*1kjz_A 5 193 3 194 PF00009.22 GTP_EFTU Domain 3 187 188 145.5 1.2e-42 1 CL0023
*/
std::string accession;
size_t pos;
if ((pos=tokens[5].find('.')) == std::string::npos)
{
accession=tokens[5];
}
else
{
accession=tokens[5].substr(0, pos);
}
/*
* pfam_scan.pl format
* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
* # <seq id> <alignment start> <alignment end> <envelope start> <envelope end> <hmm acc> <hmm name> <type> <hmm start> <hmm end> <hmm length> <bit score> <E-value> <significance> <clan>
*1kjz_A 5 193 3 194 PF00009.22 GTP_EFTU Domain 3 187 188 145.5 1.2e-42 1 CL0023
*/
std::string accession;
size_t pos;
if ((pos=tokens[5].find('.')) == std::string::npos)
{
accession=tokens[5];
}
else
{
accession=tokens[5].substr(0, pos);
}
PfamDomain dom(accession, tokens[6], std::stoul(tokens[1])-1, stoul(tokens[2])-1, stoul(tokens[3])-1, stoul(tokens[4])-1, stoul(tokens[8])-1, stoul(tokens[9])-1, stoul(tokens[10]), stod(tokens[11]), stod(tokens[12]), stod(tokens[13]), (tokens[14] == "No_clan") ? "" : tokens[14], tokens[7]);
da->second.emplace_back(std::move(dom));
PfamDomain dom(accession, tokens[6], std::stoul(tokens[1])-1, stoul(tokens[2])-1, stoul(tokens[3])-1, stoul(tokens[4])-1, stoul(tokens[8])-1, stoul(tokens[9])-1, stoul(tokens[10]), stod(tokens[11]), stod(tokens[12]), stod(tokens[13]), (tokens[14] == "No_clan") ? "" : tokens[14], tokens[7]);
da->second.emplace_back(std::move(dom));
}
}
catch(...)
{
throw FormatException("Error: Failed to read domain annotation file '" + fileName + "'. Format of file: PfamScan. Error occured in line: " + std::to_string(line_counter) + "\n");
}
for (auto &arrangement : daSet)
{
sort(arrangement.second.begin(), arrangement.second.end());
}
return daSet;
}
catch(...)
{
throw FormatException("Error: Failed to read domain annotation file '" + fileName + "'. Format of file: PfamScan. Error occured in line: " + std::to_string(line_counter) + "\n");
}
};
for (auto &arrangement : daSet)
{
sort(arrangement.second.begin(), arrangement.second.end());
}
return true;
}
}
......
......@@ -60,7 +60,7 @@ template<class DomainType>
class DASetReader
{
private:
std::vector<std::function<bool(AlgorithmPack::Input &, const std::string &, DomainArrangementSet<DomainType> &)> > strategies;
std::vector<DASetInputStrategy<DomainType> *> strategies_;
public:
/**
......@@ -84,18 +84,18 @@ class DASetReader
* @param daSet The arrangement set to read the domains into.
* @param fileName The input file name
*/
void
read(const fs::path &fileName, DomainArrangementSet<DomainType> &daSet)
DomainArrangementSet<DomainType>
read(const fs::path &fileName)
{
//DomainArrangementSet<DomainType> daSet;
//std::cout << "TEST-R: " << &daSet << "\n";
AlgorithmPack::Input inF(fileName);
for (size_t i=0; i<strategies.size(); ++i)
for (size_t i=0; i<strategies_.size(); ++i)
{
inF.seekg(0);
if (strategies[i](inF, fileName.string(), daSet))
if (strategies_[i]->checkFormat(inF))
{
inF.close();
return;
return strategies_[i]->readFile(inF, fileName.string());
}
}
inF.close();
......@@ -109,11 +109,17 @@ class DASetReader
* @param strategy Pointer to the strategy object to add.
*/
void
addStrategy(std::function<bool(AlgorithmPack::Input &, const std::string &, DomainArrangementSet<DomainType> &)> strategy)
addStrategy(DASetInputStrategy<DomainType> *strategy)
{
strategies.emplace_back(strategy);
strategies_.push_back(strategy);
}
void
loadAllSupportedStrategies()
{
}
};
......
......@@ -372,7 +372,7 @@ public:
* @return An iterator to the begin of the set.
*/
iterator
begin()
begin() noexcept
{
return arrangements_.begin();
}
......@@ -383,7 +383,7 @@ public:
* @return An iterator to the begin of the set.
*/
const_iterator
begin() const
begin() const noexcept
{
return arrangements_.begin();
}
......@@ -393,7 +393,7 @@ public:
* @return An iterator to the end of the set.
*/
iterator
end()
end() noexcept
{
return arrangements_.end();
}
......@@ -404,7 +404,7 @@ public:
* @return An iterator to the end of the set.
*/
const_iterator
end() const
end() const noexcept
{
return arrangements_.end();
}
......@@ -414,7 +414,7 @@ public:
* @return Iterator to the reverse beginning.
*/
reverse_iterator
rbegin()
rbegin() noexcept
{
return arrangements_.rbegin();
}
......@@ -424,7 +424,7 @@ public:
* @return An iterator to the reverse begin of the set.
*/
const_reverse_iterator
rbegin() const
rbegin() const noexcept
{
return arrangements_.rbegin();
}
......@@ -434,7 +434,7 @@ public:
* @return An iterator to the reverse end of the set.
*/
reverse_iterator
rend()
rend() noexcept
{
return arrangements_.rend();
}
......@@ -445,7 +445,7 @@ public:
* @return An iterator to the reverse end of the set.
*/
const_reverse_iterator
rend() const
rend() const noexcept
{
return arrangements_.rend();
}
......
......@@ -14,11 +14,13 @@ namespace BSDL = BioSeqDataLib;
BOOST_AUTO_TEST_CASE( pfamScanFormatTest )
{
BSDL::DomainArrangementSet<BSDL::PfamDomain> daSet;
AlgorithmPack::Input in("../tests/domain/data_new/BB20012.pfamScan");
auto worked = BSDL::readPfamFormat(in, "x", daSet);
BSDL::PfamReader<BSDL::PfamDomain> reader;
BOOST_CHECK_EQUAL(reader.checkFormat(in), true);
daSet = reader.readFile(in, "x");
in.close();
BOOST_CHECK_EQUAL(worked, true);
BOOST_CHECK_EQUAL(daSet.size(), 27);
auto da = daSet["1kjz_A"];
BOOST_CHECK_EQUAL(da[0].accession(), "PF00009");
......@@ -61,11 +63,11 @@ BOOST_AUTO_TEST_CASE( pfamScanFormatTest )
// check read with Domain
BioSeqDataLib::DomainArrangementSet<BioSeqDataLib::Domain> daSet2;
BSDL::PfamReader<BSDL::Domain> reader2;
in.open("../tests/domain/data_new/BB20012.pfamScan");
worked = BSDL::readPfamFormat(in, "x", daSet2);
BioSeqDataLib::DomainArrangementSet<BioSeqDataLib::Domain> daSet2 = reader2.readFile(in, "x");
in.close();
BOOST_CHECK_EQUAL(worked, true);
BOOST_CHECK_EQUAL(daSet2.size(), 27);
const BioSeqDataLib::DomainArrangement<BioSeqDataLib::Domain> &set2 = daSet2["IF2G_ARCFU"];
BOOST_CHECK_EQUAL(set2.size(), 3);
......@@ -76,9 +78,9 @@ BOOST_AUTO_TEST_CASE( pfamScanFormatTest )
BOOST_CHECK_CLOSE(dom2.evalue(), 0.0000000073, 0.000001);
// check read with DomainExt
BioSeqDataLib::DomainArrangementSet<BioSeqDataLib::DomainExt> daSet3;
BSDL::PfamReader<BSDL::DomainExt> reader3;
in.open("../tests/domain/data_new/BB20012.pfamScan");
worked = BSDL::readPfamFormat(in, "x", daSet3);
BioSeqDataLib::DomainArrangementSet<BioSeqDataLib::DomainExt> daSet3 = reader3.readFile(in, "x");
in.close();
BOOST_CHECK_EQUAL(daSet3.size(), 27);
const BioSeqDataLib::DomainArrangement<BioSeqDataLib::DomainExt> &set3 = daSet3["IF2G_ARCFU"];
......@@ -97,12 +99,9 @@ BOOST_AUTO_TEST_CASE( pfamScanFormatTest )
BOOST_CHECK_CLOSE(dom3.bitscore(), 35.6, 0.01);
// check format fail
BioSeqDataLib::DomainArrangementSet<BioSeqDataLib::DomainExt> daSet4;
in.open("../tests/domain/data_new/test.fasta");
worked = BSDL::readPfamFormat(in, "x", daSet4);
BOOST_CHECK_EQUAL(reader3.checkFormat(in), false);
in.close();
BOOST_CHECK_EQUAL(worked, false);
BOOST_CHECK_EQUAL(daSet4.size(), 0);
}
BOOST_AUTO_TEST_SUITE_END()
......
......@@ -9,11 +9,15 @@
#include <optional>
#include <utility>
BOOST_AUTO_TEST_SUITE(DASEr_Test)
namespace BSDL = BioSeqDataLib;
bool isUnsupported(BioSeqDataLib::FormatException const& ex )
{
return ( 0 == strcmp(ex.what(), "Error: Format of file '../tests/domain/data_new/test.fasta' could not be identified or is not supported.\n"));
......@@ -28,17 +32,15 @@ bool formatIsBroken(BioSeqDataLib::FormatException const& ex)
BOOST_AUTO_TEST_CASE( readPfam )
{
BSDL::DomainArrangementSet<BSDL::PfamDomain> daSet;
//BSDL::DomainArrangementSet<BSDL::PfamDomain> daSet;
BSDL::DASetReader<BSDL::PfamDomain> reader;
std::function<bool(AlgorithmPack::Input &, const std::string &, BSDL::DomainArrangementSet<BSDL::PfamDomain> &)> f = BSDL::readPfamFormat<BSDL::PfamDomain>;
reader.addStrategy(f);
reader.read("../tests/domain/data_new/BB20012.pfamScan", daSet);
reader.addStrategy(new BSDL::PfamReader<BSDL::PfamDomain>());
auto daSet = reader.read("../tests/domain/data_new/BB20012.pfamScan");
BOOST_CHECK_EQUAL(daSet.size(), 27);
BOOST_CHECK_THROW(reader.read("../tests/domain/data_new/NOTEXISTENT", daSet), std::ios_base::failure);
BOOST_CHECK_EXCEPTION(reader.read("../tests/domain/data_new/BB20012.pfamScan_formatError", daSet) , BioSeqDataLib::FormatException, formatIsBroken);
BOOST_CHECK_EXCEPTION(reader.read("../tests/domain/data_new/test.fasta", daSet) , BioSeqDataLib::FormatException, isUnsupported);
BOOST_CHECK_THROW(reader.read("../tests/domain/data_new/NOTEXISTENT"), std::ios_base::failure);
BOOST_CHECK_EXCEPTION(reader.read("../tests/domain/data_new/BB20012.pfamScan_formatError") , BioSeqDataLib::FormatException, formatIsBroken);
BOOST_CHECK_EXCEPTION(reader.read("../tests/domain/data_new/test.fasta") , BioSeqDataLib::FormatException, isUnsupported);
}
BOOST_AUTO_TEST_SUITE_END()
......
......@@ -27,8 +27,7 @@
# <seq id> <alignment start> <alignment end> <envelope start> <envelope end> <hmm acc> <hmm name> <type> <hmm start> <hmm end> <hmm length> <bit score> <E-value> <significance> <clan>
1kjz_A 5 193 3 194 PF00009.22 GTP_EFTU Domain 3 187 188 145.5 1.2e-42 1 CL0023
1kjz_A 312 398 312 398 PF09173.6
1kjz_A 221 302 221 303 PF03144.20 GTP_EFTU_D2 Domain 1 73 74 38.6 8.3e-10 1 CL0023
1kjz_A DKLS 302 221 303 PF03144.20 GTP_EFTU_D2 Domain 1 73 74 38.6 8.3e-10 1 CL0023
IF2G_METTH 6 198 4 199 PF00009.22 GTP_EFTU Domain 3 187 188 149.3 8.1e-44 1 CL0023
IF2G_METTH 230 298 230 310 PF03144.20 GTP_EFTU_D2 Domain 1 62 74 34.2 2e-08 1 CL0023
IF2G_METTH 324 407 322 407 PF09173.6 eIF2_C Domain 4 88 88 111.1 1.9e-32 1 No_clan
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment