Commit 479797f2 authored by Carsten Kemena

added multi domain file test and errors

parent 689617a7
......@@ -241,7 +241,7 @@ DBCreator::readAnnotationFile(const fs::path &annotationFile, const fs::path &se
{
seqLength = to_string(seqLengths[pair.first]);
if (seqLength == "0")
cerr << "WARNING! " << pair.first << " not in sequence file. Protein length will be set to 0.\n";
throw std::runtime_error("WARNING! " + pair.first + " not in sequence file.\n");
}
this->addDomainArrangement_(pair.second, pair.first, seqLength);
}
......
......@@ -112,6 +112,18 @@ main(int argc, char *argv[])
return EXIT_FAILURE;
}
// check for domain annotation files
if ((daFiles.size() == 0) and (seqFiles.size() != 0))
cerr << "WARNING! Sequence file is not needed when using InterPro. Sequence file is ignored.\n";
else
{
if ((daFiles.size() > 0) and (!seqFiles.empty()) and (seqFiles.size() != daFiles.size()))
{
cerr << "Error! If you provide a sequence file, you have to provide one for each annotation file.";
exit(1);
}
}
try
{
DBCreator db;
......
# pfam_scan.pl, run at Fri Dec 2 10:10:58 2016
#
# Copyright (c) 2009 Genome Research Ltd
# Freely distributed under the GNU
# General Public License
#
# Authors: Jaina Mistry (jm14@sanger.ac.uk), John Tate (jt6@sanger.ac.uk),
# Rob Finn (rdf@sanger.ac.uk)
#
# This is free software; you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation; either version 2 of the License, or (at your option) any later version.
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program. If not, see <http://www.gnu.org/licenses/>.
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
# query sequence file: db_seqs.fa
# searching against: /global/databases/pfam/v30.0//Pfam-A.hmm, with cut off --cut_ga
# resolve clan overlaps: on
# predict active sites: off
# = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
#
# <seq id> <alignment start> <alignment end> <envelope start> <envelope end> <hmm acc> <hmm name> <type> <hmm start> <hmm end> <hmm length> <bit score> <E-value> <significance> <clan>
A0A011 20 276 11 277 PF00664.21 ABC_membrane Family 12 273 274 29.9 3.7e-07 1 CL0241
A0A011 361 504 360 506 PF00004.25 ABC_tran Domain 2 135 137 80.1 1.9e-22 1 CL0023
A0A017 3 105 1 105 PF02310.17 B12-binding Domain 19 121 121 49.3 3.9e-13 1 CL0063
A0A017 164 324 163 325 PF04055.19 Radical_SAM Domain 2 166 167 89.0 3.8e-25 1 CL0036
A0A012 19 279 8 279 PF00664.21 ABC_membrane Family 12 274 274 102.4 2.9e-29 1 CL0241
A0A012 340 489 340 489 PF00005.25 ABC_tran Domain 1 137 137 93.3 1.6e-26 1 CL0023
A0A013 15 249 15 249 PF01370.19 Epimerase Family 1 241 241 177.3 3.1e-52 1 CL0063
A0A010 41 381 37 381 PF00155.19 Aminotran_1_2 Domain 6 363 363 206.8 4.9e-61 1 CL0061
A0A014 49 162 48 163 PF13537.4 GATase_7 Domain 2 123 124 41.0 1.4e-10 1 CL0052
A0A014 240 626 239 627 PF00733.19 Asn_synthase Domain 2 354 355 277.1 2.9e-82 1 CL0039
A0A019 10 63 2 69 PF02543.13 Carbam_trans_N Domain 11 66 338 33.8 2.6e-08 1 CL0108
A0A019 104 312 86 313 PF02543.13 Carbam_trans_N Domain 123 337 338 62.5 4.9e-17 1 CL0108
A0A019 362 524 361 526 PF16861.3 Carbam_trans_C Domain 2 168 170 151.4 1.5e-44 1 No_clan
>A0A011
MLRGSARTYWTLTGLWVLLRAGTLVVGLLFQRLFDALGAGGGVWLIIALVAAIEAGRLFL
QFGVMINRLEPRVQYGTTARLRHALLGSALRGSEVTARTSPGESLRTVGEDVDETGFFVA
WAPTNLAHWLFVAASVTVMMRIDAVVTGALLALLVLLTLVTALAHSRFLRHRRATRAASG
EVAGALREMVGAVGAVQAAAAEPQVAAHVAGLNGARAEAAVREELYAVVQRTVIGNPAPI
GVGVVLLLVAGRMDEGTFSVGDLALFAFYLQILTEALGSIGMLSVRLQRVSVALGRITNN
LGCRLRRSLERASPPIASDAPGGTGEGAAAPDAGPEPAPPLRELAVRGLTARHPGAGHGI
EDVDLVVERHTVTVVTGRVGSGKSTLVRAVLGLLPHERGTVLWNGEPIADPASFLVAPRC
GYTPQVPCLFSGTVRENVLLGRDGAAFDEAVRLAVAEPDLAAMQDGPDTVVGPRGLRLSG
GQIQRVAIARMLVGDPELVVLDDVSSALDPETEHLLWERLLDGTRTVLAVSHRPALLRAA
DRVVVLEGGRVEASGTFEEVMAVSAEMGRIWTGAGPGGGDAGPAPQSPPAG
>A0A017
MGYIHTALKSAGFHHVIQVDTPALGLDSEGLRKLLADFEPDLVGVSTTTPGLPGAIEACE
AAKSTGAKVILGGPHTEVYAHENLVHESIDYVGVGEGVTIMPELAEAMERGEEPEGIRGL
VTRKHDGGAAPMVNLEEVGWPERAGLPMDRYYSIMAPRPFATMISSRGCPFKCSFCFKQA
VDKKSMYRSPEDVVGEMTELKERWGVKEIMFYDDVFTLHRGRVREICGLIGETGLKVRWE
APTRVDLVPEPLLEAMAGAGCVRLRFGIEHGDSEILERMRKESDIQKIEKAVTSAHEAGI
KGFGYFIVGWLGETREQFRRTVDLACRLPLDYASFYTATPLPGTPLHTESVAAGQIPPDY
WDRFSCGASSTRGSGTWCRTRRSAPSGRTAPSSCAAPWSSRCCRTWR
>A0A012
MRGERTAVALLALLVPAGMGLQLVAPYLLRGFIDGALSGDSRKTLLDLAAWSLAAAVGTL
VVTAGTEALSSRVAWRSTNRLRADLVEHCLSRPPGFYRKHPPGELVERMDGDVTRLAAVM
STLLLELLAQALLIVGILVALFRLEWRLALVVAPFAAGTLLLLRTLVGRAMPFVTARQRV
AADLQGFLEERLAAAEDLRVNGASRYTLRELGDRQDDLYRKARDAARASVRWPATVQGLS
AVSVVLALAVSAWLHARGQLSTGTAFASLSYAMLLRRPLLAVTTRFRELEDAAASAQRLR
DLLGHGTAAPRTGRGTLPAGLPGVRFDGVSFGYEPDEPVLRDVSFTLRPGERLGVVGRTG
SGKSTVVRLLFGLHHPGAGSVSAGGLDLTEIDPRALRSRVALVTQEVHVFHASLRDNLTF
FDRSVPDDRLRAALGEAGLGPWLRTLPDGLDTPLGAGARGMSAGEEQQLALARVFLRDPG
LVLMDEPTARLDPYSERLLMPALERLLEGRTAVVVEHRPHLLRNVDRILVLEEGKVAEEG
ERRVLAADPGSRFHALLRTAGATR
>A0A013
MSSDTHGTDLADGDVLVTGAAGFIGSHLVTELRNSGRNVVAVDRRPLPDDLESTSPPFTG
SLREIRGDLNSLNLVDCLKNISTVFHLAALPGVRPSWTQFPEYLRCNVLATQRLMEACVQ
AGVERVVVASSSSVYGGADGVMSEDDLPRPLSPYGVTKLAAERLALAFAARGDAELSVGA
LRFFTVYGPGQRPDMFISRLIRATLRGEPVEIYGDGTQLRDFTHVSDVVRALMLTASVRD
RGSAVLNIGTGSAVSVNEVVSMTAELTGLRPCTAYGSARIGDVRSTTADVRQAQSVLGFT
ARTGLREGLATQIEWTRRSLSGAEQDTVPVGGSSVSVPRL
>A0A010
MDFFVRLARETGDRKREFLELGRKAGRFPAASTSNGEISIWCSNDYLGMGQHPDVLDAMK
RSVDEYGGGSGGSRNTGGTNHFHVALEREPAEPHGKEDAVLFTSGYSANEGSLSVLAGAV
DDCQVFSDSANHASIIDGLRHSGARKHVFRHKDGRHLEELLAAADRDKPKFIALESVHSM
RGDIALLAEIAGLAKRYGAVTFLDEVHAVGMYGPGGAGIAARDGVHCEFTVVMGTLAKAF
GMTGGYVAGPAVLMDAVRARARSFVFTTALPPAVAAGALAAVRHLRGSDEERRRPAENAR
LTHGLLRERDIPVLSDRSPIVPVLVGEDRMCKRMSALPLERHGAYVQAIDAPSVPAGEEI
LRIAPSAVHETEEIHRFVDALDGIWSELGAARRV
>A0A014
MCGFVGFSDAGAGQEDARVTAERMLAAVAHRGPDGSDWCHHRGVTLAHCALTFTDPDHGA
QPFVSASGATAVVFNGELYNHAVLGDGALPCAPGGDTEVPGGTLRVAGHADARPAAGHVR
LRAAGRPHRHHGAGRDRWGRAPLLTPACETDIAFASELTSLLRHPAAPRTPEVRALADYL
VLQAFCAPASAVSGVCKVRPGSYVTHRHGALDETEFWRPRLTPDRGAGRGPGRREAARRF
EELFRAAVARRMTSTDRRLGVLLSGGLDSSAVAAVAQQLLPGRPVPTFSAGFADPDFDES
DHARAVARHLGTEHHVVRIGGADLAGVVESELAVADEPLADPSLLPTRLVCRAAREHVRG
VLTGDGADELLLGYRYFQAERAIELLLRVLPAPRLEALVRLLVRRLPARSGNLPVTHALG
LLAKGLRAAPEHRFYLSTAPFGPGELPRLLTPEAGAELTGHDPFTEVSRLLRGQPGLTGV
QRSQLAVVTHFLRDVILTKTDRGGMRSSLELRSPFLDLDLVEYGNSLPTGLKLHRFTGKY
LLRQVAAGWLPPSVVQRTKLGFRAPVAALLRGELRPLLLDTLSPSSLRRGGLFDTGAVRL
LIDDHLGGRRDTSRKLWALLVYQLWFESLTAGPRALESPAYPALS
>A0A019
MKVLSLHSAGHDTGVAYFEDGRLVFAVETERLTRVKHDHRSDVALRHVLEQECVDTDGID
LVAVSTPVRSGLLRIPDLDRAMERIGAGALHHRTVCEMLGRRVECVVVTHEVSHAALAAH
YADWEEGTVVLVNEGRGQLTRSSLFRVTGGALEWVDKDPLPWYGNGFGWTAIGYLLGFGP
SPSVAGKVMAMGGYGQPDPRIREQLLSVDPEVMNDRELAERVRADLAGRPEFAPGFETAS
QVVATFQEMFTEAVRAVLDRHVTRTDAGVGPIALGGGCALNIVANSALREEYGRDVAIPP
ACGDAGHLTGAGLYALAQVAGVKPEPFSVYRNGGGEARAAVLEAVEGAGLRAVPYDRSAV
AGVLAGGGVVALTQGAAELGPRALGHRSLLGSPAVPGMRERMSEKLKRREWFRPLGAVMR
DERFAGLYPGRAPSPYMLFEYRLPDGIAPEARHVNGTCRIQTLGPEEDRLYGLLAEFEEL
SGVPALINTSLNGPGKPIAHTARDVLDDFARTDVDLFVFDDLMVRGAAAR
>A0A010
MKVLSLHSAGHDTGVAYFEDGRLVFAVETERLTRVKHDHRSDVALRHVLEQECVDTDGID
LVAVSTPVRSGLLRIPDLDRAMERIGAGALHHRTVCEMLGRRVECVVVTHEVSHAALAAH
YADWEEGTVVLVNEGRGQLTRSSLFRVTGGALEWVDKDPLPWYGNGFGWTAIGYLLGFGP
SPSVAGKVMAMGGYGQPDPRIREQLLSVDPEVMNDRELAERVRADLAGRPEFAPGFETAS
QVVATFQEMFTEAVRAVLDRHVTRTDAGVGPIALGGGCALNIVANSALREEYGRDVAIPP
ACGDAGHLTGAGLYALAQVAGVKPEPFSVYRNGGGEARAAVLEAVEGAGLRAVPYDRSAV
AGVLAGGGVVALTQGAAELGPRALGHRSLLGSPAVPGMRERMSEKLKRREWFRPLGAVMR
DERFAGLYPGRAPSPYMLFEYRLPDGIAPEARHVNGTCRIQTLGPEEDRLYGLLAEFEEL
SGVPALINTSLNGPGKPIAHTARDVLDDFARTDVDLFVFDDLMVRGAAAR
>A0A016
MTVRRPAASAPRVLLTAGPDGVRVEGDGEARLGHPLTGDHLDPGPPAEGVFAGWRWDGER
LVARNDRYGVCPLFYRAGGGSLALSPDPLALLPEDGPVELDHDALAVFLRTGFFLAEDTA
FAQVRALPPAATLTWDTGGLRLRSDGPPRPGAAAMTEAQAVDGFVDLFRASVARRLPGEP
YDLPLSGGRDSRHILLELCRRGAPPRRCVSGAKFPPDPGADARVAAALAGRLGLPHTVVP
RPRSQFRAELAALPAQGMTTLDGAWTQPVLAHLRRHSRISYDGLGGGELVQNPSVEFIRA
NPYDPADLPGLADRLLAASRTGPHVEHLLSPRTNALWSRQAARRRLVTELARHADSASPL
SSFFFWNRTRRSISAAPFALGDGRVLTHTPYLDHALFDHLASVPHRFLVDGTFHDRALHR
AFPEHADLGFASSVPQRHGPVLVAHRLAYLLRFLAHATVVEPGWWRGPDRFLQRLLAAGR
GPGAPQRVSRLQPLALYLLQLEDLAVRRARRRP
>A0A015
MAAPDRPLVQVLSPRTWGEFGNYLAATRFSRALRSVIDAEVTLLEAEPILPWIGEAGAQI
RTISLESPDAVVRNQRYMALMDRLQARFPEGFEADPTAAQRADLEPLTRHLRESAPDVVV
GTKGFVARLCVAAVRLAGTSTRVVSHVTNPGLLQLPLHRSRYPDLTLVGFPRAKEHLLAT
AGGDPERVQVVGPLVAQHDLRDFMTSETAVSEAGPWGGDSGPDRPRVIIFSNRGGDTYPE
LVRRLADRHPGIDLVFVGYGDPELARRTAAVGRPHWRFHSVLGQSEYFDYIRRASRSRYG
LLVSKAGPNTTLEAAYFGIPVLMLESGLPMERWVPGLIHEEGLGHACATPEELFRTADDW
LTRPSVIEVHKKAAVSFAASVLDQDAVTARIKAALQPLLDAR
# RADS version 2.3.0
# RADS Output v1
# run at Fri Jun 29 16:25:28 2018
#
# query file: -
# database: /local/home/ckeme_01/projects/domainWorld/RADS/tests/integrationTests/multi_pfam
# gap open penalty -50
# gap extension penalty -10
# matrix: /local/home/ckeme_01/.domainWorld/dsm/pfam-31.dsm
# all: false
# collapse: true
# ******************************************************************
# -------------------------------------------------------------------
Results for: manual entered query
Domain arrangement: PF00005
# score | normalized | SeqID | sequence length | domain arrangement | aln
# -------------------------------------------------------------------
90 0.52 A0A001 591 PF00664 20 276 PF00005 361 504 1
90 0.52 A0A002 564 PF00664 19 279 PF00005 340 489 1
90 0.52 A0A012 564 PF00664 19 279 PF00005 340 489 1
# -------------------------------------------------------------------
List of alignments:
# -------------------------------------------------------------------
1)
Query DA: ******* PF00005
Target DA: PF00664 PF00005
......@@ -91,8 +91,7 @@
@test "rads - InterPro order" {
# database based on pfam annotation files
run ../../build/makeRadsDB -I ../data/sort-test.xml -s ../data/db_seqs.fa -o ip_order -d PFAM
run ../../build/makeRadsDB -I ../data/sort-test.xml -o ip_order -d PFAM
[ $status == 0 ]
echo $output
[ "$output" == $'Number of sequences included: 3\nNumber of distinct arrangements 2' ]
......@@ -108,8 +107,7 @@
@test "rads - query collapse" {
# database based on pfam annotation files
run ../../build/makeRadsDB -I ../data/sort-test.xml -s ../data/db_seqs.fa -o ip_order -d PFAM
run ../../build/makeRadsDB -I ../data/sort-test.xml -o ip_order -d PFAM
[ $status == 0 ]
echo $output
[ "$output" == $'Number of sequences included: 3\nNumber of distinct arrangements 2' ]
......@@ -121,3 +119,18 @@
rm ip_order.db ip_order.da test-collapse.txt
}
@test "rads - multi annotation files" {
    # Build one database from two Pfam annotation files, passing one sequence
    # file per annotation file, then query it and compare the result against
    # the stored expectation. Lines containing '#' are filtered out before the
    # diff because the output header carries run-specific data (timestamp,
    # absolute paths).
    run ../../build/makeRadsDB -i ../data/db_pfam.dom ../data/db_pfam3.dom -s ../data/db_seqs.fa ../data/db_seqs3.fa -o multi_pfam
    [ $status == 0 ]
    echo $output
    [ "$output" == $'Number of sequences included: 15\nNumber of distinct arrangements 8' ]
    run ../../build/rads -D PF00005 -m pfam-31.dsm -d multi_pfam -o test-multi.txt -c -l
    run diff <(grep -v '#' test-multi.txt) <(grep -v '#' results/test-multi.txt)
    [ $status == 0 ]
    # Remove the artifacts produced by this test.
    rm multi_pfam.db multi_pfam.da test-multi.txt
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment