Commit 4317b464 authored by T-B-F's avatar T-B-F

clarify sh script for matrix generation

parent ef518471
......@@ -10,23 +10,26 @@ python 3.x
Change the Pfam version to the current one available on the FTP server.
mkdir -p hhsuite_dbs/pfamA
mkdir -p hhsuite_dbs/pfamA
cd hhsuite_dbs/pfamA
wget http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/pfamA_31.0.tgz .
tar xzf pfamA_31.0.tgz
cd hhsuite_dbs/pfamA
### Execute the make_domat.sh script
wget http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/pfamA_31.0.tgz .
compile mat2bincrs.cpp:
tar xzf pfamA_31.0.tgz
g++ mat2bincrs.cpp -o mat2bincrs
### Execute the make_domat.sh script
Execute make_domat.sh script:
chmod u+x make_domat.sh
./make_domat.sh hhsuite_dbs/pfamA/pfam tmpdir/
chmod u+x make_domat.sh
./make_domat.sh hhsuite_dbs/pfamA/pfam tmpdir/
It may take some time.
If you are an experienced user you can execute many of these steps in parallel using a computer grid.
If you are an experienced user, you can execute many of these steps in parallel on a compute grid by modifying the _make_domat.sh_ script.
......
......@@ -42,8 +42,7 @@ do
done
# read hhsearch scores and make domain matrix
python make_matrix.py -i $tmphhsearch/*.scores -o $matrix -n matbin_$simcutoff -s PROBAB -t $simcutoff
python make_matrix2.py -i $tmphhsearch/*.scores -o $workout/tmp_matrix.dat -d PROBAB
# WARNING if this script is not working
# python make_matrix2.py -i $tmphhsearch/*.scores -o $workout/tmp_matrix.dat -d PROBAB -m ${workout}/map_name2id.dat
# mat2bincrs $workout/tmp_matrix.dat $matrix $simcutoff matbin_$simcutoff
\ No newline at end of file
# convert flatfile to binary matrix
mat2bincrs $workout/tmp_matrix.dat $matrix $simcutoff matbin_$simcutoff
\ No newline at end of file
......@@ -34,6 +34,7 @@ def getData(line, name):
sys.exit(1)
return int(floor(val+ 0.5))
def prefix_file(path):
""" get name of a file without extension out of a path
"""
......@@ -65,14 +66,14 @@ def main():
params = get_cmd()
# read mapping
mapping_name2id = read_mapping(params.mapping)
#mapping_name2id = read_mapping(params.mapping)
# pfam num id
# TODO: needs to be changed for other databases
names, inputfiles = list(zip(*sorted([(os.path.basename(f).split(".")[0], f) for f in params.listres])))
# mapping pfam id -> real idx (some values can be missing in the domain numbering)
dnames = dict(zip(names, range(len(names))))
vals, allids, col_ids, row_ids = [], [], [], []
for row_num, inputf in enumerate(inputfiles):
name = names[row_num]
......@@ -84,7 +85,7 @@ def main():
for line in inf :
tmp = line.split()
if found:
target = tmp[0]
target = tmp[0].split(".")[0]
col_num = dnames[target]
if col_num >= row_num:
val = getData(line, params.scorename)
......
......@@ -52,11 +52,15 @@ def get_cmd():
""" read command line parameters
"""
parser = argparse.ArgumentParser( )
parser.add_argument("-i", action="store", dest="listres", help="directory to result", nargs="+")
parser.add_argument("-d", action="store", dest="dataname", help="data to get")
parser.add_argument("-m", action="store", dest="mapping", help="mapping between names and id")
parser.add_argument("-c", action="store", dest="cutoff_evalue", help="cutoff evalue", type=float, default=10**(-3))
parser.add_argument("-o", action="store", dest="outmatrix", help="output matrix")
parser.add_argument("-i", action="store", dest="listres",
help="directory to result", nargs="+")
parser.add_argument("-d", action="store", dest="dataname",
help="data to get")
#parser.add_argument("-m", action="store", dest="mapping", help="mapping between names and id")
parser.add_argument("-c", action="store", dest="cutoff_evalue",
help="cutoff evalue", type=float, default=10**(-3))
parser.add_argument("-o", action="store", dest="outmatrix",
help="output matrix")
params = parser.parse_args( )
return params
......@@ -65,12 +69,15 @@ def main():
params = get_cmd()
# read mapping
mapping_name2id = read_mapping(params.mapping)
#mapping_name2id = read_mapping(params.mapping)
# get result names
list_res = [(prefix_file(f), f) for f in params.listres]
names, inputfiles = zip(*sorted(list_res))
dnames = dict(zip(names, range(len(names)))
#names, inputfiles = zip(*sorted(list_res))
#dnames = dict(zip(names, range(len(names)))
names, inputfiles = list(zip(*sorted([(os.path.basename(f).split(".")[0], f) for f in params.listres])))
dnames = dict(zip(names, range(len(names))))
mat = np.zeros((len(names), len(names)), dtype=np.float32)
for i, inputf in enumerate(inputfiles):
......@@ -78,11 +85,13 @@ def main():
# read data
with open(inputf) as inf:
found = False
for line in f:
for line in inf:
if found:
tmp = line.split()
target = tmp[0][2:]
j = dnames[mapping_name2id[target]]
#target = tmp[0][2:]
#j = dnames[mapping_name2id[target]]
target = tmp[0].split(".")[0]
j = dnames[target]
val = getData(line, params.dataname)
evalue = getData(line, "LOG-EVAL")/(-1.443) # evalue formula in HHsearch output -1.443 * hit.logEval
if evalue < log(params.cutoff_evalue):
......
......@@ -35,7 +35,9 @@ main(int argc, char *argv[])
if (show_help || (argc!=5))
{
printf("USAGE: mat2bincrs.cpp <matrix file> <output file> <filter threshold> <name/identifier>\n\n");
printf("This program converts the input matrix into a binary sparse matrix in CRS (Compressed Row Storage) format.\n\n Output file description:\n Field_id name type number\n 1 name char 10\n 2 n_rows int 1\n 3 n_vals int 1\n 4 names int n_rows\n 5 row_ids int n_rows\n 6 col_ids int n_vals\n 7 vals values short n_vals\n");
printf("This program converts the input matrix into a binary sparse matrix in CRS (Compressed Row Storage) format.\n\n");
printf("Output file description:\n");
printf("Field_id name type number\n 1 name char 10\n 2 n_rows int 1\n 3 n_vals int 1\n 4 names int n_rows\n 5 row_ids int n_rows\n 6 col_ids int n_vals\n 7 vals values short n_vals\n");
if (show_help)
exit(EXIT_SUCCESS);
else
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment