comparison: nf/subworkflows/ncbi/gnomon/gnomon_wnode/main.nf @ 0:d9c5c5b87fec (draft)
planemo upload for repository https://github.com/ncbi/egapx commit 8173d01b08d9a91c9ec5f6cb50af346edc8020c4

author:   fubar
date:     Sat, 03 Aug 2024 11:16:53 +0000
parents:  (none)
children: (none)
#!/usr/bin/env nextflow
nextflow.enable.dsl=2

include { merge_params } from '../../utilities'


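// Classic GPX scatter/gather: gpx_qsubmit slices the chain batch into per-node
// job files, annot runs annot_wnode over each slice in parallel, and gpx_qdump
// collects the per-slice results into a single output file.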
workflow gnomon_wnode {
    take:
        scaffolds
        chains
        chains_slices
        hmm_params
        softmask_lds2
        softmask_lds2_source
        genome
        proteins
        parameters     // Map: extra parameters and parameter updates
    main:
        String gpx_qsubmit_params = merge_params("", parameters, 'gpx_qsubmit')
        String annot_params = merge_params("-margin 1000 -mincont 1000 -minlen 225 -mpp 10.0 -ncsp 25 -window 200000 -nonconsens -open", parameters, 'annot_wnode')
        String gpx_qdump_params = merge_params("-slices-for affinity -sort-by affinity", parameters, 'gpx_qdump')
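        // merge_params (from ../../utilities) appears to overlay any overrides
        // stored under the given key in 'parameters' onto the default command
        // line passed as the first argument (an assumption based on usage here).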

        def (jobs, lines_per_file) = gpx_qsubmit(scaffolds, chains, chains_slices, gpx_qsubmit_params)
        def annot_files = annot(jobs.flatten(), chains, hmm_params, softmask_lds2, softmask_lds2_source, genome, proteins, lines_per_file, annot_params)
        gpx_qdump(annot_files.collect(), gpx_qdump_params)
    emit:
        outputs = gpx_qdump.out.outputs
}
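
// Hypothetical caller sketch (channel names, the include path, and the
// override value are illustrative, not taken from this repo):
//
//   include { gnomon_wnode } from './gnomon/gnomon_wnode/main'
//   gnomon_wnode(scaffolds, chains, chains_slices, hmm_params,
//                softmask_lds2, softmask_lds2_source, genome, proteins,
//                [annot_wnode: '-window 100000'])
//   gnomon_wnode.out.outputs | view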


process gpx_qsubmit {
    input:
        path scaffolds
        path chains
        path chains_slices
        val params
    output:
        path "job.*"
        env lines_per_file
    script:
        njobs=16
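        // njobs caps the fan-out: the job list is split into at most this many chunk files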
42 """ | |
43 echo $scaffolds | tr ' ' '\\n' > scaffolds.mft | |
44 for file in $chains_slices; do | |
45 echo \$file >> chains_slices.mft | |
46 # remove path from the first line of this file | |
47 sed -i -e '1s/\\(.*\\)\\/\\(.*\\)\$/\\2/' \$file | |
48 done | |
49 gpx_qsubmit $params -ids-manifest scaffolds.mft -slices-manifest chains_slices.mft -o jobs | |
50 total_lines=\$(wc -l <jobs) | |
51 (( lines_per_file = (total_lines + ${njobs} - 1) / ${njobs} )) | |
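    # integer ceiling division: adding (njobs - 1) before dividing rounds up,
    # so lines_per_file = ceil(total_lines / njobs)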
    echo total_lines=\$total_lines, lines_per_file=\$lines_per_file
    # split -l\$lines_per_file jobs job. -da 3
    # Use round robin to distribute jobs across nodes more evenly
    if [ \$total_lines -lt $njobs ]; then
        effective_njobs=\$total_lines
    else
        effective_njobs=$njobs
    fi
    split -nr/\$effective_njobs jobs job. -da 3
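    # GNU split: -n r/N deals lines round-robin into N files; -d -a 3 (spelled
    # -da 3 here) numbers the chunks job.000, job.001, ...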
61 """ | |
62 stub: | |
63 njobs=16 | |
64 """ | |
65 for i in {1..$njobs}; do | |
66 echo j.\${i} >> jobs | |
67 done | |
68 split -nr/$njobs jobs job. -da 3 | |
69 lines_per_file=10 | |
70 """ | |
71 } | |


process annot {
    input:
        path jobs
        path chains            // staged so the chain files referenced by the job files are present
        path hmm_params
        path softmask_lds2
        path softmask
        path genome, stageAs: 'indexed/*'
        path proteins_asn, stageAs: 'indexed/*'
        val lines_per_file
        val params
    output:
        path "output/*"
    script:
        job_num = jobs.toString().tokenize('.').last().toInteger()
    """
    njobs=`wc -l <$jobs`
    if [ \$njobs -lt 16 ]; then
        threads=\$njobs
    else
        threads=16
    fi

    lds2=indexed_lds
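    # The staged softmask LDS2 index may still record its FASTA source under the
    # original path; rewriting each file_name to its basename (below) keeps the
    # index resolvable against the copies staged into this work directory.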
    if [ -n "$softmask_lds2" ]; then
        # patch LDS2 to point to the source
        files=\$(sqlite3 $softmask_lds2 -cmd "SELECT file_name FROM file" ".exit")
        for f in \$files; do
            base=\$(basename \$f)
            sqlite3 $softmask_lds2 -cmd "UPDATE file SET file_name = '\$base' WHERE file_name = '\$f'" ".exit"
        done
        lds2+=",$softmask_lds2"
    elif [ -n "$softmask" ]; then
        mkdir sm_src
        mv $softmask ./sm_src/
        lds2_indexer -source ./sm_src/ -db softmask_lds2
        lds2+=",softmask_lds2"
    fi

    filename=\$(basename -- "$jobs")
    extension="\${filename##*.}"
    (( start_job_id = ((10#\$extension) * $lines_per_file) + 1 ))
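    # 10#\$extension forces base-10 so chunk suffixes like 008 are not parsed as
    # octal; each chunk thus gets its own non-overlapping -start-job-id range.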

    # make the local LDS2 index of the genomic fasta
    lds2_indexer -source indexed -db indexed_lds

    # When running multiple jobs on the cluster there is a chance that
    # several jobs will run on the same node and thus generate files
    # with the same filename. We need to avoid that to be able to stage
    # the output files for the downstream collector. We add the job file
    # numeric extension as a prefix to the filename.
    mkdir interim
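    # '|| true' below swallows a non-zero exit from annot_wnode, presumably so
    # that any results already written to interim/ are still collected.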
    annot_wnode $params -nogenbank -lds2 \$lds2 -start-job-id \$start_job_id -workers \$threads -input-jobs $jobs -param $hmm_params -O interim || true
    mkdir output
    for f in interim/*; do
        if [ -f \$f ]; then
            mv \$f output/\${extension}_\$(basename \$f)
        fi
    done
    """
    stub:
        job_num = jobs.toString().tokenize('.').last().toInteger()
    """
    mkdir -p output
    touch output/sample_gnomon_wnode.${job_num}.out
    """
}


process gpx_qdump {
    input:
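        // stageAs gathers all collected per-job files under one inputs/
        // directory, which is what -input-path reads below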
        path files, stageAs: "inputs/*"
        val params
    output:
        path "*.out", emit: "outputs"
    script:
    """
    gpx_qdump $params -input-path inputs -output gnomon_wnode.out
    """
    stub:
    """
    touch gnomon_wnode.out
    """
}