nf/subworkflows/ncbi/gnomon/gnomon_wnode/main.nf @ 0:d9c5c5b87fec (draft)

planemo upload for repository https://github.com/ncbi/egapx commit 8173d01b08d9a91c9ec5f6cb50af346edc8020c4
author: fubar
date: Sat, 03 Aug 2024 11:16:53 +0000
#!/usr/bin/env nextflow
nextflow.enable.dsl=2

include { merge_params } from '../../utilities'

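// Gnomon "wnode" stage of EGAPx: gpx_qsubmit splits the scaffolds and chain
// slices into a batch of GPX job files, annot runs annot_wnode on each job
// file in parallel, and gpx_qdump collects the per-job results into a single
// output. merge_params (from utilities) is assumed to overlay caller-supplied
// settings from `parameters` onto the default command lines below.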
workflow gnomon_wnode {
    take:
        scaffolds
        chains
        chains_slices
        hmm_params
        softmask_lds2
        softmask_lds2_source
        genome
        proteins
        parameters  // Map : extra parameters and parameter updates
    main:
        String gpx_qsubmit_params = merge_params("", parameters, 'gpx_qsubmit')
        String annot_params = merge_params("-margin 1000 -mincont 1000 -minlen 225 -mpp 10.0 -ncsp 25 -window 200000 -nonconsens -open", parameters, 'annot_wnode')
        String gpx_qdump_params = merge_params("-slices-for affinity -sort-by affinity", parameters, 'gpx_qdump')
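        // `parameters` is presumably keyed by tool name; e.g. a caller passing
        // [annot_wnode: '-mincont 500'] (illustrative value) would override the
        // -mincont default above while keeping the rest of the defaults.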

        def (jobs, lines_per_file) = gpx_qsubmit(scaffolds, chains, chains_slices, gpx_qsubmit_params)
        def annot_files = annot(jobs.flatten(), chains, hmm_params, softmask_lds2, softmask_lds2_source, genome, proteins, lines_per_file, annot_params)
        gpx_qdump(annot_files.collect(), gpx_qdump_params)
    emit:
        outputs = gpx_qdump.out.outputs
}
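
// Illustrative only: a sketch of how a caller might include and wire this
// subworkflow. The channel and file names below are hypothetical and are not
// part of this file.
//
//   include { gnomon_wnode } from './subworkflows/ncbi/gnomon/gnomon_wnode/main'
//   gnomon_wnode(scaffold_ids, chains_ch, chains_slices_ch, hmm_params_file,
//                softmask_lds2_ch, softmask_files_ch, genome_fasta, proteins_asn,
//                task_params)
//   gnomon_wnode.out.outputs.view()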


process gpx_qsubmit {
    input:
        path scaffolds
        path chains
        path chains_slices
        val params
    output:
        path "job.*"
        env lines_per_file
    script:
        njobs=16
    """
    echo $scaffolds | tr ' ' '\\n' > scaffolds.mft
    for file in $chains_slices; do
        echo \$file >> chains_slices.mft
        # remove path from the first line of this file
        sed -i -e '1s/\\(.*\\)\\/\\(.*\\)\$/\\2/' \$file
    done
    gpx_qsubmit $params -ids-manifest scaffolds.mft -slices-manifest chains_slices.mft -o jobs
    total_lines=\$(wc -l <jobs)
    (( lines_per_file = (total_lines + ${njobs} - 1) / ${njobs} ))
    echo total_lines=\$total_lines, lines_per_file=\$lines_per_file
    # split -l\$lines_per_file jobs job. -da 3
    # Use round robin to distribute jobs across nodes more evenly
    if [ \$total_lines -lt $njobs ]; then
        effective_njobs=\$total_lines
    else
        effective_njobs=$njobs
    fi
    split -nr/\$effective_njobs jobs job. -da 3
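    # Illustrative numbers: with 100 job lines and effective_njobs=16, the
    # round-robin split leaves 7 lines in each of job.000..job.003 and 6 lines
    # in each of job.004..job.015.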
61 """
62 stub:
63 njobs=16
64 """
65 for i in {1..$njobs}; do
66 echo j.\${i} >> jobs
67 done
68 split -nr/$njobs jobs job. -da 3
69 lines_per_file=10
70 """
71 }
72
73
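
// The genome and protein sets are staged under indexed/ so that lds2_indexer
// can build a single local LDS2 index over them (see "lds2_indexer -source
// indexed" in the script below).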
process annot {
    input:
        path jobs
        path chains             // used for staging the chain files referenced from the jobs
        path hmm_params
        path softmask_lds2
        path softmask
        path genome, stageAs: 'indexed/*'
        path proteins_asn, stageAs: 'indexed/*'
        val lines_per_file
        val params
    output:
        path "output/*"
    script:
        job_num = jobs.toString().tokenize('.').last().toInteger()
    """
    njobs=`wc -l <$jobs`
    if [ \$njobs -lt 16 ]; then
        threads=\$njobs
    else
        threads=16
    fi

    lds2=indexed_lds
    if [ -n "$softmask_lds2" ]; then
        # patch LDS2 to point to the source
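        # The LDS2 index stores source file paths; rewriting them to bare
        # basenames lets the index resolve the softmask files that Nextflow has
        # staged into this work directory.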
        files=\$(sqlite3 $softmask_lds2 -cmd "SELECT file_name FROM file" ".exit")
        for f in \$files; do
            base=\$(basename \$f)
            sqlite3 $softmask_lds2 -cmd "UPDATE file SET file_name = '\$base' WHERE file_name = '\$f'" ".exit"
        done
        lds2+=",$softmask_lds2"
    elif [ -n "$softmask" ]; then
        mkdir sm_src
        mv $softmask ./sm_src/
        lds2_indexer -source ./sm_src/ -db softmask_lds2
        lds2+=",softmask_lds2"
    fi

    filename=\$(basename -- "$jobs")
    extension="\${filename##*.}"
    (( start_job_id = ((10#\$extension) * $lines_per_file) + 1 ))
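    # Illustrative arithmetic: for job.002 with lines_per_file=7 the line above
    # gives start_job_id = 2 * 7 + 1 = 15; the 10# prefix forces base-10 so
    # suffixes such as 008 or 009 are not parsed as octal.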

    # make the local LDS of the genomic fasta
    lds2_indexer -source indexed -db indexed_lds

    # When running multiple jobs on the cluster there is a chance that
    # several jobs will run on the same node and thus generate files
    # with the same filename. We need to avoid that to be able to stage
    # the output files for gpx_make_outputs. We add the job file numeric
    # extension as a prefix to the filename.
    mkdir interim
    annot_wnode $params -nogenbank -lds2 \$lds2 -start-job-id \$start_job_id -workers \$threads -input-jobs $jobs -param $hmm_params -O interim || true
    mkdir output
    for f in interim/*; do
        if [ -f \$f ]; then
            mv \$f output/\${extension}_\$(basename \$f)
        fi
    done
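    # e.g. interim/models.asn produced by job.002 would be renamed to
    # output/002_models.asn (the file name here is illustrative).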
133 """
134 stub:
135 job_num = jobs.toString().tokenize('.').last().toInteger()
136 """
137 mkdir -p output
138 touch output/sample_gnomon_wnode.${job_num}.out
139 """
140 }
141
142
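
// Collects the per-job annot outputs staged under inputs/ into a single
// gnomon_wnode.out file; the default flags sort and slice the results by
// affinity.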
process gpx_qdump {
    input:
        path files, stageAs: "inputs/*"
        val params
    output:
        path "*.out", emit: "outputs"
    script:
    """
    gpx_qdump $params -input-path inputs -output gnomon_wnode.out
    """
    stub:
    """
    touch gnomon_wnode.out
    """
}